- Improvements to enable deployment in the cloud, mainly changing file access to Minio
- Improvements on RAG logging, and some debugging in that area
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@ docker/db/postgresql/
|
||||
docker/db/redis/
|
||||
docker/logs/
|
||||
docker/tenant_files/
|
||||
/docker/minio/
|
||||
|
||||
@@ -11,6 +11,7 @@ from flask_session import Session
|
||||
from flask_wtf import CSRFProtect
|
||||
|
||||
from .utils.key_encryption import JosKMSClient
|
||||
from .utils.minio_utils import MinioClient
|
||||
|
||||
# Create extensions
|
||||
db = SQLAlchemy()
|
||||
@@ -26,3 +27,4 @@ jwt = JWTManager()
|
||||
session = Session()
|
||||
|
||||
kms_client = JosKMSClient.from_service_account_json('config/gc_sa_eveai.json')
|
||||
minio_client = MinioClient()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from sqlalchemy import func, and_, or_
|
||||
from sqlalchemy import func, and_, or_, desc
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Any, Dict
|
||||
@@ -20,12 +20,56 @@ class EveAIRetriever(BaseRetriever):
|
||||
self.tenant_info = tenant_info
|
||||
|
||||
def _get_relevant_documents(self, query: str):
|
||||
|
||||
|
||||
|
||||
current_app.logger.debug(f'Retrieving relevant documents for query: {query}')
|
||||
query_embedding = self._get_query_embedding(query)
|
||||
db_class = self.model_variables['embedding_db_model']
|
||||
similarity_threshold = self.model_variables['similarity_threshold']
|
||||
k = self.model_variables['k']
|
||||
|
||||
if self.tenant_info['rag_tuning']:
|
||||
try:
|
||||
current_date = get_date_in_timezone(self.tenant_info['timezone'])
|
||||
current_app.rag_tuning_logger.debug(f'Current date: {current_date}\n')
|
||||
|
||||
# Debug query to show similarity for all valid documents (without chunk text)
|
||||
debug_query = (
|
||||
db.session.query(
|
||||
Document.id.label('document_id'),
|
||||
DocumentVersion.id.label('version_id'),
|
||||
db_class.id.label('embedding_id'),
|
||||
(1 - db_class.embedding.cosine_distance(query_embedding)).label('similarity')
|
||||
)
|
||||
.join(DocumentVersion, db_class.doc_vers_id == DocumentVersion.id)
|
||||
.join(Document, DocumentVersion.doc_id == Document.id)
|
||||
.filter(
|
||||
or_(Document.valid_from.is_(None), func.date(Document.valid_from) <= current_date),
|
||||
or_(Document.valid_to.is_(None), func.date(Document.valid_to) >= current_date)
|
||||
)
|
||||
.order_by(desc('similarity'))
|
||||
)
|
||||
|
||||
debug_results = debug_query.all()
|
||||
|
||||
current_app.logger.debug("Debug: Similarity for all valid documents:")
|
||||
for row in debug_results:
|
||||
current_app.rag_tuning_logger.debug(f"Doc ID: {row.document_id}, "
|
||||
f"Version ID: {row.version_id}, "
|
||||
f"Embedding ID: {row.embedding_id}, "
|
||||
f"Similarity: {row.similarity}")
|
||||
current_app.rag_tuning_logger.debug(f'---------------------------------------\n')
|
||||
except SQLAlchemyError as e:
|
||||
current_app.logger.error(f'Error generating overview: {e}')
|
||||
db.session.rollback()
|
||||
|
||||
if self.tenant_info['rag_tuning']:
|
||||
current_app.rag_tuning_logger.debug(f'Parameters for Retrieval of documents: \n')
|
||||
current_app.rag_tuning_logger.debug(f'Similarity Threshold: {similarity_threshold}\n')
|
||||
current_app.rag_tuning_logger.debug(f'K: {k}\n')
|
||||
current_app.rag_tuning_logger.debug(f'---------------------------------------\n')
|
||||
|
||||
try:
|
||||
current_date = get_date_in_timezone(self.tenant_info['timezone'])
|
||||
# Subquery to find the latest version of each document
|
||||
@@ -40,24 +84,31 @@ class EveAIRetriever(BaseRetriever):
|
||||
# Main query to filter embeddings
|
||||
query_obj = (
|
||||
db.session.query(db_class,
|
||||
db_class.embedding.cosine_distance(query_embedding).label('distance'))
|
||||
(1 - db_class.embedding.cosine_distance(query_embedding)).label('similarity'))
|
||||
.join(DocumentVersion, db_class.doc_vers_id == DocumentVersion.id)
|
||||
.join(Document, DocumentVersion.doc_id == Document.id)
|
||||
.join(subquery, DocumentVersion.id == subquery.c.latest_version_id)
|
||||
.filter(
|
||||
or_(Document.valid_from.is_(None), Document.valid_from <= current_date),
|
||||
or_(Document.valid_to.is_(None), Document.valid_to >= current_date),
|
||||
db_class.embedding.cosine_distance(query_embedding) < similarity_threshold
|
||||
or_(Document.valid_from.is_(None), func.date(Document.valid_from) <= current_date),
|
||||
or_(Document.valid_to.is_(None), func.date(Document.valid_to) >= current_date),
|
||||
(1 - db_class.embedding.cosine_distance(query_embedding)) > similarity_threshold
|
||||
)
|
||||
.order_by('distance')
|
||||
.order_by(desc('similarity'))
|
||||
.limit(k)
|
||||
)
|
||||
|
||||
if self.tenant_info['rag_tuning']:
|
||||
current_app.rag_tuning_logger.debug(f'Query executed for Retrieval of documents: \n')
|
||||
current_app.rag_tuning_logger.debug(f'{query_obj.statement}\n')
|
||||
current_app.rag_tuning_logger.debug(f'---------------------------------------\n')
|
||||
|
||||
res = query_obj.all()
|
||||
|
||||
if self.tenant_info['rag_tuning']:
|
||||
current_app.rag_tuning_logger.debug(f'Retrieved {len(res)} relevant documents')
|
||||
current_app.rag_tuning_logger.debug(f'---------------------------------------')
|
||||
current_app.rag_tuning_logger.debug(f'Retrieved {len(res)} relevant documents \n')
|
||||
current_app.rag_tuning_logger.debug(f'Data retrieved: \n')
|
||||
current_app.rag_tuning_logger.debug(f'{res}\n')
|
||||
current_app.rag_tuning_logger.debug(f'---------------------------------------\n')
|
||||
|
||||
result = []
|
||||
for doc in res:
|
||||
|
||||
@@ -82,7 +82,6 @@ class Tenant(db.Model):
|
||||
'html_excluded_elements': self.html_excluded_elements,
|
||||
'min_chunk_size': self.min_chunk_size,
|
||||
'max_chunk_size': self.max_chunk_size,
|
||||
'es_k'
|
||||
'es_k': self.es_k,
|
||||
'es_similarity_threshold': self.es_similarity_threshold,
|
||||
'chat_RAG_temperature': self.chat_RAG_temperature,
|
||||
|
||||
86
common/utils/minio_utils.py
Normal file
86
common/utils/minio_utils.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
from flask import Flask
|
||||
import io
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
class MinioClient:
    """Thin wrapper around the MinIO SDK for per-tenant object storage.

    Follows the Flask extension pattern: instantiate once at module level,
    then call ``init_app(app)`` during application setup. Documents are
    stored one bucket per tenant, with object keys laid out as
    ``<document_id>/<language>/<version_id>/<filename>`` so that prefix
    listings by document, language or version are possible.
    """

    def __init__(self):
        # Deferred initialisation: the real SDK client is created in init_app().
        self.client = None

    def init_app(self, app: "Flask"):
        """Create the underlying Minio client from the app configuration.

        Reads ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY``, ``MINIO_SECRET_KEY``
        and the optional ``MINIO_USE_HTTPS`` flag (defaults to False).
        """
        self.client = Minio(
            app.config['MINIO_ENDPOINT'],
            access_key=app.config['MINIO_ACCESS_KEY'],
            secret_key=app.config['MINIO_SECRET_KEY'],
            secure=app.config.get('MINIO_USE_HTTPS', False)
        )
        app.logger.info(f"MinIO client initialized with endpoint: {app.config['MINIO_ENDPOINT']}")

    def generate_bucket_name(self, tenant_id):
        """Return the canonical bucket name for a tenant."""
        return f"tenant-{tenant_id}-bucket"

    def create_tenant_bucket(self, tenant_id):
        """Ensure the tenant's bucket exists and return its name (idempotent).

        Raises:
            Exception: wrapping any S3Error from the MinIO server.
        """
        bucket_name = self.generate_bucket_name(tenant_id)
        try:
            if not self.client.bucket_exists(bucket_name):
                self.client.make_bucket(bucket_name)
            # Single exit point; the original duplicated this return inside
            # the if-branch for no benefit.
            return bucket_name
        except S3Error as err:
            raise Exception(f"Error occurred while creating bucket: {err}") from err

    def generate_object_name(self, document_id, language, version_id, filename):
        """Return the canonical object key for a document-version file.

        BUG FIX: the filename was previously omitted from the key, so every
        file of the same document version mapped to one colliding object and
        per-filename download/delete could not address uploads correctly.
        """
        return f"{document_id}/{language}/{version_id}/{filename}"

    def upload_document_file(self, tenant_id, document_id, language, version_id, filename, file_data):
        """Store ``file_data`` in the tenant bucket; return True on success.

        Accepts FileStorage, BytesIO, str (encoded as UTF-8) or raw bytes.

        Raises:
            TypeError: for any other payload type.
            Exception: wrapping any S3Error from the MinIO server.
        """
        bucket_name = self.generate_bucket_name(tenant_id)
        object_name = self.generate_object_name(document_id, language, version_id, filename)

        try:
            # Normalise every supported payload type down to bytes.
            if isinstance(file_data, FileStorage):
                file_data = file_data.read()
            elif isinstance(file_data, io.BytesIO):
                file_data = file_data.getvalue()
            elif isinstance(file_data, str):
                file_data = file_data.encode('utf-8')
            elif not isinstance(file_data, bytes):
                raise TypeError('Unsupported file type. Expected FileStorage, BytesIO, str, or bytes.')

            self.client.put_object(
                bucket_name, object_name, io.BytesIO(file_data), len(file_data)
            )
            return True
        except S3Error as err:
            raise Exception(f"Error occurred while uploading file: {err}") from err

    def download_document_file(self, tenant_id, document_id, language, version_id, filename):
        """Fetch and return the object's content as bytes.

        Raises:
            Exception: wrapping any S3Error (e.g. missing object/bucket).
        """
        bucket_name = self.generate_bucket_name(tenant_id)
        object_name = self.generate_object_name(document_id, language, version_id, filename)
        try:
            response = self.client.get_object(bucket_name, object_name)
            return response.read()
        except S3Error as err:
            raise Exception(f"Error occurred while downloading file: {err}") from err

    def list_document_files(self, tenant_id, document_id, language=None, version_id=None):
        """List object keys for a document, optionally narrowed by language/version.

        Returns:
            list[str]: object names under the computed prefix.

        Raises:
            Exception: wrapping any S3Error from the MinIO server.
        """
        bucket_name = self.generate_bucket_name(tenant_id)
        # Build the narrowest prefix the caller's filters allow.
        prefix = f"{document_id}/"
        if language:
            prefix += f"{language}/"
        if version_id:
            prefix += f"{version_id}/"
        try:
            objects = self.client.list_objects(bucket_name, prefix=prefix, recursive=True)
            return [obj.object_name for obj in objects]
        except S3Error as err:
            raise Exception(f"Error occurred while listing files: {err}") from err

    def delete_document_file(self, tenant_id, document_id, language, version_id, filename):
        """Remove one document-version object; return True on success.

        Raises:
            Exception: wrapping any S3Error from the MinIO server.
        """
        bucket_name = self.generate_bucket_name(tenant_id)
        object_name = self.generate_object_name(document_id, language, version_id, filename)
        try:
            self.client.remove_object(bucket_name, object_name)
            return True
        except S3Error as err:
            raise Exception(f"Error occurred while deleting file: {err}") from err
|
||||
@@ -141,7 +141,7 @@ def select_model_variables(tenant):
|
||||
default_headers=portkey_headers)
|
||||
tool_calling_supported = False
|
||||
match llm_model:
|
||||
case 'gpt-4-turbo' | 'gpt-4o':
|
||||
case 'gpt-4-turbo' | 'gpt-4o' | 'gpt-4o-mini':
|
||||
tool_calling_supported = True
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
|
||||
@@ -61,7 +61,7 @@ class Config(object):
|
||||
|
||||
# supported LLMs
|
||||
SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
|
||||
SUPPORTED_LLMS = ['openai.gpt-4o', 'anthropic.claude-3-5-sonnet']
|
||||
SUPPORTED_LLMS = ['openai.gpt-4o', 'anthropic.claude-3-5-sonnet', 'openai.gpt-4o-mini']
|
||||
|
||||
ANTHROPIC_LLM_VERSIONS = {'claude-3-5-sonnet': 'claude-3-5-sonnet-20240620', }
|
||||
|
||||
@@ -71,6 +71,7 @@ class Config(object):
|
||||
# Annotation text chunk length
|
||||
ANNOTATION_TEXT_CHUNK_LENGTH = {
|
||||
'openai.gpt-4o': 10000,
|
||||
'openai.gpt-4o-mini': 10000,
|
||||
'anthropic.claude-3-5-sonnet': 8000
|
||||
}
|
||||
|
||||
@@ -184,12 +185,95 @@ class DevConfig(Config):
|
||||
# PATH settings
|
||||
ffmpeg_path = '/usr/bin/ffmpeg'
|
||||
|
||||
# MINIO
|
||||
MINIO_ENDPOINT = 'minio:9000'
|
||||
MINIO_ACCESS_KEY = 'minioadmin'
|
||||
MINIO_SECRET_KEY = 'minioadmin'
|
||||
|
||||
|
||||
class ProdConfig(Config):
|
||||
DEVELOPMENT = False
|
||||
DEBUG = False
|
||||
# SQLALCHEMY_DATABASE_URI = environ.get('SQLALCHEMY_DATABASE_URI') or \
|
||||
# 'sqlite:///' + os.path.join(basedir, 'db.sqlite')
|
||||
DEVELOPMENT = True
|
||||
DEBUG = True
|
||||
FLASK_DEBUG = True
|
||||
PYCHARM_DEBUG = False
|
||||
DB_HOST = environ.get('DB_HOST', 'bswnz4.stackhero-network.com')
|
||||
DB_USER = environ.get('DB_USER', 'luke_skywalker')
|
||||
DB_PASS = environ.get('DB_PASS', '2MK&1rHmWEydE2rFuJLq*ls%tdkPAk2')
|
||||
DB_NAME = environ.get('DB_NAME', 'eveai')
|
||||
DB_PORT = environ.get('DB_PORT', '5945')
|
||||
|
||||
SQLALCHEMY_DATABASE_URI = f'postgresql+pg8000://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
|
||||
SQLALCHEMY_BINDS = {'public': SQLALCHEMY_DATABASE_URI}
|
||||
EXPLAIN_TEMPLATE_LOADING = False
|
||||
|
||||
# Define the nginx prefix used for the specific apps
|
||||
EVEAI_APP_LOCATION_PREFIX = '/admin'
|
||||
EVEAI_CHAT_LOCATION_PREFIX = '/chat'
|
||||
|
||||
# flask-mailman settings
|
||||
MAIL_USERNAME = 'eveai_super@flow-it.net'
|
||||
MAIL_PASSWORD = '$6xsWGbNtx$CFMQZqc*'
|
||||
|
||||
# file upload settings
|
||||
UPLOAD_FOLDER = '/app/tenant_files'
|
||||
|
||||
REDIS_USER = 'admin'
|
||||
REDIS_PASS = 'b32vtDtLriSY1fL2zGrZg8IZKI0g9ucsLtVNanRFAras6oZ51wjVNB1Y05uG7uEw'
|
||||
REDIS_URL = '8bciqc.stackhero-network.com'
|
||||
REDIS_PORT = '9961'
|
||||
REDIS_BASE_URI = f'redis://{REDIS_USER}:{REDIS_PASS}@{REDIS_URL}:{REDIS_PORT}'
|
||||
|
||||
# Celery settings
|
||||
# eveai_app Redis Settings
|
||||
CELERY_BROKER_URL = f'{REDIS_BASE_URI}/0'
|
||||
CELERY_RESULT_BACKEND = f'{REDIS_BASE_URI}/0'
|
||||
# eveai_chat Redis Settings
|
||||
CELERY_BROKER_URL_CHAT = f'{REDIS_BASE_URI}/3'
|
||||
CELERY_RESULT_BACKEND_CHAT = f'{REDIS_BASE_URI}/3'
|
||||
|
||||
# Session settings
|
||||
SESSION_REDIS = redis.from_url(f'{REDIS_BASE_URI}/2')
|
||||
|
||||
# OpenAI API Keys
|
||||
OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
|
||||
|
||||
# Groq API Keys
|
||||
GROQ_API_KEY = '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'
|
||||
|
||||
# Anthropic API Keys
|
||||
ANTHROPIC_API_KEY = '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'
|
||||
|
||||
# Portkey API Keys
|
||||
PORTKEY_API_KEY = 'T2Dt4QTpgCvWxa1OftYCJtj7NcDZ'
|
||||
|
||||
# Unstructured settings
|
||||
UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
|
||||
UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'
|
||||
UNSTRUCTURED_FULL_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io/general/v0/general'
|
||||
|
||||
# SocketIO settings
|
||||
SOCKETIO_MESSAGE_QUEUE = f'{REDIS_BASE_URI}/1'
|
||||
SOCKETIO_CORS_ALLOWED_ORIGINS = '*'
|
||||
SOCKETIO_LOGGER = True
|
||||
SOCKETIO_ENGINEIO_LOGGER = True
|
||||
SOCKETIO_PING_TIMEOUT = 20000
|
||||
SOCKETIO_PING_INTERVAL = 25000
|
||||
SOCKETIO_MAX_IDLE_TIME = timedelta(minutes=60) # Changing this value ==> change maxConnectionDuration value in
|
||||
# eveai-chat-widget.js
|
||||
|
||||
# Google Cloud settings
|
||||
GC_PROJECT_NAME = 'eveai-420711'
|
||||
GC_LOCATION = 'europe-west1'
|
||||
GC_KEY_RING = 'eveai-chat'
|
||||
GC_CRYPTO_KEY = 'envelope-encryption-key'
|
||||
|
||||
# JWT settings
|
||||
JWT_SECRET_KEY = 'bsdMkmQ8ObfMD52yAFg4trrvjgjMhuIqg2fjDpD/JqvgY0ccCcmlsEnVFmR79WPiLKEA3i8a5zmejwLZKl4v9Q=='
|
||||
|
||||
# PATH settings
|
||||
ffmpeg_path = '/usr/bin/ffmpeg'
|
||||
|
||||
|
||||
config = {
|
||||
|
||||
79
config/prompts/openai/gpt-4o-mini.yaml
Normal file
79
config/prompts/openai/gpt-4o-mini.yaml
Normal file
@@ -0,0 +1,79 @@
|
||||
html_parse: |
|
||||
You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
|
||||
|
||||
# Best practices are:
|
||||
- Respect wordings and language(s) used in the HTML.
|
||||
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
|
||||
- Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
|
||||
- Be careful of encoding of the text. Everything needs to be human readable.
|
||||
|
||||
Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input html file. Answer with the pure markdown, without any other text.
|
||||
|
||||
HTML is between triple backquotes.
|
||||
|
||||
```{html}```
|
||||
|
||||
pdf_parse: |
|
||||
  You are a top administrative aide specialized in transforming given PDF-files into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
|
||||
|
||||
# Best practices are:
|
||||
- Respect wordings and language(s) used in the PDF.
|
||||
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
|
||||
- When headings are numbered, show the numbering and define the header level.
|
||||
- A new item is started when a <return> is found before a full line is reached. In order to know the number of characters in a line, please check the document and the context within the document (e.g. an image could limit the number of characters temporarily).
|
||||
- Paragraphs are to be stripped of newlines so they become easily readable.
|
||||
- Be careful of encoding of the text. Everything needs to be human readable.
|
||||
|
||||
Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input pdf content. Answer with the pure markdown, without any other text.
|
||||
|
||||
PDF content is between triple backquotes.
|
||||
|
||||
```{pdf_content}```
|
||||
|
||||
summary: |
|
||||
Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
|
||||
```{text}```
|
||||
|
||||
rag: |
|
||||
Answer the question based on the following context, delimited between triple backquotes.
|
||||
{tenant_context}
|
||||
Use the following {language} in your communication, and cite the sources used.
|
||||
If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
|
||||
Context:
|
||||
```{context}```
|
||||
Question:
|
||||
{question}
|
||||
|
||||
history: |
|
||||
You are a helpful assistant that details a question based on a previous context,
|
||||
in such a way that the question is understandable without the previous context.
|
||||
The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
|
||||
The history is delimited between triple backquotes.
|
||||
You answer by stating the question in {language}.
|
||||
History:
|
||||
```{history}```
|
||||
Question to be detailed:
|
||||
{question}
|
||||
|
||||
encyclopedia: |
|
||||
You have a lot of background knowledge, and as such you are some kind of
|
||||
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
|
||||
If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
|
||||
Question:
|
||||
{question}
|
||||
|
||||
transcript: |
|
||||
You are a top administrative assistant specialized in transforming given transcriptions into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system. The transcriptions originate from podcast, videos and similar material.
|
||||
|
||||
# Best practices and steps are:
|
||||
- Respect wordings and language(s) used in the transcription. Main language is {language}.
|
||||
- Sometimes, the transcript contains speech of several people participating in a conversation. Although these are not obvious from reading the file, try to detect when other people are speaking.
|
||||
- Divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
|
||||
- annotate the text to identify these logical parts using headings in {language}.
|
||||
- improve errors in the transcript given the context, but do not change the meaning and intentions of the transcription.
|
||||
|
||||
Process the file carefully, and take a stepped approach. The resulting markdown should be the result of processing the complete input transcription. Answer with the pure markdown, without any other text.
|
||||
|
||||
The transcript is between triple backquotes.
|
||||
|
||||
```{transcript}```
|
||||
@@ -15,6 +15,9 @@ x-common-variables: &common-variables
|
||||
DB_NAME: eveai
|
||||
FLASK_ENV: development
|
||||
FLASK_DEBUG: 1
|
||||
MINIO_ENDPOINT: minio:9000
|
||||
MINIO_ACCESS_KEY: minioadmin
|
||||
MINIO_SECRET_KEY: minioadmin
|
||||
|
||||
services:
|
||||
nginx:
|
||||
@@ -48,12 +51,13 @@ services:
|
||||
- ../scripts:/app/scripts
|
||||
- ../patched_packages:/app/patched_packages
|
||||
- ./logs:/app/logs
|
||||
- ./tenant_files:/app/tenant_files
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:5001/health"]
|
||||
interval: 10s
|
||||
@@ -76,12 +80,13 @@ services:
|
||||
- ../scripts:/app/scripts
|
||||
- ../patched_packages:/app/patched_packages
|
||||
- ./logs:/app/logs
|
||||
- ./tenant_files:/app/tenant_files
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_healthy
|
||||
# healthcheck:
|
||||
# test: [ "CMD", "curl", "-f", "http://localhost:5001/health" ]
|
||||
# interval: 10s
|
||||
@@ -174,7 +179,30 @@ services:
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
#volumes:
|
||||
|
||||
minio:
|
||||
image: minio/minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
expose:
|
||||
- 9000
|
||||
volumes:
|
||||
- ./minio/data:/data
|
||||
- ./minio/config:/root/.minio
|
||||
environment:
|
||||
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin}
|
||||
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin}
|
||||
command: server /data --console-address ":9001"
|
||||
healthcheck:
|
||||
test: [ "CMD", "mc", "ready", "local" ]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
volumes:
|
||||
minio_data:
|
||||
# db-data:
|
||||
# redis-data:
|
||||
# tenant-files:
|
||||
|
||||
@@ -6,7 +6,8 @@ from flask_security.signals import user_authenticated
|
||||
from werkzeug.middleware.proxy_fix import ProxyFix
|
||||
import logging.config
|
||||
|
||||
from common.extensions import db, migrate, bootstrap, security, mail, login_manager, cors, kms_client, csrf, session
|
||||
from common.extensions import (db, migrate, bootstrap, security, mail, login_manager, cors, kms_client, csrf, session,
|
||||
minio_client)
|
||||
from common.models.user import User, Role, Tenant, TenantDomain
|
||||
import common.models.interaction
|
||||
from config.logging_config import LOGGING
|
||||
@@ -102,6 +103,7 @@ def register_extensions(app):
|
||||
cors.init_app(app)
|
||||
kms_client.init_app(app)
|
||||
session.init_app(app)
|
||||
minio_client.init_app(app)
|
||||
|
||||
|
||||
# Register Blueprints
|
||||
|
||||
@@ -14,9 +14,10 @@ import requests
|
||||
from requests.exceptions import SSLError
|
||||
from urllib.parse import urlparse
|
||||
import io
|
||||
from minio.error import S3Error
|
||||
|
||||
from common.models.document import Document, DocumentVersion
|
||||
from common.extensions import db
|
||||
from common.extensions import db, minio_client
|
||||
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm, \
|
||||
AddURLsForm
|
||||
from common.utils.middleware import mw_before_request
|
||||
@@ -558,33 +559,34 @@ def upload_file_for_version(doc_vers, file, extension):
|
||||
doc_vers.file_name = doc_vers.calc_file_name()
|
||||
doc_vers.file_location = doc_vers.calc_file_location()
|
||||
|
||||
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], doc_vers.file_location)
|
||||
if not os.path.exists(upload_path):
|
||||
os.makedirs(upload_path, exist_ok=True)
|
||||
if isinstance(file, FileStorage):
|
||||
file.save(os.path.join(upload_path, doc_vers.file_name))
|
||||
elif isinstance(file, io.BytesIO):
|
||||
# It's a BytesIO object, handle accordingly
|
||||
# Example: write content to a file manually
|
||||
with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f:
|
||||
f.write(file.getvalue())
|
||||
elif isinstance(file, str):
|
||||
# It's a string, handle accordingly
|
||||
with open(os.path.join(upload_path, doc_vers.file_name), 'w') as f:
|
||||
f.write(file)
|
||||
else:
|
||||
raise TypeError('Unsupported file type.')
|
||||
# Normally, the tenant bucket should exist. But let's be on the safe side if a migration took place.
|
||||
tenant_id = session['tenant']['id']
|
||||
minio_client.create_tenant_bucket(tenant_id)
|
||||
|
||||
try:
|
||||
minio_client.upload_document_file(
|
||||
tenant_id,
|
||||
doc_vers.doc_id,
|
||||
doc_vers.language,
|
||||
doc_vers.id,
|
||||
doc_vers.file_name,
|
||||
file
|
||||
)
|
||||
db.session.commit()
|
||||
current_app.logger.info(f'Successfully saved document to MinIO for tenant {tenant_id} for '
|
||||
f'document version {doc_vers.id} while uploading file.')
|
||||
except S3Error as e:
|
||||
db.session.rollback()
|
||||
flash('Error saving document to MinIO.', 'error')
|
||||
current_app.logger.error(
|
||||
f'Error saving document to MinIO for tenant {tenant_id}: {e}')
|
||||
raise
|
||||
except SQLAlchemyError as e:
|
||||
db.session.rollback()
|
||||
flash('Error saving document.', 'error')
|
||||
flash('Error saving document metadata.', 'error')
|
||||
current_app.logger.error(
|
||||
f'Error saving document for tenant {session["tenant"]["id"]} while uploading file: {e}')
|
||||
|
||||
current_app.logger.info(f'Succesfully saved document for tenant {session['tenant']['id']} for '
|
||||
f'document version {doc_vers.id} while uploading file.')
|
||||
f'Error saving document metadata for tenant {tenant_id}: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def fetch_html(url):
|
||||
|
||||
@@ -8,7 +8,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
||||
import ast
|
||||
|
||||
from common.models.user import User, Tenant, Role, TenantDomain
|
||||
from common.extensions import db, kms_client, security
|
||||
from common.extensions import db, kms_client, security, minio_client
|
||||
from common.utils.security_utils import send_confirmation_email, send_reset_email
|
||||
from .user_forms import TenantForm, CreateUserForm, EditUserForm, TenantDomainForm
|
||||
from common.utils.database import Database
|
||||
@@ -61,12 +61,13 @@ def tenant():
|
||||
# rag_tuning=form.rag_tuning.data)
|
||||
|
||||
# Handle Embedding Variables
|
||||
new_tenant.html_tags = form.html_tags.data.split(',') if form.html_tags.data else []
|
||||
new_tenant.html_end_tags = form.html_end_tags.data.split(',') if form.html_end_tags.data else []
|
||||
new_tenant.html_included_elements = form.html_included_elements.data.split(
|
||||
',') if form.html_included_elements.data else []
|
||||
new_tenant.html_excluded_elements = form.html_excluded_elements.data.split(
|
||||
',') if form.html_excluded_elements.data else []
|
||||
new_tenant.html_tags = [tag.strip() for tag in form.html_tags.data.split(',')] if form.html_tags.data else []
|
||||
new_tenant.html_end_tags = [tag.strip() for tag in form.html_end_tags.data.split(',')] \
|
||||
if form.html_end_tags.data else []
|
||||
new_tenant.html_included_elements = [tag.strip() for tag in form.html_included_elements.data.split(',')] \
|
||||
if form.html_included_elements.data else []
|
||||
new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
|
||||
if form.html_excluded_elements.data else []
|
||||
|
||||
current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
|
||||
f'html_end_tags: {new_tenant.html_end_tags},'
|
||||
@@ -87,11 +88,17 @@ def tenant():
|
||||
flash(f'Failed to add tenant to database. Error: {str(e)}')
|
||||
return render_template('user/tenant.html', form=form)
|
||||
|
||||
# Create schema for new tenant
|
||||
current_app.logger.info(f"Successfully created tenant {new_tenant.id} in Database")
|
||||
flash(f"Successfully created tenant {new_tenant.id} in Database")
|
||||
|
||||
# Create schema for new tenant
|
||||
current_app.logger.info(f"Creating schema for tenant {new_tenant.id}")
|
||||
Database(new_tenant.id).create_tenant_schema()
|
||||
|
||||
# Create MinIO bucket for new tenant
|
||||
current_app.logger.info(f"Creating MinIO bucket for tenant {new_tenant.id}")
|
||||
minio_client.create_tenant_bucket(new_tenant.id)
|
||||
|
||||
return redirect(prefixed_url_for('basic_bp.index'))
|
||||
else:
|
||||
form_validation_failed(request, form)
|
||||
|
||||
@@ -81,6 +81,12 @@ def ask_question(tenant_id, question, language, session_id, user_timezone):
|
||||
current_app.logger.error(f'ask_question: Error initializing chat session in database: {e}')
|
||||
raise
|
||||
|
||||
if tenant.rag_tuning:
|
||||
current_app.rag_tuning_logger.debug(f'Received question for tenant {tenant_id}:\n{question}. Processing...')
|
||||
current_app.rag_tuning_logger.debug(f'Tenant Information: \n{tenant.to_dict()}')
|
||||
current_app.rag_tuning_logger.debug(f'===================================================================')
|
||||
current_app.rag_tuning_logger.debug(f'===================================================================')
|
||||
|
||||
result, interaction = answer_using_tenant_rag(question, language, tenant, chat_session)
|
||||
result['algorithm'] = current_app.config['INTERACTION_ALGORITHMS']['RAG_TENANT']['name']
|
||||
result['interaction_id'] = interaction.id
|
||||
@@ -116,6 +122,9 @@ def answer_using_tenant_rag(question, language, tenant, chat_session):
|
||||
|
||||
detailed_question = detail_question(question, language, model_variables, chat_session.session_id)
|
||||
current_app.logger.debug(f'Original question:\n {question}\n\nDetailed question: {detailed_question}')
|
||||
if tenant.rag_tuning:
|
||||
current_app.rag_tuning_logger.debug(f'Detailed Question for tenant {tenant.id}:\n{question}.')
|
||||
current_app.rag_tuning_logger.debug(f'-------------------------------------------------------------------')
|
||||
new_interaction.detailed_question = detailed_question
|
||||
new_interaction.detailed_question_at = dt.now(tz.utc)
|
||||
|
||||
@@ -126,6 +135,9 @@ def answer_using_tenant_rag(question, language, tenant, chat_session):
|
||||
full_template = replace_variable_in_template(language_template, "{tenant_context}", model_variables['rag_context'])
|
||||
rag_prompt = ChatPromptTemplate.from_template(full_template)
|
||||
setup_and_retrieval = RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
|
||||
if tenant.rag_tuning:
|
||||
current_app.rag_tuning_logger.debug(f'Full prompt for tenant {tenant.id}:\n{full_template}.')
|
||||
current_app.rag_tuning_logger.debug(f'-------------------------------------------------------------------')
|
||||
|
||||
new_interaction_embeddings = []
|
||||
if not model_variables['cited_answer_cls']: # The model doesn't support structured feedback
|
||||
@@ -151,6 +163,11 @@ def answer_using_tenant_rag(question, language, tenant, chat_session):
|
||||
current_app.logger.debug(f'ask_question: result answer: {result['answer']}')
|
||||
current_app.logger.debug(f'ask_question: result citations: {result["citations"]}')
|
||||
current_app.logger.debug(f'ask_question: insufficient information: {result["insufficient_info"]}')
|
||||
if tenant.rag_tuning:
|
||||
current_app.rag_tuning_logger.debug(f'ask_question: result answer: {result['answer']}')
|
||||
current_app.rag_tuning_logger.debug(f'ask_question: result citations: {result["citations"]}')
|
||||
current_app.rag_tuning_logger.debug(f'ask_question: insufficient information: {result["insufficient_info"]}')
|
||||
current_app.rag_tuning_logger.debug(f'-------------------------------------------------------------------')
|
||||
new_interaction.answer = result['answer']
|
||||
|
||||
# Filter out the existing Embedding IDs
|
||||
@@ -161,7 +178,11 @@ def answer_using_tenant_rag(question, language, tenant, chat_session):
|
||||
.all()
|
||||
)
|
||||
existing_embedding_ids = [emb.id for emb in embeddings]
|
||||
urls = [emb.document_version.url for emb in embeddings]
|
||||
urls = list(set(emb.document_version.url for emb in embeddings))
|
||||
if tenant.rag_tuning:
|
||||
current_app.rag_tuning_logger.debug(f'Referenced documents for answer for tenant {tenant.id}:\n')
|
||||
current_app.rag_tuning_logger.debug(f'{urls}')
|
||||
current_app.rag_tuning_logger.debug(f'-------------------------------------------------------------------')
|
||||
|
||||
for emb_id in existing_embedding_ids:
|
||||
new_interaction_embedding = InteractionEmbedding(embedding_id=emb_id)
|
||||
|
||||
@@ -3,7 +3,7 @@ import logging.config
|
||||
from flask import Flask
|
||||
|
||||
from common.utils.celery_utils import make_celery, init_celery
|
||||
from common.extensions import db
|
||||
from common.extensions import db, minio_client
|
||||
from config.logging_config import LOGGING
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ def create_app(config_file=None):
|
||||
|
||||
def register_extensions(app):
|
||||
db.init_app(app)
|
||||
minio_client.init_app(app)
|
||||
|
||||
|
||||
app, celery = create_app()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import io
|
||||
import os
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
import subprocess
|
||||
@@ -21,7 +22,7 @@ import PyPDF2
|
||||
from pydub import AudioSegment
|
||||
import tempfile
|
||||
|
||||
from common.extensions import db
|
||||
from common.extensions import db, minio_client
|
||||
from common.models.document import DocumentVersion, Embedding
|
||||
from common.models.user import Tenant
|
||||
from common.utils.celery_utils import current_celery
|
||||
@@ -32,11 +33,6 @@ from common.utils.os_utils import safe_remove, sync_folder
|
||||
|
||||
@current_celery.task(name='create_embeddings', queue='embeddings')
|
||||
def create_embeddings(tenant_id, document_version_id):
|
||||
# Setup Remote Debugging only if PYCHARM_DEBUG=True
|
||||
if current_app.config['PYCHARM_DEBUG']:
|
||||
import pydevd_pycharm
|
||||
pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
|
||||
|
||||
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
|
||||
|
||||
try:
|
||||
@@ -50,6 +46,7 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
|
||||
# Select variables to work with depending on tenant and model
|
||||
model_variables = select_model_variables(tenant)
|
||||
current_app.logger.debug(f'Model variables: {model_variables}')
|
||||
|
||||
# Retrieve document version to process
|
||||
document_version = DocumentVersion.query.get(document_version_id)
|
||||
@@ -107,33 +104,20 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
|
||||
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location)
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location,
|
||||
document_version.file_name)
|
||||
if os.path.exists(file_path):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
|
||||
pdf_text = ''
|
||||
# Function to extract text from PDF and return as string
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
for page_num in range(len(reader.pages)):
|
||||
page = reader.pages[page_num]
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
|
||||
for page in pdf_reader.pages:
|
||||
pdf_text += page.extract_text()
|
||||
else:
|
||||
current_app.logger.error(f'The physical file for document version {document_version.id} '
|
||||
f'for tenant {tenant.id} '
|
||||
f'at {file_path} does not exist')
|
||||
create_embeddings.update_state(state=states.FAILURE)
|
||||
raise
|
||||
|
||||
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
output_file = os.path.join(base_path, markdown_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(markdown)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
@@ -175,43 +159,29 @@ def delete_embeddings_for_document_version(document_version):
|
||||
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
html_content = file_data.decode('utf-8')
|
||||
|
||||
# The tags to be considered can be dependent on the tenant
|
||||
html_tags = model_variables['html_tags']
|
||||
html_end_tags = model_variables['html_end_tags']
|
||||
html_included_elements = model_variables['html_included_elements']
|
||||
html_excluded_elements = model_variables['html_excluded_elements']
|
||||
|
||||
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location)
|
||||
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location,
|
||||
document_version.file_name)
|
||||
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'rb') as f:
|
||||
html_content = f.read()
|
||||
else:
|
||||
current_app.logger.error(f'The physical file for document version {document_version.id} '
|
||||
f'for tenant {tenant.id} '
|
||||
f'at {file_path} does not exist')
|
||||
create_embeddings.update_state(state=states.FAILURE)
|
||||
raise
|
||||
|
||||
extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
|
||||
excluded_elements=html_excluded_elements)
|
||||
|
||||
extracted_file_name = f'{document_version.id}-extracted.html'
|
||||
output_file = os.path.join(base_path, extracted_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(extracted_html)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
extracted_file_name, extracted_html.encode())
|
||||
|
||||
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
output_file = os.path.join(base_path, markdown_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(markdown)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
@@ -222,7 +192,7 @@ def process_html(tenant, model_variables, document_version):
|
||||
else:
|
||||
document_version.system_context = (f'Title: {title}\n')
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
try:
|
||||
@@ -241,16 +211,17 @@ def process_html(tenant, model_variables, document_version):
|
||||
f'on document version {document_version.id} :-)')
|
||||
|
||||
|
||||
def enrich_chunks(tenant, document_version, chunks):
|
||||
def enrich_chunks(tenant, document_version, title, chunks):
|
||||
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
|
||||
chunk_total_context = (f'Filename: {document_version.file_name}\n'
|
||||
f'User Context:{document_version.user_context}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'{document_version.system_context}\n\n')
|
||||
enriched_chunks = []
|
||||
initial_chunk = (f'Filename: {document_version.file_name}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'Title: {title}\n'
|
||||
f'{chunks[0]}')
|
||||
|
||||
enriched_chunks.append(initial_chunk)
|
||||
@@ -311,7 +282,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
text_to_summarize = doc_creator.create_documents(chunk)
|
||||
|
||||
try:
|
||||
summary = chain.run(text_to_summarize)
|
||||
summary = chain.invoke({"text": text_to_summarize})
|
||||
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}.')
|
||||
return summary
|
||||
@@ -391,23 +362,26 @@ def process_youtube(tenant, model_variables, document_version):
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
|
||||
# Remove existing files (in case of a re-processing of the file
|
||||
safe_remove(os.path.join(base_path, download_file_name))
|
||||
safe_remove(os.path.join(base_path, compressed_file_name))
|
||||
safe_remove(os.path.join(base_path, transcription_file_name))
|
||||
safe_remove(os.path.join(base_path, markdown_file_name))
|
||||
sync_folder(base_path)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, download_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, compressed_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, transcription_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, markdown_file_name)
|
||||
|
||||
of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
|
||||
of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
|
||||
download_file_name)
|
||||
document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
|
||||
compress_audio(base_path, download_file_name, compressed_file_name, tenant)
|
||||
transcribe_audio(base_path, compressed_file_name, transcription_file_name,
|
||||
document_version.language, tenant, model_variables)
|
||||
annotate_transcription(base_path, transcription_file_name, markdown_file_name,
|
||||
document_version.language, tenant, model_variables)
|
||||
compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
|
||||
transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
|
||||
annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
@@ -427,27 +401,41 @@ def process_youtube(tenant, model_variables, document_version):
|
||||
f'on Youtube document version {document_version.id} :-)')
|
||||
|
||||
|
||||
def download_youtube(url, file_location, file_name, tenant):
|
||||
def download_youtube(url, tenant_id, document_version, file_name):
|
||||
try:
|
||||
current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
|
||||
yt = YouTube(url)
|
||||
stream = yt.streams.get_audio_only()
|
||||
output_file = stream.download(output_path=file_location, filename=file_name)
|
||||
current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
|
||||
return output_file, yt.title, yt.description, yt.author
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||
stream.download(output_path=temp_file.name)
|
||||
with open(temp_file.name, 'rb') as f:
|
||||
file_data = f.read()
|
||||
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
file_name, file_data)
|
||||
|
||||
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
|
||||
return file_name, yt.title, yt.description, yt.author
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
|
||||
f'tenant: {tenant.id} with error: {e}')
|
||||
current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def compress_audio(file_location, input_file, output_file, tenant):
|
||||
def compress_audio(tenant_id, document_version, input_file, output_file):
|
||||
try:
|
||||
current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
|
||||
|
||||
# Run the compression script
|
||||
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
|
||||
temp_input.write(input_data)
|
||||
temp_input.flush()
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
|
||||
result = subprocess.run(
|
||||
['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
|
||||
['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
@@ -455,55 +443,30 @@ def compress_audio(file_location, input_file, output_file, tenant):
|
||||
if result.returncode != 0:
|
||||
raise Exception(f"Compression failed: {result.stderr}")
|
||||
|
||||
output_file_path = os.path.join(file_location, output_file)
|
||||
with open(temp_output.name, 'rb') as f:
|
||||
compressed_data = f.read()
|
||||
|
||||
# Additional check for file stability
|
||||
previous_size = -1
|
||||
stable_count = 0
|
||||
max_attempts = 12 # 1 minute total wait time
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
output_file, compressed_data)
|
||||
|
||||
for _ in range(max_attempts):
|
||||
if os.path.exists(output_file_path):
|
||||
current_size = os.path.getsize(output_file_path)
|
||||
if current_size == previous_size:
|
||||
stable_count += 1
|
||||
if stable_count >= 3: # File size hasn't changed for 3 checks
|
||||
break
|
||||
else:
|
||||
stable_count = 0
|
||||
previous_size = current_size
|
||||
gevent.sleep(5)
|
||||
|
||||
if stable_count < 3:
|
||||
raise Exception("File size did not stabilize within the expected time")
|
||||
|
||||
current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
|
||||
return output_file_path
|
||||
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
|
||||
current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
|
||||
def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
|
||||
try:
|
||||
current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
|
||||
client = model_variables['transcription_client']
|
||||
model = model_variables['transcription_model']
|
||||
input_file_path = os.path.join(file_location, input_file)
|
||||
output_file_path = os.path.join(file_location, output_file)
|
||||
|
||||
# Wait for the input file to exist
|
||||
count = 0
|
||||
while not os.path.exists(input_file_path) and count < 10:
|
||||
gevent.sleep(1)
|
||||
current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
|
||||
count += 1
|
||||
# Download the audio file from MinIO
|
||||
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
|
||||
if not os.path.exists(input_file_path):
|
||||
raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
|
||||
|
||||
# Load the audio file
|
||||
audio = AudioSegment.from_file(input_file_path)
|
||||
# Load the audio data into pydub
|
||||
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
|
||||
|
||||
# Define segment length (e.g., 10 minutes)
|
||||
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
|
||||
@@ -512,14 +475,16 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
|
||||
|
||||
# Split audio into segments and transcribe each
|
||||
for i, chunk in enumerate(audio[::segment_length]):
|
||||
current_app.logger.debug(f'Transcribing chunk {i} of {len(audio) // segment_length} ')
|
||||
current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
|
||||
chunk.export(temp_audio.name, format="mp3")
|
||||
|
||||
with open(temp_audio.name, 'rb') as audio_segment:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
file=audio_segment,
|
||||
model=model,
|
||||
language=language,
|
||||
language=document_version.language,
|
||||
response_format='verbose_json',
|
||||
)
|
||||
|
||||
@@ -530,20 +495,25 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
|
||||
# Combine all transcriptions
|
||||
full_transcription = " ".join(transcriptions)
|
||||
|
||||
# Write the full transcription to the output file
|
||||
with open(output_file_path, 'w') as f:
|
||||
f.write(full_transcription)
|
||||
# Upload the full transcription to MinIO
|
||||
minio_client.upload_document_file(
|
||||
tenant_id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
output_file,
|
||||
full_transcription.encode('utf-8')
|
||||
)
|
||||
|
||||
current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
|
||||
f'with error: {e}')
|
||||
current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
|
||||
def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
|
||||
try:
|
||||
current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
|
||||
current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
|
||||
|
||||
char_splitter = CharacterTextSplitter(separator='.',
|
||||
chunk_size=model_variables['annotation_chunk_length'],
|
||||
@@ -552,18 +522,21 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
# ("###", "Header 3"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
|
||||
|
||||
llm = model_variables['llm']
|
||||
template = model_variables['transcript_template']
|
||||
language_template = create_language_template(template, language)
|
||||
language_template = create_language_template(template, document_version.language)
|
||||
transcript_prompt = ChatPromptTemplate.from_template(language_template)
|
||||
setup = RunnablePassthrough()
|
||||
output_parser = StrOutputParser()
|
||||
with open(os.path.join(file_location, input_file), 'r') as f:
|
||||
transcript = f.read()
|
||||
|
||||
# Download the transcription file from MinIO
|
||||
transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
|
||||
document_version.language, document_version.id,
|
||||
input_file)
|
||||
transcript = transcript_data.decode('utf-8')
|
||||
|
||||
chain = setup | transcript_prompt | llm | output_parser
|
||||
|
||||
@@ -598,38 +571,53 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
|
||||
markdown_chunks.pop()
|
||||
all_markdown_chunks += markdown_chunks
|
||||
|
||||
|
||||
all_markdown_chunks += [last_markdown_chunk]
|
||||
|
||||
annotated_transcript = '\n'.join(all_markdown_chunks)
|
||||
|
||||
with open(os.path.join(file_location, output_file), 'w') as f:
|
||||
f.write(annotated_transcript)
|
||||
# Upload the annotated transcript to MinIO
|
||||
minio_client.upload_document_file(
|
||||
tenant.id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
output_file,
|
||||
annotated_transcript.encode('utf-8')
|
||||
)
|
||||
|
||||
current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
|
||||
current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
|
||||
f'with error: {e}')
|
||||
current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
|
||||
current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
|
||||
markdown = ''
|
||||
with open(os.path.join(base_path, input_file), 'r') as f:
|
||||
markdown = f.read()
|
||||
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
|
||||
try:
|
||||
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
|
||||
|
||||
# Download the markdown file from MinIO
|
||||
markdown_data = minio_client.download_document_file(tenant_id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
input_file
|
||||
)
|
||||
markdown = markdown_data.decode('utf-8')
|
||||
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
# ("###", "Header 3"),
|
||||
]
|
||||
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
|
||||
md_header_splits = markdown_splitter.split_text(markdown)
|
||||
potential_chunks = [doc.page_content for doc in md_header_splits]
|
||||
|
||||
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
|
||||
return potential_chunks
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
|
||||
|
||||
28
nginx/public/chat_eveai_mini.html
Normal file
28
nginx/public/chat_eveai_mini.html
Normal file
@@ -0,0 +1,28 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Chat Client EveAI Mini</title>
|
||||
<link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
|
||||
<script src="https://cdn.socket.io/4.0.1/socket.io.min.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
<script src="/static/js/eveai-sdk.js" defer></script>
|
||||
<script src="/static/js/eveai-chat-widget.js" defer></script>
|
||||
<link rel="stylesheet" href="/static/css/eveai-chat-style.css">
|
||||
</head>
|
||||
<body>
|
||||
<div id="chat-container"></div>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const eveAI = new EveAI(
|
||||
'6',
|
||||
'EveAI-CHAT-3622-2083-4559-6024-8786',
|
||||
'http://macstudio.ask-eve-ai-local.com',
|
||||
'en'
|
||||
);
|
||||
eveAI.initializeChat('chat-container');
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
28
nginx/public/chat_flow.html
Normal file
28
nginx/public/chat_flow.html
Normal file
@@ -0,0 +1,28 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Chat Client AE</title>
|
||||
<link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
|
||||
<script src="https://cdn.socket.io/4.0.1/socket.io.min.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
<script src="/static/js/eveai-sdk.js" defer></script>
|
||||
<script src="/static/js/eveai-chat-widget.js" defer></script>
|
||||
<link rel="stylesheet" href="/static/css/eveai-chat-style.css">
|
||||
</head>
|
||||
<body>
|
||||
<div id="chat-container"></div>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const eveAI = new EveAI(
|
||||
'2',
|
||||
'EveAI-CHAT-8716-3188-4285-8044-9932',
|
||||
'http://macstudio.ask-eve-ai-local.com',
|
||||
'en'
|
||||
);
|
||||
eveAI.initializeChat('chat-container');
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -76,3 +76,7 @@ groq~=0.9.0
|
||||
pydub~=0.25.1
|
||||
argparse~=1.4.0
|
||||
portkey_ai~=1.7.0
|
||||
|
||||
minio~=7.2.7
|
||||
Werkzeug~=3.0.3
|
||||
itsdangerous~=2.2.0
|
||||
@@ -7,7 +7,7 @@ export PYTHONPATH="$PROJECT_DIR/patched_packages:$PYTHONPATH:$PROJECT_DIR" # In
|
||||
# Set flask environment variables
|
||||
#export FLASK_ENV=development # Use 'production' as appropriate
|
||||
#export FLASK_DEBUG=1 # Use 0 for production
|
||||
print "Starting EveAI Chat"
|
||||
echo "Starting EveAI Chat"
|
||||
|
||||
# Start Flask app
|
||||
gunicorn -w 4 -k geventwebsocket.gunicorn.workers.GeventWebSocketWorker -b 0.0.0.0:5002 scripts.run_eveai_chat:app
|
||||
|
||||
Reference in New Issue
Block a user