- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
This commit is contained in:
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding, Document
from common.extensions import db, minio_client, template_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
from common.models.user import Tenant
from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
from common.utils.model_utils import create_language_template, get_model_variables
from common.utils.business_event import BusinessEvent
from common.utils.business_event_context import current_event
from config.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
# Healthcheck task
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
# Retrieve the Catalog ID
doc = Document.query.get_or_404(document_version.doc_id)
catalog_id = doc.catalog_id
catalog = Catalog.query.get_or_404(catalog_id)
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant, catalog_id=catalog_id)
current_app.logger.debug(f'Model variables: {model_variables}')
model_variables = get_model_variables(tenant_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
f'for non existing document version {document_version_id} '
f'for badly configured document version {document_version_id} '
f'for tenant {tenant_id}, '
f'error: {e}')
raise
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
match document_version.file_type:
case 'pdf':
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case 'srt':
process_srt(tenant, model_variables, document_version)
case 'mp4' | 'mp3' | 'ogg':
process_audio(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
with current_event.create_span(f"{processor_type} Processing"):
document_processor = processor_class(
tenant=tenant,
model_variables=model_variables,
document_version=document_version,
catalog=catalog,
processor=processor
)
markdown, title = document_processor.process()
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
current_event.log("Finished Embedding Creation Task")
except Exception as e:
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
raise
def process_pdf(tenant, model_variables, document_version):
    """Convert a PDF document version to markdown, then embed the result.

    The conversion runs inside a "PDF Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("PDF Processing"):
        markdown, title = PDFProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_html(tenant, model_variables, document_version):
    """Convert an HTML document version to markdown, then embed the result.

    The conversion runs inside an "HTML Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("HTML Processing"):
        markdown, title = HTMLProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_audio(tenant, model_variables, document_version):
    """Transcribe an audio document version to markdown, then embed the result.

    The transcription runs inside an "Audio Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("Audio Processing"):
        markdown, title = AudioProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_srt(tenant, model_variables, document_version):
    """Convert an SRT subtitle document version to markdown, then embed the result.

    The conversion runs inside an "SRT Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("SRT Processing"):
        markdown, title = SRTProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def embed_markdown(tenant, model_variables, document_version, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = ''
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
enriched_chunk = f'{chunk_total_context}\n{chunk}'
enriched_chunks.append(enriched_chunk)
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
return enriched_chunks
def summarize_chunk(tenant, model_variables, document_version, chunk):
current_event.log("Starting Summarizing Chunk")
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['summary_template']
llm = model_variables.get_llm()
template = model_variables.get_template("summary")
language_template = create_language_template(template, document_version.language)
summary_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
chain = setup | summary_prompt | llm | output_parser
try:
current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = chain.invoke({"text": chunk})
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}.')
current_event.log("Finished Summarizing Chunk")
return summary
except LangChainException as e:
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']
embedding_model = model_variables.embedding_model
try:
embeddings = embedding_model.embed_documents(chunks)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} while calling OpenAI API'
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
# Add embeddings to the database
new_embeddings = []
for chunk, embedding in zip(chunks, embeddings):
new_embedding = model_variables['embedding_db_model']()
new_embedding = model_variables.embedding_model_class()
new_embedding.document_version = document_version
new_embedding.active = True
new_embedding.chunk = chunk
new_embedding.embedding = embedding
new_embeddings.append(new_embedding)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
return new_embeddings
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    """Emit embed-tuning diagnostics describing the HTML parsing configuration.

    Logs only when the tenant has ``embed_tuning`` enabled; otherwise this is
    a no-op.

    Args:
        tenant: Tenant whose ``embed_tuning`` flag gates the logging.
        tags: Tags selected for parsing.
        included_elements: Elements explicitly included.
        excluded_elements: Elements explicitly excluded.
        excluded_classes: CSS classes explicitly excluded.
        elements_to_parse: Elements that survived the include/exclude filters.
    """
    if tenant.embed_tuning:
        logger = current_app.embed_tuning_logger
        logger.debug(f'Tags to parse: {tags}')
        logger.debug(f'Included Elements: {included_elements}')
        logger.debug(f'Excluded Elements: {excluded_elements}')
        logger.debug(f'Excluded Classes: {excluded_classes}')
        logger.debug(f'Found {len(elements_to_parse)} elements to parse')
        # Fix: elements_to_parse can legitimately be empty (the length line above
        # already logs 0 in that case); indexing [0] would raise IndexError.
        if elements_to_parse:
            logger.debug(f'First element to parse: {elements_to_parse[0]}')
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
return potential_chunks
except Exception as e:
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk)
return actual_chunks
def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
    """
    Get the appropriate processor for a document based on catalog_id, file_type and optional sub_file_type.

    Args:
        catalog_id: ID of the catalog
        file_type: Type of file (e.g., 'pdf', 'html')
        sub_file_type: Optional sub-type for specialized processing

    Returns:
        Processor instance

    Raises:
        ValueError: If no matching processor is found
    """
    try:
        # Resolve which configured processor type claims this file type.
        matching_processor_type = None
        for candidate_type, config in PROCESSOR_TYPES.items():
            handled_types = config['file_types']
            # The config may list file types as a comma-separated string.
            if isinstance(handled_types, str):
                handled_types = [entry.strip() for entry in handled_types.split(',')]
            if file_type in handled_types:
                matching_processor_type = candidate_type
                break

        if not matching_processor_type:
            raise ValueError(f"No processor type found for file type: {file_type}")

        # Narrow the catalog's processors to the resolved type in one step.
        query = Processor.query.filter_by(catalog_id=catalog_id, type=matching_processor_type)

        if sub_file_type:
            query = query.filter_by(sub_file_type=sub_file_type)
        else:
            # No sub-type requested: only accept processors without one configured.
            query = query.filter(or_(Processor.sub_file_type.is_(None),
                                     Processor.sub_file_type == ''))

        processor = query.first()
        if not processor:
            if sub_file_type:
                raise ValueError(
                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
                    f"file type {file_type}, sub-type {sub_file_type}"
                )
            raise ValueError(
                f"No processor found for catalog {catalog_id}, "
                f"file type {file_type}"
            )
        return processor
    except Exception as e:
        # Log and propagate so the calling task records the failure.
        current_app.logger.error(f"Error finding processor: {str(e)}")
        raise