- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
This commit is contained in:
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding, Document
from common.extensions import db, minio_client, template_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
from common.models.user import Tenant
from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
from common.utils.model_utils import create_language_template, get_model_variables
from common.utils.business_event import BusinessEvent
from common.utils.business_event_context import current_event
from config.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
# Healthcheck task
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
# Retrieve the Catalog ID
doc = Document.query.get_or_404(document_version.doc_id)
catalog_id = doc.catalog_id
catalog = Catalog.query.get_or_404(catalog_id)
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant, catalog_id=catalog_id)
current_app.logger.debug(f'Model variables: {model_variables}')
model_variables = get_model_variables(tenant_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
f'for non existing document version {document_version_id} '
f'for badly configured document version {document_version_id} '
f'for tenant {tenant_id}, '
f'error: {e}')
raise
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
match document_version.file_type:
case 'pdf':
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case 'srt':
process_srt(tenant, model_variables, document_version)
case 'mp4' | 'mp3' | 'ogg':
process_audio(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
with current_event.create_span(f"{processor_type} Processing"):
document_processor = processor_class(
tenant=tenant,
model_variables=model_variables,
document_version=document_version,
catalog=catalog,
processor=processor
)
markdown, title = document_processor.process()
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
current_event.log("Finished Embedding Creation Task")
except Exception as e:
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
raise
def process_pdf(tenant, model_variables, document_version):
    """Convert a PDF document version to markdown, then embed the result.

    The conversion runs inside a "PDF Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("PDF Processing"):
        markdown, title = PDFProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_html(tenant, model_variables, document_version):
    """Convert an HTML document version to markdown, then embed the result.

    The conversion runs inside an "HTML Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("HTML Processing"):
        markdown, title = HTMLProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_audio(tenant, model_variables, document_version):
    """Transcribe an audio document version to markdown, then embed the result.

    The transcription runs inside an "Audio Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("Audio Processing"):
        markdown, title = AudioProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_srt(tenant, model_variables, document_version):
    """Convert an SRT subtitle document version to markdown, then embed the result.

    The conversion runs inside an "SRT Processing" tracing span; the
    subsequent embedding runs inside its own "Embedding" span.
    """
    with current_event.create_span("SRT Processing"):
        markdown, title = SRTProcessor(tenant, model_variables, document_version).process()
    # Embed the markdown produced by the processor above
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
def embed_markdown(tenant, model_variables, document_version, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = ''
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
enriched_chunk = f'{chunk_total_context}\n{chunk}'
enriched_chunks.append(enriched_chunk)
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
return enriched_chunks
def summarize_chunk(tenant, model_variables, document_version, chunk):
current_event.log("Starting Summarizing Chunk")
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['summary_template']
llm = model_variables.get_llm()
template = model_variables.get_template("summary")
language_template = create_language_template(template, document_version.language)
summary_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
chain = setup | summary_prompt | llm | output_parser
try:
current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = chain.invoke({"text": chunk})
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}.')
current_event.log("Finished Summarizing Chunk")
return summary
except LangChainException as e:
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']
embedding_model = model_variables.embedding_model
try:
embeddings = embedding_model.embed_documents(chunks)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} while calling OpenAI API'
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
# Add embeddings to the database
new_embeddings = []
for chunk, embedding in zip(chunks, embeddings):
new_embedding = model_variables['embedding_db_model']()
new_embedding = model_variables.embedding_model_class()
new_embedding.document_version = document_version
new_embedding.active = True
new_embedding.chunk = chunk
new_embedding.embedding = embedding
new_embeddings.append(new_embedding)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
return new_embeddings
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    """Emit embed-tuning diagnostics describing the HTML parsing configuration.

    Logs only when the tenant has ``embed_tuning`` enabled; otherwise this is
    a no-op.

    Args:
        tenant: Tenant whose ``embed_tuning`` flag gates the logging.
        tags: Tags selected for parsing.
        included_elements: Elements explicitly included.
        excluded_elements: Elements explicitly excluded.
        excluded_classes: CSS classes explicitly excluded.
        elements_to_parse: Elements that survived the include/exclude filters.
    """
    if tenant.embed_tuning:
        logger = current_app.embed_tuning_logger
        logger.debug(f'Tags to parse: {tags}')
        logger.debug(f'Included Elements: {included_elements}')
        logger.debug(f'Excluded Elements: {excluded_elements}')
        logger.debug(f'Excluded Classes: {excluded_classes}')
        logger.debug(f'Found {len(elements_to_parse)} elements to parse')
        # Fix: elements_to_parse can legitimately be empty (the length line above
        # already logs 0 in that case); indexing [0] would raise IndexError.
        if elements_to_parse:
            logger.debug(f'First element to parse: {elements_to_parse[0]}')
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
return potential_chunks
except Exception as e:
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk)
return actual_chunks
def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
    """
    Get the appropriate processor for a document based on catalog_id, file_type and optional sub_file_type.

    Args:
        catalog_id: ID of the catalog
        file_type: Type of file (e.g., 'pdf', 'html')
        sub_file_type: Optional sub-type for specialized processing

    Returns:
        Processor instance

    Raises:
        ValueError: If no matching processor is found
    """
    try:
        # Resolve which configured processor type claims this file type.
        matching_processor_type = None
        for candidate_type, config in PROCESSOR_TYPES.items():
            handled_types = config['file_types']
            # The config may list file types as a comma-separated string.
            if isinstance(handled_types, str):
                handled_types = [entry.strip() for entry in handled_types.split(',')]
            if file_type in handled_types:
                matching_processor_type = candidate_type
                break

        if not matching_processor_type:
            raise ValueError(f"No processor type found for file type: {file_type}")

        # Narrow the catalog's processors to the resolved type in one step.
        query = Processor.query.filter_by(catalog_id=catalog_id, type=matching_processor_type)

        if sub_file_type:
            query = query.filter_by(sub_file_type=sub_file_type)
        else:
            # No sub-type requested: only accept processors without one configured.
            query = query.filter(or_(Processor.sub_file_type.is_(None),
                                     Processor.sub_file_type == ''))

        processor = query.first()
        if not processor:
            if sub_file_type:
                raise ValueError(
                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
                    f"file type {file_type}, sub-type {sub_file_type}"
                )
            raise ValueError(
                f"No processor found for catalog {catalog_id}, "
                f"file type {file_type}"
            )
        return processor
    except Exception as e:
        # Log and propagate so the calling task records the failure.
        current_app.logger.error(f"Error finding processor: {str(e)}")
        raise