- Introduction of the Automatic HTML Processor

- Translation Service improvement - Enable activation / deactivation of Processors - Renew API keys for Mistral (leading to workspaces) - Align all Document views to use a session catalog - Allow for different processors for the same file type
2025-06-26 14:38:40 +02:00
parent f5c9542a49
commit fda267b479
35 changed files with 551 additions and 356 deletions
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy import or_
 from sqlalchemy.exc import SQLAlchemyError
+import traceback

 from common.extensions import db, cache_manager
 from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
 from eveai_workers.processors.processor_registry import ProcessorRegistry

-from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
+from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
+    EveAINoProcessorFound

 from common.utils.config_field_types import json_to_pattern_list

@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
        catalog = Catalog.query.get_or_404(catalog_id)

        # Define processor related information
-        processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
        processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
+        processor_class = ProcessorRegistry.get_processor_class(processor.type)

    except Exception as e:
        current_app.logger.error(f'Create Embeddings request received '
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
        delete_embeddings_for_document_version(document_version)

        try:
-            with current_event.create_span(f"{processor_type} Processing"):
+            with current_event.create_span(f"{processor.type} Processing"):
                document_processor = processor_class(
                    tenant=tenant,
                    document_version=document_version,
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
                    'markdown': markdown,
                    'title': title
                })
+                if not markdown or markdown.strip() == '':
+                    raise EveAINoContentFound(document_version.doc_id, document_version.id)

            with current_event.create_span("Embedding"):
                embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
            current_event.log("Finished Embedding Creation Task")

        except Exception as e:
+            stacktrace = traceback.format_exc()
            current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
-                                     f'on document version {document_version_id} '
-                                     f'error: {e}')
+                                 f'on document version {document_version_id} '
+                                 f'error: {e}\n'
+                                 f'Stacktrace: {stacktrace}')
            document_version.processing = False
            document_version.processing_finished_at = dt.now(tz.utc)
            document_version.processing_error = str(e)[:255]
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
        ValueError: If no matching processor is found
    """
    try:
+        current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
        # Start with base query for catalog
-        query = Processor.query.filter_by(catalog_id=catalog_id)
-
-        # Find processor type that handles this file type
-        matching_processor_type = None
-        for proc_type, config in PROCESSOR_TYPES.items():
-            supported_types = config['file_types']
-            if isinstance(supported_types, str):
-                supported_types = [t.strip() for t in supported_types.split(',')]
-
-            if file_type in supported_types:
-                matching_processor_type = proc_type
-                break
-
-        if not matching_processor_type:
-            raise ValueError(f"No processor type found for file type: {file_type}")
-
-        # Add processor type condition
-        query = query.filter_by(type=matching_processor_type)
+        query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)

        # If sub_file_type is provided, add that condition
        if sub_file_type:
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
            # If no sub_file_type, prefer processors without sub_file_type specification
            query = query.filter(or_(Processor.sub_file_type.is_(None),
                                     Processor.sub_file_type == ''))
+        
+        available_processors = query.all()

-        # Get the first matching processor
-        processor = query.first()
+        if not available_processors:
+            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
+        available_processor_types = [processor.type for processor in available_processors]
+        current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
+
+        # Find processor type that handles this file type
+        matching_processor_type = None
+        for proc_type, config in PROCESSOR_TYPES.items():
+            # Only consider this processor type if it is available (active) in the database for this catalog
+            if proc_type in available_processor_types:
+                supported_types = config['file_types']
+                if isinstance(supported_types, str):
+                    supported_types = [t.strip() for t in supported_types.split(',')]
+                    current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
+
+                if file_type in supported_types:
+                    matching_processor_type = proc_type
+                    break
+
+        current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
+        if not matching_processor_type:
+            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
+        else:
+            current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
+
+        processor = None
+        for proc in available_processors:
+            if proc.type == matching_processor_type:
+                processor = proc
+                break

        if not processor:
-            if sub_file_type:
-                raise ValueError(
-                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
-                    f"file type {file_type}, sub-type {sub_file_type}"
-                )
-            else:
-                raise ValueError(
-                    f"No processor found for catalog {catalog_id}, "
-                    f"file type {file_type}"
-                )
+            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)

+        current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
        return processor

    except Exception as e: