- Introduce the Automagic HTML Processor

- Improve the Translation Service
- Enable activation/deactivation of Processors
- Renew the Mistral API keys (moving to workspaces)
- Align all Document views to use a session catalog
- Allow different processors for the same file type (see the sketch below)
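
The activation flag and the per-file-type lookup come together in get_processor_for_document, changed further down in this commit: only processors marked active in a catalog take part in the lookup, and the first configured processor type that both has an active row and supports the file type wins. The snippet below is a rough, self-contained sketch of that selection rule rather than the code from this commit; ProcessorRow, pick_processor and the inline PROCESSOR_TYPES values are illustrative stand-ins.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ProcessorRow:
    # Hypothetical stand-in for a Processor database row.
    type: str
    sub_file_type: Optional[str]
    active: bool

# Illustrative subset of config.type_defs.processor_types.PROCESSOR_TYPES.
PROCESSOR_TYPES = {
    "HTML_PROCESSOR": {"file_types": "html, htm"},
    "AUTOMAGIC_HTML_PROCESSOR": {"file_types": "html, htm"},
    "PDF_PROCESSOR": {"file_types": "pdf"},
}

def pick_processor(rows, file_type, sub_file_type=None):
    # Only active rows take part in the lookup (activation/deactivation).
    candidates = [r for r in rows if r.active
                  and (r.sub_file_type == sub_file_type if sub_file_type else not r.sub_file_type)]
    available_types = {r.type for r in candidates}
    # First configured type that is both available and supports the file type wins.
    for proc_type, config in PROCESSOR_TYPES.items():
        if proc_type not in available_types:
            continue
        supported = [t.strip() for t in config["file_types"].split(",")]
        if file_type in supported:
            return next(r for r in candidates if r.type == proc_type)
    return None

# Two processors cover the same file type; the active flag decides which one runs.
rows = [
    ProcessorRow("HTML_PROCESSOR", None, active=False),
    ProcessorRow("AUTOMAGIC_HTML_PROCESSOR", None, active=True),
]
assert pick_processor(rows, "html").type == "AUTOMAGIC_HTML_PROCESSOR"

With the previous behaviour a file type mapped to exactly one processor type; with the active flag a catalog can now switch between, for example, HTML_PROCESSOR and AUTOMAGIC_HTML_PROCESSOR for the same html files.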
Josako committed 2025-06-26 14:38:40 +02:00
parent f5c9542a49, commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -1,5 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor, automagic_html_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor']
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor', 'automagic_html_processor']
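
As the import comment above notes, each processor module registers itself as a side effect of being imported, so adding automagic_html_processor to this import list (and to __all__) is all that is needed to make the new processor discoverable. The actual ProcessorRegistry is not part of this diff; the class below is a minimal sketch of that import-time registration pattern, invented here for illustration.

class ProcessorRegistry:
    # Minimal sketch of an import-time registry; not the project's real class.
    _processors = {}

    @classmethod
    def register(cls, processor_type, processor_class):
        # Called at the bottom of each processor module when it is imported.
        cls._processors[processor_type] = processor_class

    @classmethod
    def get_processor_class(cls, processor_type):
        # Raises KeyError if the module defining this type was never imported.
        return cls._processors[processor_type]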

View File

@@ -0,0 +1,65 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
class AutomagicHTMLProcessor(BaseProcessor):
def __init__(self, tenant, document_version, catalog, processor):
super().__init__(tenant, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
self.prompt_params = {
"custom_instructions": self.processor.configuration.get("custom_instructions", ""),
}
template, llm = get_template("automagic_html_parse")
translation_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
self.chain = (setup | translation_prompt | llm | output_parser)
def process(self):
self._log("Starting Automagic HTML processing")
try:
# Get HTML-file data
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
# Invoke HTML Processing Agent
self.prompt_params["html"] = file_data
with current_event.create_span("Markdown Generation"):
markdown = self.chain.invoke(self.prompt_params)
self._save_markdown(markdown)
# Retrieve Title
match = re.search(r'^# (.+)', markdown, re.MULTILINE)
title = match.group(1).strip() if match else None
self._log("Finished Automagic HTML Processing")
return markdown, title
except Exception as e:
self._log(f"Error automagically processing HTML: {str(e)}", level='error')
raise
# Register the processor
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)
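
The title is taken from the first Markdown H1 heading anywhere in the generated output (the re.search call in process() above). A small standalone check of that regex, using an invented sample string:

import re

sample_markdown = "# Quarterly Report\n\nBody text.\n\n## Section 1\n"
match = re.search(r'^# (.+)', sample_markdown, re.MULTILINE)
title = match.group(1).strip() if match else None
assert title == "Quarterly Report"

If the generated Markdown contains no H1 at all, the regex finds nothing and title is returned as None.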

View File

@@ -44,185 +44,6 @@ class PDFProcessor(BaseProcessor):
self._log(f"Error processing PDF: {str(e)}", level='error')
raise
def _extract_content(self, file_data):
extracted_content = []
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
figure_counter = 1
for page_num, page in enumerate(pdf.pages):
self._log(f"Extracting content from page {page_num + 1}")
page_content = {
'text': page.extract_text(),
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
return extracted_content
def _extract_figures(self, page, page_num, figure_counter):
figures = []
# Omit figure processing for now!
# for img in page.images:
# try:
# # Try to get the bbox, use full page dimensions if not available
# bbox = img.get('bbox', (0, 0, page.width, page.height))
#
# figure = {
# 'figure_number': figure_counter,
# 'filename': f"figure_{page_num + 1}_{figure_counter}.png",
# 'caption': self._find_figure_caption(page, bbox)
# }
#
# # Extract the figure as an image
# figure_image = page.within_bbox(bbox).to_image()
#
# # Save the figure using MinIO
# with io.BytesIO() as output:
# figure_image.save(output, format='PNG')
# output.seek(0)
# minio_client.upload_document_file(
# self.tenant.id,
# self.document_version.doc_id,
# self.document_version.language,
# self.document_version.id,
# figure['filename'],
# output.getvalue()
# )
#
# figures.append(figure)
# figure_counter += 1
# except Exception as e:
# self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
return figures
def _find_figure_caption(self, page, bbox):
try:
# Look for text below the figure
caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
caption_text = page.crop(caption_bbox).extract_text()
if caption_text and caption_text.lower().startswith('figure'):
return caption_text
except Exception as e:
self._log(f"Error finding figure caption: {str(e)}", level='error')
return None
def _extract_tables(self, page):
tables = []
try:
for table in page.extract_tables():
if table:
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables
def _table_to_markdown(self, table):
if not table or not table[0]: # Check if table is empty or first row is empty
return "" # Return empty string for empty tables
def clean_cell(cell):
if cell is None:
return "" # Convert None to empty string
return str(cell).replace("|", "\\|") # Escape pipe characters and convert to string
header = [clean_cell(cell) for cell in table[0]]
markdown = "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table[1:]:
cleaned_row = [clean_cell(cell) for cell in row]
markdown += "| " + " | ".join(cleaned_row) + " |\n"
return markdown
def _structure_content(self, extracted_content):
structured_content = ""
title = "Untitled Document"
current_heading_level = 0
heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')
def identify_heading(text):
match = heading_pattern.match(text.strip())
if match:
numbering, _, content = match.groups()
if numbering:
level = numbering.count('.') + 1
return level, f"{numbering}{content}"
else:
return 1, content # Assume it's a top-level heading if no numbering
return 0, text # Not a heading
for page in extracted_content:
# Assume the title is on the first page
if page == extracted_content[0]:
lines = page.get('text', '').split('\n')
if lines:
title = lines[0].strip() # Use the first non-empty line as the title
# Process text
paragraphs = page['text'].split('\n\n')
for para in paragraphs:
lines = para.strip().split('\n')
if len(lines) == 1: # Potential heading
level, text = identify_heading(lines[0])
if level > 0:
heading_marks = '#' * level
structured_content += f"\n\n{heading_marks} {text}\n\n"
if level == 1 and not title:
title = text # Use the first top-level heading as the title if not set
else:
structured_content += f"{para}\n\n" # Treat as normal paragraph
else:
structured_content += f"{para}\n\n" # Multi-line paragraph
# Process figures
for figure in page.get('figures', []):
structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
if figure['caption']:
structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
# Add tables
if 'tables' in page:
for table in page['tables']:
structured_content += f"\n{table}\n"
if self.tuning:
self._save_intermediate(structured_content, "structured_content.md")
return structured_content, title
def _split_content_for_llm(self, content):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
return text_splitter.split_text(content)
def _process_chunks_with_llm(self, chunks):
template, llm = get_template('pdf_parse')
pdf_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | pdf_prompt | llm | output_parser
markdown_chunks = []
for chunk in chunks:
input = {"pdf_content": chunk}
result = chain.invoke(input)
result = self._clean_markdown(result)
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)

View File

@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
import traceback
from common.extensions import db, cache_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
EveAINoProcessorFound
from common.utils.config_field_types import json_to_pattern_list
@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
catalog = Catalog.query.get_or_404(catalog_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
processor_class = ProcessorRegistry.get_processor_class(processor.type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
with current_event.create_span(f"{processor_type} Processing"):
with current_event.create_span(f"{processor.type} Processing"):
document_processor = processor_class(
tenant=tenant,
document_version=document_version,
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
'markdown': markdown,
'title': title
})
if not markdown or markdown.strip() == '':
raise EveAINoContentFound(document_version.doc_id, document_version.id)
with current_event.create_span("Embedding"):
embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
current_event.log("Finished Embedding Creation Task")
except Exception as e:
stacktrace = traceback.format_exc()
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version_id} '
f'error: {e}')
f'on document version {document_version_id} '
f'error: {e}\n'
f'Stacktrace: {stacktrace}')
document_version.processing = False
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing_error = str(e)[:255]
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
ValueError: If no matching processor is found
"""
try:
current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
# Start with base query for catalog
query = Processor.query.filter_by(catalog_id=catalog_id)
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
matching_processor_type = proc_type
break
if not matching_processor_type:
raise ValueError(f"No processor type found for file type: {file_type}")
# Add processor type condition
query = query.filter_by(type=matching_processor_type)
query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)
# If sub_file_type is provided, add that condition
if sub_file_type:
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
# If no sub_file_type, prefer processors without sub_file_type specification
query = query.filter(or_(Processor.sub_file_type.is_(None),
Processor.sub_file_type == ''))
available_processors = query.all()
# Get the first matching processor
processor = query.first()
if not available_processors:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
available_processor_types = [processor.type for processor in available_processors]
current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
# Only consider this processor type if it is available in the database
if proc_type in available_processor_types:
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
if file_type in supported_types:
matching_processor_type = proc_type
break
current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
if not matching_processor_type:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
else:
current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
processor = None
for proc in available_processors:
if proc.type == matching_processor_type:
processor = proc
break
if not processor:
if sub_file_type:
raise ValueError(
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
f"file type {file_type}, sub-type {sub_file_type}"
)
else:
raise ValueError(
f"No processor found for catalog {catalog_id}, "
f"file type {file_type}"
)
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
return processor
except Exception as e: