- Significantly changed the PDF Processor to use Mistral's OCR model

- Ensured that very long chunks are split into smaller chunks
- Ensured that TrackedMistralAIEmbedding batches its requests when needed for correct execution
- Upgraded several packages to newer versions
This commit is contained in:
Josako
2025-04-16 15:39:16 +02:00
parent 5f58417d24
commit 4bf12db142
10 changed files with 518 additions and 91 deletions

View File

@@ -7,6 +7,7 @@ from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm
from .base_processor import BaseProcessor
@@ -21,6 +22,7 @@ class PDFProcessor(BaseProcessor):
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
self.ocr_client = TrackedMistralOcrClient()
def process(self):
self._log("Starting PDF processing")
@@ -30,14 +32,10 @@ class PDFProcessor(BaseProcessor):
self.document_version.bucket_name,
self.document_version.object_name,
)
with current_event.create_span("PDF Extraction"):
extracted_content = self._extract_content(file_data)
structured_content, title = self._structure_content(extracted_content)
file_name = f"{self.document_version.bucket_name}_{self.document_version.object_name.replace("/", "_")}"
with current_event.create_span("Markdown Generation"):
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
markdown, title = self.ocr_client.process_pdf(file_name, file_data)
self._save_markdown(markdown)
self._log("Finished processing PDF")