- Significantly changed the PDF Processor to use Mistral's OCR model

- Ensured that very long chunks are split into smaller chunks
- Ensured that TrackedMistralAIEmbedding batches its requests when needed for correct execution
- Upgraded several packages to newer versions
This commit is contained in:
Josako
2025-04-16 15:39:16 +02:00
parent 5f58417d24
commit 4bf12db142
10 changed files with 518 additions and 91 deletions

View File

@@ -7,6 +7,7 @@ from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm
from .base_processor import BaseProcessor
@@ -21,6 +22,7 @@ class PDFProcessor(BaseProcessor):
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
self.ocr_client = TrackedMistralOcrClient()
def process(self):
self._log("Starting PDF processing")
@@ -30,14 +32,10 @@ class PDFProcessor(BaseProcessor):
self.document_version.bucket_name,
self.document_version.object_name,
)
with current_event.create_span("PDF Extraction"):
extracted_content = self._extract_content(file_data)
structured_content, title = self._structure_content(extracted_content)
file_name = f"{self.document_version.bucket_name}_{self.document_version.object_name.replace("/", "_")}"
with current_event.create_span("Markdown Generation"):
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
markdown, title = self.ocr_client.process_pdf(file_name, file_data)
self._save_markdown(markdown)
self._log("Finished processing PDF")