- Significantly changed the PDF Processor to use Mistral's OCR model
- Ensured very long chunks are split into smaller chunks
- Ensured TrackedMistralAIEmbedding batches requests when needed, for correct execution
- Upgraded some packages to newer versions
This commit is contained in:
@@ -7,6 +7,7 @@ from langchain_core.prompts import ChatPromptTemplate
|
||||
import re
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
|
||||
from common.extensions import minio_client
|
||||
from common.utils.model_utils import create_language_template, get_embedding_llm
|
||||
from .base_processor import BaseProcessor
|
||||
@@ -21,6 +22,7 @@ class PDFProcessor(BaseProcessor):
|
||||
self.chunk_size = catalog.max_chunk_size
|
||||
self.chunk_overlap = 0
|
||||
self.tuning = self.processor.tuning
|
||||
self.ocr_client = TrackedMistralOcrClient()
|
||||
|
||||
def process(self):
|
||||
self._log("Starting PDF processing")
|
||||
@@ -30,14 +32,10 @@ class PDFProcessor(BaseProcessor):
|
||||
self.document_version.bucket_name,
|
||||
self.document_version.object_name,
|
||||
)
|
||||
|
||||
with current_event.create_span("PDF Extraction"):
|
||||
extracted_content = self._extract_content(file_data)
|
||||
structured_content, title = self._structure_content(extracted_content)
|
||||
file_name = f"{self.document_version.bucket_name}_{self.document_version.object_name.replace("/", "_")}"
|
||||
|
||||
with current_event.create_span("Markdown Generation"):
|
||||
llm_chunks = self._split_content_for_llm(structured_content)
|
||||
markdown = self._process_chunks_with_llm(llm_chunks)
|
||||
markdown, title = self.ocr_client.process_pdf(file_name, file_data)
|
||||
|
||||
self._save_markdown(markdown)
|
||||
self._log("Finished processing PDF")
|
||||
|
||||
Reference in New Issue
Block a user