- Significantly changed the PDF Processor to use Mistral's OCR model
- Ensure very long chunks get split into smaller chunks
- Ensure TrackedMistralAIEmbedding is batched if needed to ensure correct execution
- Upgraded some of the packages to a higher version
This commit is contained in:
53
common/eveai_model/tracked_mistral_ocr_client.py
Normal file
53
common/eveai_model/tracked_mistral_ocr_client.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import re
|
||||
import time
|
||||
|
||||
from flask import current_app
|
||||
from mistralai import Mistral
|
||||
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class TrackedMistralOcrClient:
    """Thin wrapper around the Mistral OCR API that logs usage metrics.

    Each successful call to :meth:`process_pdf` reports the number of pages
    processed and the elapsed time through ``current_event.log_llm_metrics``.
    """

    def __init__(self):
        """Create the Mistral client from the Flask app configuration.

        Reads ``MISTRAL_API_KEY`` from ``current_app.config``, so the instance
        has to be created while a Flask application is current.
        """
        api_key = current_app.config['MISTRAL_API_KEY']
        self.client = Mistral(
            api_key=api_key,
        )
        self.model = "mistral-ocr-latest"

    def _get_title(self, markdown):
        """Return the text of the first level-1 markdown heading, or None.

        Args:
            markdown: Markdown text to search.

        Returns:
            The stripped heading text of the first line starting with ``# ``,
            or ``None`` when there is no such line, the input is empty, or
            the heading consists only of whitespace.
        """
        if not markdown:
            return None
        # Look for the first level-1 heading
        match = re.search(r'^# (.+)', markdown, re.MULTILINE)
        if match is None:
            return None
        # A heading made of whitespace only is not a usable title.
        return match.group(1).strip() or None

    @staticmethod
    def _combine_pages(pages):
        """Join the markdown of all OCR pages into one document.

        Pages are separated by a blank line rather than a single space so that
        the first line of a page (often a heading) stays at the start of a
        line instead of being appended to the previous page's last line.
        """
        return "\n\n".join(page.markdown for page in pages)

    def process_pdf(self, file_name, file_content):
        """Run OCR on a PDF and return its markdown and detected title.

        The file is uploaded to Mistral with purpose ``"ocr"``, a signed URL
        is requested for it, and that URL is passed to the OCR endpoint.

        NOTE(review): the uploaded file is not deleted afterwards -- confirm
        that leaving it in Mistral file storage is intended.

        Args:
            file_name: Name to give the uploaded file.
            file_content: Content of the PDF, passed as-is to the upload call
                (presumably raw bytes -- verify against callers).

        Returns:
            A ``(markdown, title)`` tuple: the combined markdown of all pages
            and the first level-1 heading found in it (``None`` if absent).

        Raises:
            Whatever the Mistral client raises on upload or OCR failure; no
            metrics are logged in that case.
        """
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments.
        start_time = time.perf_counter()

        uploaded_pdf = self.client.files.upload(
            file={
                "file_name": file_name,
                "content": file_content,
            },
            purpose="ocr",
        )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        ocr_response = self.client.ocr.process(
            model=self.model,
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        nr_of_pages = len(ocr_response.pages)
        all_markdown = self._combine_pages(ocr_response.pages)
        title = self._get_title(all_markdown)
        time_elapsed = time.perf_counter() - start_time

        metrics = {
            'nr_of_pages': nr_of_pages,
            'time_elapsed': time_elapsed,
            'interaction_type': 'OCR',
        }
        current_event.log_llm_metrics(metrics)

        return all_markdown, title
|
||||
Reference in New Issue
Block a user