Files
eveAI/common/eveai_model/tracked_mistral_ocr_client.py
Josako 4bf12db142 - Significantly changed the PDF Processor to use Mistral's OCR model
- ensure very long chunks get split into smaller chunks
- ensure TrackedMistralAIEmbedding is batched if needed to ensure correct execution
- upgraded some of the packages to a higher version
2025-04-16 15:39:16 +02:00

54 lines
1.6 KiB
Python

import re
import time
from flask import current_app
from mistralai import Mistral
from common.utils.business_event_context import current_event
class TrackedMistralOcrClient:
def __init__(self):
api_key = current_app.config['MISTRAL_API_KEY']
self.client = Mistral(
api_key=api_key,
)
self.model = "mistral-ocr-latest"
def _get_title(self, markdown):
# Look for the first level-1 heading
match = re.search(r'^# (.+)', markdown, re.MULTILINE)
return match.group(1).strip() if match else None
def process_pdf(self, file_name, file_content):
start_time = time.time()
uploaded_pdf = self.client.files.upload(
file={
"file_name": file_name,
"content": file_content
},
purpose="ocr"
)
signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
ocr_response = self.client.ocr.process(
model=self.model,
document={
"type": "document_url",
"document_url": signed_url.url
},
include_image_base64=False
)
nr_of_pages = len(ocr_response.pages)
all_markdown = " ".join(page.markdown for page in ocr_response.pages)
title = self._get_title(all_markdown)
end_time = time.time()
metrics = {
'nr_of_pages': nr_of_pages,
'time_elapsed': end_time - start_time,
'interaction_type': 'OCR',
}
current_event.log_llm_metrics(metrics)
return all_markdown, title