import re
import time

from flask import current_app
from mistralai import Mistral

from common.utils.business_event_context import current_event


class TrackedMistralOcrClient:
    """Mistral OCR client that reports per-call metrics to the current event.

    Each successful ``process_pdf`` call logs the page count and elapsed
    wall-clock time through ``current_event.log_llm_metrics``.
    """

    def __init__(self):
        """Create the underlying Mistral client.

        The API key is read from ``current_app.config['MISTRAL_API_KEY']``,
        so the instance must be constructed while a Flask application
        context is active.

        Raises:
            KeyError: If ``MISTRAL_API_KEY`` is missing from the app config.
        """
        api_key = current_app.config['MISTRAL_API_KEY']
        self.client = Mistral(
            api_key=api_key,
        )
        self.model = "mistral-ocr-latest"

    def _get_title(self, markdown):
        """Return the text of the first level-1 heading in *markdown*.

        A level-1 heading is a line that starts with ``# ``. The heading
        text is stripped of surrounding whitespace. Returns ``None`` when
        no such line exists.
        """
        # Look for the first level-1 heading
        match = re.search(r'^# (.+)', markdown, re.MULTILINE)
        return match.group(1).strip() if match else None

    def _get_title_from_pages(self, pages):
        """Return the first level-1 heading found across *pages*, in order.

        Pages are searched one by one rather than in the space-joined
        document: in the joined text the first line of every page after
        the first is preceded by a space, so a heading located there would
        not be at a line start and would be missed by ``_get_title``.
        """
        for page in pages:
            title = self._get_title(page.markdown)
            if title is not None:
                return title
        return None

    def process_pdf(self, file_name, file_content):
        """Run Mistral OCR on a PDF and return its markdown and title.

        The file is uploaded with purpose ``"ocr"``, a signed URL is
        requested for it, and that URL is passed to the OCR endpoint
        (images are not requested: ``include_image_base64=False``).

        Args:
            file_name: Name sent along with the upload.
            file_content: Content sent with the upload; passed through
                unchanged (presumably the raw PDF bytes - confirm against
                callers).

        Returns:
            A ``(all_markdown, title)`` tuple. ``all_markdown`` is the
            markdown of every page joined with a single space; ``title``
            is the first level-1 heading found in any page, or ``None``.

        Note:
            Metrics are logged only when every API call succeeds; an
            exception raised by the Mistral client propagates to the
            caller before ``log_llm_metrics`` is reached.
        """
        start_time = time.time()

        uploaded_pdf = self.client.files.upload(
            file={
                "file_name": file_name,
                "content": file_content
            },
            purpose="ocr"
        )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)

        ocr_response = self.client.ocr.process(
            model=self.model,
            document={
                "type": "document_url",
                "document_url": signed_url.url
            },
            include_image_base64=False
        )

        pages = ocr_response.pages
        nr_of_pages = len(pages)
        # The returned document keeps the original single-space separator.
        all_markdown = " ".join(page.markdown for page in pages)
        # Title lookup is per page so headings at a page start are found.
        title = self._get_title_from_pages(pages)

        end_time = time.time()

        metrics = {
            'nr_of_pages': nr_of_pages,
            'time_elapsed': end_time - start_time,
            'interaction_type': 'OCR',
        }
        current_event.log_llm_metrics(metrics)

        return all_markdown, title