# eveAI/common/eveai_model/tracked_mistral_ocr_client.py

import re
import time

from flask import current_app
from mistralai import Mistral

from common.utils.business_event_context import current_event


class TrackedMistralOcrClient:
    """Wrapper around the Mistral OCR API that records usage metrics.

    Each call to ``process_pdf`` uploads a document, runs OCR on it and
    reports the page count and elapsed time through
    ``current_event.log_llm_metrics``.
    """

    # Pages are separated by a blank line so that a block element opening a
    # page (heading, list, table) is not glued onto the last line of the
    # previous page, which would break both the markdown structure and the
    # line-anchored title lookup.
    _PAGE_SEPARATOR = "\n\n"

    def __init__(self):
        """Create the Mistral client from the Flask application config.

        Reads ``MISTRAL_API_KEY`` from ``current_app.config``, so the
        instance has to be created inside a Flask application context.
        """
        api_key = current_app.config['MISTRAL_API_KEY']
        self.client = Mistral(
            api_key=api_key,
        )
        self.model = "mistral-ocr-latest"

    def _get_title(self, markdown):
        """Return the text of the first level-1 heading, or ``None``.

        A level-1 heading is a line starting with ``# ``. Empty or ``None``
        input yields ``None`` instead of raising.
        """
        if not markdown:
            return None
        # Look for the first level-1 heading
        match = re.search(r'^# (.+)', markdown, re.MULTILINE)
        return match.group(1).strip() if match else None

    def _merge_pages(self, pages):
        """Concatenate the ``markdown`` of every page into one document.

        Pages without markdown (empty or ``None``) are skipped.
        """
        return self._PAGE_SEPARATOR.join(
            page.markdown for page in pages if page.markdown
        )

    def process_pdf(self, file_name, file_content):
        """Run OCR on a PDF and return its markdown and detected title.

        The file is uploaded with purpose ``"ocr"``, a signed URL is
        requested for it, and that URL is handed to the OCR endpoint
        (without base64 image payloads). Page count and elapsed seconds are
        logged as an ``'OCR'`` interaction on the current business event.

        Args:
            file_name: Name sent along with the upload.
            file_content: Content of the file to upload (presumably the raw
                PDF bytes; confirm against the caller).

        Returns:
            A ``(markdown, title)`` tuple: the markdown of all pages joined
            by blank lines, and the first level-1 heading found in it
            (``None`` when there is none).
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments during the remote calls.
        start_time = time.perf_counter()
        # NOTE(review): the uploaded file is never removed from Mistral's
        # file storage here - confirm whether cleanup happens elsewhere.
        uploaded_pdf = self.client.files.upload(
            file={
                "file_name": file_name,
                "content": file_content
            },
            purpose="ocr"
        )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        ocr_response = self.client.ocr.process(
            model=self.model,
            document={
                "type": "document_url",
                "document_url": signed_url.url
            },
            include_image_base64=False
        )
        nr_of_pages = len(ocr_response.pages)
        all_markdown = self._merge_pages(ocr_response.pages)
        title = self._get_title(all_markdown)
        end_time = time.perf_counter()

        metrics = {
            'nr_of_pages': nr_of_pages,
            'time_elapsed': end_time - start_time,
            'interaction_type': 'OCR',
        }
        current_event.log_llm_metrics(metrics)

        return all_markdown, title