- Ensure very long chunks get split into smaller chunks.
- Ensure TrackedMistralAIEmbedding is batched when needed so that it executes correctly.
- Upgrade some of the packages to a higher version.
54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
import re
|
|
import time
|
|
|
|
from flask import current_app
|
|
from mistralai import Mistral
|
|
|
|
from common.utils.business_event_context import current_event
|
|
|
|
|
|
class TrackedMistralOcrClient:
    """Client for the Mistral OCR API that reports usage metrics.

    Each call to :meth:`process_pdf` uploads a PDF, runs OCR on it and logs the
    page count and elapsed time through ``current_event.log_llm_metrics``.
    """

    # A level-1 markdown heading ("# Title") at the start of a line.
    _TITLE_PATTERN = re.compile(r'^# (.+)', re.MULTILINE)

    def __init__(self):
        """Build the Mistral client from ``MISTRAL_API_KEY`` in the Flask app config.

        Must be called where ``current_app`` is usable, since the API key is
        read from ``current_app.config``.
        """
        api_key = current_app.config['MISTRAL_API_KEY']
        self.client = Mistral(
            api_key=api_key,
        )
        self.model = "mistral-ocr-latest"

    def _get_title(self, markdown):
        """Return the first level-1 heading in *markdown*, or ``None``.

        Args:
            markdown: Markdown text to scan.

        Returns:
            The heading text with surrounding whitespace stripped, or ``None``
            when no line starts with ``"# "``.
        """
        match = self._TITLE_PATTERN.search(markdown)
        return match.group(1).strip() if match else None

    def _get_title_from_pages(self, page_markdowns):
        """Return the first level-1 heading found in any page, or ``None``.

        Pages are scanned one at a time, in order, so that a heading on the
        first line of a later page is still recognised and a heading on the
        last line of a page does not absorb text from the following page.

        Args:
            page_markdowns: Iterable of markdown strings, one per page.
        """
        for markdown in page_markdowns:
            title = self._get_title(markdown)
            if title is not None:
                return title
        return None

    def process_pdf(self, file_name, file_content):
        """Run OCR on a PDF and return its markdown and detected title.

        The file is uploaded with purpose ``"ocr"``, a signed URL is requested
        for it, and that URL is passed to the OCR endpoint. Page count, elapsed
        time and the interaction type ``'OCR'`` are logged via
        ``current_event.log_llm_metrics``.

        Args:
            file_name: Name under which the file is uploaded.
            file_content: Content of the PDF, passed to the upload call as-is
                (presumably bytes or a binary file object; confirm with caller).

        Returns:
            A ``(all_markdown, title)`` tuple: the markdown of all pages joined
            by a single space, and the first level-1 heading found in any page
            (``None`` when there is none).
        """
        # Monotonic clock: the measured duration is not affected by wall-clock
        # adjustments made while the request is in flight.
        start_time = time.perf_counter()

        # NOTE(review): the uploaded file is not deleted afterwards; confirm
        # whether cleanup happens elsewhere or should be added here.
        uploaded_pdf = self.client.files.upload(
            file={
                "file_name": file_name,
                "content": file_content,
            },
            purpose="ocr",
        )
        signed_url = self.client.files.get_signed_url(file_id=uploaded_pdf.id)
        ocr_response = self.client.ocr.process(
            model=self.model,
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        page_markdowns = [page.markdown for page in ocr_response.pages]
        all_markdown = " ".join(page_markdowns)
        # Search per page rather than in the joined text: the space separator
        # would hide a heading at the very start of any page after the first.
        title = self._get_title_from_pages(page_markdowns)

        metrics = {
            'nr_of_pages': len(page_markdowns),
            'time_elapsed': time.perf_counter() - start_time,
            'interaction_type': 'OCR',
        }
        current_event.log_llm_metrics(metrics)

        return all_markdown, title
|