49 lines
1.9 KiB
Python
49 lines
1.9 KiB
Python
import io
|
|
from flask import current_app
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_core.output_parsers import StrOutputParser
|
|
from langchain_core.prompts import ChatPromptTemplate
|
|
import re
|
|
from langchain_core.runnables import RunnablePassthrough
|
|
|
|
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
|
|
from common.extensions import minio_client
|
|
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
|
|
from .base_processor import BaseProcessor
|
|
from common.utils.business_event_context import current_event
|
|
from .processor_registry import ProcessorRegistry
|
|
|
|
|
|
class PDFProcessor(BaseProcessor):
    """Convert a stored PDF document version to markdown via the tracked Mistral OCR client."""

    def __init__(self, tenant, document_version, catalog, processor):
        """Initialise the base processor, chunking settings and the OCR client.

        Args:
            tenant: Forwarded to ``BaseProcessor``; ``process`` later reads ``self.tenant.id``.
            document_version: Forwarded to ``BaseProcessor``; ``process`` later reads its
                ``bucket_name`` and ``object_name``.
            catalog: Forwarded to ``BaseProcessor``; its ``max_chunk_size`` becomes
                ``self.chunk_size``.
            processor: Forwarded to ``BaseProcessor``; ``self.processor.tuning`` is read
                right after the base initialiser runs.
        """
        super().__init__(tenant, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        # NOTE(review): relies on BaseProcessor.__init__ having set self.processor — confirm.
        self.tuning = self.processor.tuning
        self.ocr_client = TrackedMistralOcrClient()

    def process(self):
        """Download the PDF, run OCR on it, save the markdown and return the result.

        Returns:
            The ``(markdown, title)`` pair produced by ``self.ocr_client.process_pdf``.

        Raises:
            Exception: Any error raised while downloading, running OCR or saving is
                logged at ``error`` level and then re-raised unchanged.
        """
        self._log("Starting PDF processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            # Build the name in two steps instead of nesting double quotes inside a
            # double-quoted f-string: quote reuse only parses on Python 3.12+ (PEP 701).
            flattened_object_name = self.document_version.object_name.replace("/", "_")
            file_name = f"{self.document_version.bucket_name}_{flattened_object_name}"

            with current_event.create_span("Markdown Generation"):
                markdown, title = self.ocr_client.process_pdf(file_name, file_data)

            self._save_markdown(markdown)
            self._log("Finished processing PDF")
            return markdown, title
        except Exception as e:
            # Log for diagnostics, then let the caller decide how to handle the failure.
            self._log(f"Error processing PDF: {e}", level='error')
            raise
|
|
|
|
|
|
# Register the processor
|
|
# Runs as a module-level statement, i.e. when this module is imported; the class is
# registered under the string key "PDF_PROCESSOR".
# NOTE(review): presumably the registry uses this key to look the class up later — confirm.
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
|