import io
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough

from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry


class AutomagicHTMLProcessor(BaseProcessor):
    """Turn a stored HTML document into markdown through an LLM prompt chain.

    The chain is built once per instance from the ``automagic_html_parse``
    template and is invoked by :meth:`process` with the document's HTML and
    the processor's optional ``custom_instructions``.
    """

    def __init__(self, tenant, document_version, catalog, processor):
        """Configure chunking attributes, prompt variables and the LLM chain.

        Args:
            tenant: Tenant owning the document; ``tenant.id`` is used when
                downloading the file.
            document_version: Supplies ``bucket_name`` and ``object_name`` of
                the stored HTML file.
            catalog: Supplies ``max_chunk_size``.
            processor: Processor record; its ``tuning`` flag and
                ``configuration`` mapping are read here.
        """
        super().__init__(tenant, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

        # Variables handed to the prompt template; "html" is added per run
        # in process().
        self.prompt_params = {
            "custom_instructions": self.processor.configuration.get("custom_instructions", ""),
        }

        template, llm = get_template("automagic_html_parse")
        translation_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        self.chain = (setup | translation_prompt | llm | output_parser)

    def process(self):
        """Download the HTML file, convert it to markdown and extract a title.

        Returns:
            tuple: ``(markdown, title)`` where ``title`` is the stripped text
            of the first line starting with ``"# "`` in the generated
            markdown, or ``None`` when no such line exists.

        Raises:
            Exception: Any error raised while downloading, invoking the chain
                or saving the markdown is logged at error level and re-raised
                unchanged.
        """
        self._log("Starting Automagic HTML processing")
        try:
            # Get HTML-file data
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            # NOTE(review): the download helper presumably returns raw bytes
            # from object storage - confirm. Interpolating bytes into the
            # prompt would yield their repr (b'...') rather than the HTML
            # text, so decode first. errors="replace" keeps non-UTF-8
            # documents from failing here; str payloads pass through as-is.
            if isinstance(file_data, (bytes, bytearray)):
                file_data = file_data.decode("utf-8", errors="replace")

            # Invoke HTML Processing Agent
            self.prompt_params["html"] = file_data
            with current_event.create_span("Markdown Generation"):
                markdown = self.chain.invoke(self.prompt_params)
                self._save_markdown(markdown)

            # Retrieve Title: first level-1 markdown heading, if any
            match = re.search(r'^# (.+)', markdown, re.MULTILINE)
            title = match.group(1).strip() if match else None

            self._log("Finished Automagic HTML Processing")
            return markdown, title
        except Exception as e:
            self._log(f"Error automagically processing HTML: {str(e)}", level='error')
            raise


# Register the processor
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)