- Introduction of the Automatic HTML Processor
- Translation Service improvement - Enable activation / deactivation of Processors - Renew API-keys for Mistral (leading to workspaces) - Align all Document views to use a session catalog - Allow for different processors for the same file type
This commit is contained in:
65
eveai_workers/processors/automagic_html_processor.py
Normal file
65
eveai_workers/processors/automagic_html_processor.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import io
|
||||
import pdfplumber
|
||||
from flask import current_app
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
import re
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
|
||||
from common.extensions import minio_client
|
||||
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
|
||||
from .base_processor import BaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
from .processor_registry import ProcessorRegistry
|
||||
|
||||
|
||||
class AutomagicHTMLProcessor(BaseProcessor):
    """Turn a stored HTML document into Markdown by prompting an LLM.

    The HTML file of the document version is fetched through ``minio_client``,
    run through the ``automagic_html_parse`` prompt template, and the resulting
    Markdown is handed to ``_save_markdown``. The title is taken from the first
    line of the Markdown that starts with ``"# "``.
    """

    # First line that starts with "# "; everything after the marker is the title.
    _TITLE_PATTERN = re.compile(r'^# (.+)', re.MULTILINE)

    def __init__(self, tenant, document_version, catalog, processor):
        """Store chunking settings and build the prompt -> LLM -> string chain.

        Args:
            tenant: Tenant object; ``tenant.id`` is used when downloading the file.
            document_version: Supplies ``bucket_name`` and ``object_name`` of the file.
            catalog: Supplies ``max_chunk_size``.
            processor: Supplies ``tuning`` and a ``configuration`` mapping from
                which the optional ``custom_instructions`` entry is read.
        """
        super().__init__(tenant, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

        # A missing (None) configuration is treated like an empty one so that
        # a processor without custom settings can still be constructed.
        configuration = self.processor.configuration or {}
        self.prompt_params = {
            "custom_instructions": configuration.get("custom_instructions", ""),
        }
        template, llm = get_template("automagic_html_parse")

        parse_prompt = ChatPromptTemplate.from_template(template)
        self.chain = RunnablePassthrough() | parse_prompt | llm | StrOutputParser()

    def process(self):
        """Download the HTML file, convert it to Markdown and extract a title.

        Returns:
            tuple: ``(markdown, title)`` where ``title`` is the text of the first
            line starting with ``"# "``, or ``None`` when there is no such line.

        Raises:
            Exception: Any error raised while downloading, invoking the chain or
                saving is logged at error level and re-raised unchanged.
        """
        self._log("Starting Automagic HTML processing")
        try:
            # Get HTML-file data
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            # The prompt template renders its variables as text. Raw bytes would
            # end up in the prompt as a "b'...'" repr with escaped characters,
            # so decode them first; already-decoded text is passed through.
            if isinstance(file_data, (bytes, bytearray)):
                html = file_data.decode("utf-8", errors="replace")
            else:
                html = file_data

            # Invoke HTML Processing Agent
            self.prompt_params["html"] = html
            with current_event.create_span("Markdown Generation"):
                markdown = self.chain.invoke(self.prompt_params)
                self._save_markdown(markdown)

            # Retrieve Title
            heading = self._TITLE_PATTERN.search(markdown)
            title = heading.group(1).strip() if heading else None

            self._log("Finished Automagic HTML Processing")
            return markdown, title
        except Exception as e:
            self._log(f"Error automagically processing HTML: {e}", level='error')
            raise
|
||||
|
||||
|
||||
# Register the processor
|
||||
# Module-level call, so it runs when this module is imported: registers
# AutomagicHTMLProcessor with ProcessorRegistry under the string key
# "AUTOMAGIC_HTML_PROCESSOR".
# NOTE(review): presumably this key is the processor type used for lookup — confirm in ProcessorRegistry.
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)
|
||||
Reference in New Issue
Block a user