- Introduce the Automagic HTML Processor

- Improve the Translation Service
- Enable activation/deactivation of Processors
- Renew the Mistral API keys (moving to workspaces)
- Align all Document views to use a session catalog
- Allow different processors for the same file type (see the sketch below)
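
The activation flag and the per-file-type lookup come together in get_processor_for_document, changed further down in this commit: only processors marked active in a catalog take part in the lookup, and the first configured processor type that both has an active row and supports the file type wins. The snippet below is a rough, self-contained sketch of that selection rule rather than the code from this commit; ProcessorRow, pick_processor and the inline PROCESSOR_TYPES values are illustrative stand-ins.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ProcessorRow:
    # Hypothetical stand-in for a Processor database row.
    type: str
    sub_file_type: Optional[str]
    active: bool

# Illustrative subset of config.type_defs.processor_types.PROCESSOR_TYPES.
PROCESSOR_TYPES = {
    "HTML_PROCESSOR": {"file_types": "html, htm"},
    "AUTOMAGIC_HTML_PROCESSOR": {"file_types": "html, htm"},
    "PDF_PROCESSOR": {"file_types": "pdf"},
}

def pick_processor(rows, file_type, sub_file_type=None):
    # Only active rows take part in the lookup (activation/deactivation).
    candidates = [r for r in rows if r.active
                  and (r.sub_file_type == sub_file_type if sub_file_type else not r.sub_file_type)]
    available_types = {r.type for r in candidates}
    # First configured type that is both available and supports the file type wins.
    for proc_type, config in PROCESSOR_TYPES.items():
        if proc_type not in available_types:
            continue
        supported = [t.strip() for t in config["file_types"].split(",")]
        if file_type in supported:
            return next(r for r in candidates if r.type == proc_type)
    return None

# Two processors cover the same file type; the active flag decides which one runs.
rows = [
    ProcessorRow("HTML_PROCESSOR", None, active=False),
    ProcessorRow("AUTOMAGIC_HTML_PROCESSOR", None, active=True),
]
assert pick_processor(rows, "html").type == "AUTOMAGIC_HTML_PROCESSOR"

With the previous behaviour a file type mapped to exactly one processor type; with the active flag a catalog can now switch between, for example, HTML_PROCESSOR and AUTOMAGIC_HTML_PROCESSOR for the same html files.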
Josako committed 2025-06-26 14:38:40 +02:00
parent f5c9542a49, commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -1,5 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor, automagic_html_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor']
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor', 'automagic_html_processor']
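
As the import comment above notes, each processor module registers itself as a side effect of being imported, so adding automagic_html_processor to this import list (and to __all__) is all that is needed to make the new processor discoverable. The actual ProcessorRegistry is not part of this diff; the class below is a minimal sketch of that import-time registration pattern, invented here for illustration.

class ProcessorRegistry:
    # Minimal sketch of an import-time registry; not the project's real class.
    _processors = {}

    @classmethod
    def register(cls, processor_type, processor_class):
        # Called at the bottom of each processor module when it is imported.
        cls._processors[processor_type] = processor_class

    @classmethod
    def get_processor_class(cls, processor_type):
        # Raises KeyError if the module defining this type was never imported.
        return cls._processors[processor_type]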

View File

@@ -0,0 +1,65 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
class AutomagicHTMLProcessor(BaseProcessor):
def __init__(self, tenant, document_version, catalog, processor):
super().__init__(tenant, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
self.prompt_params = {
"custom_instructions": self.processor.configuration.get("custom_instructions", ""),
}
template, llm = get_template("automagic_html_parse")
translation_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
self.chain = (setup | translation_prompt | llm | output_parser)
def process(self):
self._log("Starting Automagic HTML processing")
try:
# Get HTML-file data
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
# Invoke HTML Processing Agent
self.prompt_params["html"] = file_data
with current_event.create_span("Markdown Generation"):
markdown = self.chain.invoke(self.prompt_params)
self._save_markdown(markdown)
# Retrieve Title
match = re.search(r'^# (.+)', markdown, re.MULTILINE)
title = match.group(1).strip() if match else None
self._log("Finished Automagic HTML Processing")
return markdown, title
except Exception as e:
self._log(f"Error automagically processing HTML: {str(e)}", level='error')
raise
# Register the processor
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)
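
The title is taken from the first Markdown H1 heading anywhere in the generated output (the re.search call in process() above). A small standalone check of that regex, using an invented sample string:

import re

sample_markdown = "# Quarterly Report\n\nBody text.\n\n## Section 1\n"
match = re.search(r'^# (.+)', sample_markdown, re.MULTILINE)
title = match.group(1).strip() if match else None
assert title == "Quarterly Report"

If the generated Markdown contains no H1 at all, the regex finds nothing and title is returned as None.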

View File

@@ -44,185 +44,6 @@ class PDFProcessor(BaseProcessor):
self._log(f"Error processing PDF: {str(e)}", level='error')
raise
def _extract_content(self, file_data):
extracted_content = []
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
figure_counter = 1
for page_num, page in enumerate(pdf.pages):
self._log(f"Extracting content from page {page_num + 1}")
page_content = {
'text': page.extract_text(),
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
return extracted_content
def _extract_figures(self, page, page_num, figure_counter):
figures = []
# Omit figure processing for now!
# for img in page.images:
# try:
# # Try to get the bbox, use full page dimensions if not available
# bbox = img.get('bbox', (0, 0, page.width, page.height))
#
# figure = {
# 'figure_number': figure_counter,
# 'filename': f"figure_{page_num + 1}_{figure_counter}.png",
# 'caption': self._find_figure_caption(page, bbox)
# }
#
# # Extract the figure as an image
# figure_image = page.within_bbox(bbox).to_image()
#
# # Save the figure using MinIO
# with io.BytesIO() as output:
# figure_image.save(output, format='PNG')
# output.seek(0)
# minio_client.upload_document_file(
# self.tenant.id,
# self.document_version.doc_id,
# self.document_version.language,
# self.document_version.id,
# figure['filename'],
# output.getvalue()
# )
#
# figures.append(figure)
# figure_counter += 1
# except Exception as e:
# self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
return figures
def _find_figure_caption(self, page, bbox):
try:
# Look for text below the figure
caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
caption_text = page.crop(caption_bbox).extract_text()
if caption_text and caption_text.lower().startswith('figure'):
return caption_text
except Exception as e:
self._log(f"Error finding figure caption: {str(e)}", level='error')
return None
def _extract_tables(self, page):
tables = []
try:
for table in page.extract_tables():
if table:
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables
def _table_to_markdown(self, table):
if not table or not table[0]: # Check if table is empty or first row is empty
return "" # Return empty string for empty tables
def clean_cell(cell):
if cell is None:
return "" # Convert None to empty string
return str(cell).replace("|", "\\|") # Escape pipe characters and convert to string
header = [clean_cell(cell) for cell in table[0]]
markdown = "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table[1:]:
cleaned_row = [clean_cell(cell) for cell in row]
markdown += "| " + " | ".join(cleaned_row) + " |\n"
return markdown
def _structure_content(self, extracted_content):
structured_content = ""
title = "Untitled Document"
current_heading_level = 0
heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')
def identify_heading(text):
match = heading_pattern.match(text.strip())
if match:
numbering, _, content = match.groups()
if numbering:
level = numbering.count('.') + 1
return level, f"{numbering}{content}"
else:
return 1, content # Assume it's a top-level heading if no numbering
return 0, text # Not a heading
for page in extracted_content:
# Assume the title is on the first page
if page == extracted_content[0]:
lines = page.get('text', '').split('\n')
if lines:
title = lines[0].strip() # Use the first non-empty line as the title
# Process text
paragraphs = page['text'].split('\n\n')
for para in paragraphs:
lines = para.strip().split('\n')
if len(lines) == 1: # Potential heading
level, text = identify_heading(lines[0])
if level > 0:
heading_marks = '#' * level
structured_content += f"\n\n{heading_marks} {text}\n\n"
if level == 1 and not title:
title = text # Use the first top-level heading as the title if not set
else:
structured_content += f"{para}\n\n" # Treat as normal paragraph
else:
structured_content += f"{para}\n\n" # Multi-line paragraph
# Process figures
for figure in page.get('figures', []):
structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
if figure['caption']:
structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
# Add tables
if 'tables' in page:
for table in page['tables']:
structured_content += f"\n{table}\n"
if self.tuning:
self._save_intermediate(structured_content, "structured_content.md")
return structured_content, title
def _split_content_for_llm(self, content):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
return text_splitter.split_text(content)
def _process_chunks_with_llm(self, chunks):
template, llm = get_template('pdf_parse')
pdf_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | pdf_prompt | llm | output_parser
markdown_chunks = []
for chunk in chunks:
input = {"pdf_content": chunk}
result = chain.invoke(input)
result = self._clean_markdown(result)
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)

View File

@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
import traceback
from common.extensions import db, cache_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
EveAINoProcessorFound
from common.utils.config_field_types import json_to_pattern_list
@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
catalog = Catalog.query.get_or_404(catalog_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
processor_class = ProcessorRegistry.get_processor_class(processor.type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
with current_event.create_span(f"{processor_type} Processing"):
with current_event.create_span(f"{processor.type} Processing"):
document_processor = processor_class(
tenant=tenant,
document_version=document_version,
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
'markdown': markdown,
'title': title
})
if not markdown or markdown.strip() == '':
raise EveAINoContentFound(document_version.doc_id, document_version.id)
with current_event.create_span("Embedding"):
embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
current_event.log("Finished Embedding Creation Task")
except Exception as e:
stacktrace = traceback.format_exc()
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version_id} '
f'error: {e}')
f'on document version {document_version_id} '
f'error: {e}\n'
f'Stacktrace: {stacktrace}')
document_version.processing = False
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing_error = str(e)[:255]
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
ValueError: If no matching processor is found
"""
try:
current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
# Start with base query for catalog
query = Processor.query.filter_by(catalog_id=catalog_id)
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
matching_processor_type = proc_type
break
if not matching_processor_type:
raise ValueError(f"No processor type found for file type: {file_type}")
# Add processor type condition
query = query.filter_by(type=matching_processor_type)
query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)
# If sub_file_type is provided, add that condition
if sub_file_type:
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
# If no sub_file_type, prefer processors without sub_file_type specification
query = query.filter(or_(Processor.sub_file_type.is_(None),
Processor.sub_file_type == ''))
available_processors = query.all()
# Get the first matching processor
processor = query.first()
if not available_processors:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
available_processor_types = [processor.type for processor in available_processors]
current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
# Only consider this processor type if it is available in the database
if proc_type in available_processor_types:
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
if file_type in supported_types:
matching_processor_type = proc_type
break
current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
if not matching_processor_type:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
else:
current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
processor = None
for proc in available_processors:
if proc.type == matching_processor_type:
processor = proc
break
if not processor:
if sub_file_type:
raise ValueError(
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
f"file type {file_type}, sub-type {sub_file_type}"
)
else:
raise ValueError(
f"No processor found for catalog {catalog_id}, "
f"file type {file_type}"
)
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
return processor
except Exception as e: