- Introduction of the Automatic HTML Processor
- Translation Service improvement - Enable activation / deactivation of Processors - Renew API-keys for Mistral (leading to workspaces) - Align all Document views to use a session catalog - Allow for different processors for the same file type
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# Import all processor implementations to ensure registration
|
||||
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor
|
||||
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor, automagic_html_processor
|
||||
|
||||
# List of all available processor implementations
|
||||
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor']
|
||||
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor', 'automagic_html_processor']
|
||||
65
eveai_workers/processors/automagic_html_processor.py
Normal file
65
eveai_workers/processors/automagic_html_processor.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import io
|
||||
import pdfplumber
|
||||
from flask import current_app
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
import re
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
|
||||
from common.extensions import minio_client
|
||||
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
|
||||
from .base_processor import BaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
from .processor_registry import ProcessorRegistry
|
||||
|
||||
|
||||
class AutomagicHTMLProcessor(BaseProcessor):
    """Processor that converts an HTML document to markdown via an LLM chain.

    The whole HTML file is handed to the ``automagic_html_parse`` prompt
    template in a single call; the model output is used as the document's
    markdown.
    """

    def __init__(self, tenant, document_version, catalog, processor):
        """Set up chunking attributes, prompt parameters and the LLM chain.

        Args:
            tenant: Tenant owning the document (its ``id`` is used for the download).
            document_version: Version whose ``bucket_name`` / ``object_name``
                locate the HTML file.
            catalog: Catalog providing ``max_chunk_size``.
            processor: Processor record providing ``tuning`` and ``configuration``.
        """
        super().__init__(tenant, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

        # Static prompt inputs; the HTML payload is added in process().
        self.prompt_params = {
            "custom_instructions": self.processor.configuration.get("custom_instructions", ""),
        }

        template, llm = get_template("automagic_html_parse")
        parse_prompt = ChatPromptTemplate.from_template(template)
        self.chain = RunnablePassthrough() | parse_prompt | llm | StrOutputParser()

    def process(self):
        """Download the HTML file, convert it to markdown and extract a title.

        Returns:
            tuple: ``(markdown, title)`` where ``title`` is the text of the
            first level-1 markdown heading (``# ...``) or ``None`` when the
            generated markdown contains no such heading.

        Raises:
            Exception: Any error is logged at error level and re-raised unchanged.
        """
        self._log("Starting Automagic HTML processing")
        try:
            # Get HTML-file data
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            # NOTE(review): the download presumably yields raw bytes - confirm.
            # Formatting bytes into the prompt template would insert their
            # ``b'...'`` repr (with escaped newlines) instead of the HTML text,
            # so decode first. Undecodable bytes are replaced rather than
            # aborting the whole document.
            if isinstance(file_data, (bytes, bytearray)):
                html = bytes(file_data).decode("utf-8", errors="replace")
            else:
                html = file_data

            # Invoke HTML Processing Agent
            self.prompt_params["html"] = html
            with current_event.create_span("Markdown Generation"):
                markdown = self.chain.invoke(self.prompt_params)
            self._save_markdown(markdown)

            # Retrieve Title: first line that is a level-1 markdown heading.
            match = re.search(r'^# (.+)', markdown, re.MULTILINE)
            title = match.group(1).strip() if match else None

            self._log("Finished Automagic HTML Processing")
            return markdown, title
        except Exception as e:
            self._log(f"Error automagically processing HTML: {str(e)}", level='error')
            raise
|
||||
|
||||
|
||||
# Register the processor
|
||||
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)
|
||||
@@ -44,185 +44,6 @@ class PDFProcessor(BaseProcessor):
|
||||
self._log(f"Error processing PDF: {str(e)}", level='error')
|
||||
raise
|
||||
|
||||
def _extract_content(self, file_data):
    """Collect text, figures and tables for every page of a PDF.

    Args:
        file_data: Raw PDF content; it is wrapped in ``io.BytesIO`` before
            being opened with pdfplumber.

    Returns:
        list: One dict per page with the keys ``'text'``, ``'figures'`` and
        ``'tables'``, in page order.
    """
    pages_content = []
    with pdfplumber.open(io.BytesIO(file_data)) as pdf:
        # Running figure number, continued across pages.
        next_figure_number = 1
        for page_index, page in enumerate(pdf.pages):
            self._log(f"Extracting content from page {page_index + 1}")

            text = page.extract_text()
            figures = self._extract_figures(page, page_index, next_figure_number)
            tables = self._extract_tables(page)
            page_content = {'text': text, 'figures': figures, 'tables': tables}

            self.log_tuning("_extract_content", {"page_num": page_index, "page_content": page_content})

            next_figure_number += len(figures)
            pages_content.append(page_content)

    return pages_content
|
||||
|
||||
def _extract_figures(self, page, page_num, figure_counter):
    """Return the figures found on a page (currently always an empty list).

    Figure processing is omitted for now, so none of the arguments are used.
    The disabled implementation is kept below for reference.

    Args:
        page: Page object the figures would be read from.
        page_num: Zero-based page index (used for file names when enabled).
        figure_counter: Number to assign to the first figure on this page.

    Returns:
        list: A new, empty list.
    """
    # Omit figure processing for now!
    # for img in page.images:
    #     try:
    #         # Try to get the bbox, use full page dimensions if not available
    #         bbox = img.get('bbox', (0, 0, page.width, page.height))
    #
    #         figure = {
    #             'figure_number': figure_counter,
    #             'filename': f"figure_{page_num + 1}_{figure_counter}.png",
    #             'caption': self._find_figure_caption(page, bbox)
    #         }
    #
    #         # Extract the figure as an image
    #         figure_image = page.within_bbox(bbox).to_image()
    #
    #         # Save the figure using MinIO
    #         with io.BytesIO() as output:
    #             figure_image.save(output, format='PNG')
    #             output.seek(0)
    #             minio_client.upload_document_file(
    #                 self.tenant.id,
    #                 self.document_version.doc_id,
    #                 self.document_version.language,
    #                 self.document_version.id,
    #                 figure['filename'],
    #                 output.getvalue()
    #             )
    #
    #         figures.append(figure)
    #         figure_counter += 1
    #     except Exception as e:
    #         self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
    return []
|
||||
|
||||
def _find_figure_caption(self, page, bbox):
    """Look for a caption in the strip directly below a figure.

    The searched strip spans the figure's horizontal extent and reaches from
    the figure's lower edge up to 50 units further down, clipped to the page
    height.

    Args:
        page: Page object offering ``height`` and ``crop(bbox)``.
        bbox: Figure bounding box, indexed as ``(x0, top, x1, bottom)``.

    Returns:
        The extracted text when it starts with "figure" (case-insensitive),
        otherwise ``None``. Errors are logged and also yield ``None``.
    """
    try:
        x0, x1, bottom = bbox[0], bbox[2], bbox[3]
        strip_bottom = min(bottom + 50, page.height)
        strip = page.crop((x0, bottom, x1, strip_bottom))
        caption_text = strip.extract_text()
        if not caption_text:
            return None
        if caption_text.lower().startswith('figure'):
            return caption_text
    except Exception as e:
        # Best effort: a missing caption must not abort page processing.
        self._log(f"Error finding figure caption: {str(e)}", level='error')
    return None
|
||||
|
||||
def _extract_tables(self, page):
    """Render every non-empty table on a page as a markdown string.

    Args:
        page: Page object offering ``extract_tables()``.

    Returns:
        list: Markdown tables in extraction order. If an error occurs, it is
        logged and the tables collected up to that point are returned.
    """
    collected = []
    try:
        for raw_table in page.extract_tables():
            if not raw_table:
                continue
            rendered = self._table_to_markdown(raw_table)
            if not rendered:  # Only keep non-empty tables
                continue
            collected.append(rendered)
            self.log_tuning("_extract_tables", {"markdown_table": rendered})
    except Exception as e:
        self._log(f"Error extracting tables from page: {str(e)}", level='error')
    return collected
|
||||
|
||||
def _table_to_markdown(self, table):
    """Render a table (list of rows, each a list of cells) as markdown.

    The first row is used as the header. ``None`` cells become empty
    strings, pipe characters are escaped, and line breaks inside a cell are
    replaced by spaces because a raw newline would end the markdown table
    row and corrupt the table.

    Args:
        table: Sequence of rows; each row is a sequence of cell values.

    Returns:
        str: The markdown table terminated by a newline, or ``""`` when the
        table is empty or its first row is empty.
    """
    if not table or not table[0]:  # Empty table or empty header row
        return ""

    def clean_cell(cell):
        if cell is None:
            return ""  # Convert None to empty string
        escaped = str(cell).replace("|", "\\|")  # Escape pipe characters
        # A markdown table row must stay on a single line.
        return " ".join(escaped.splitlines())

    def render_row(cells):
        return "| " + " | ".join(cells) + " |"

    header = [clean_cell(cell) for cell in table[0]]
    rows = [render_row(header), render_row(["---"] * len(header))]
    rows.extend(render_row([clean_cell(cell) for cell in row]) for row in table[1:])

    return "\n".join(rows) + "\n"
|
||||
|
||||
def _structure_content(self, extracted_content):
    """Assemble per-page extraction results into one markdown document.

    Every single-line paragraph is treated as a heading: a leading dotted
    number ("1.2 Scope") sets the level to the number of dots plus one,
    anything else becomes a level-1 heading. Multi-line paragraphs are
    copied as-is. Figures and tables of a page follow the page's text.

    The title is the first non-empty line of the first page; if that page
    has none, the first level-1 heading; otherwise "Untitled Document".

    Args:
        extracted_content: Sequence of page dicts with the keys ``'text'``
            (may be missing or ``None``), ``'figures'`` and ``'tables'``
            (both optional).

    Returns:
        tuple: ``(structured_content, title)``.
    """
    default_title = "Untitled Document"
    title = ""
    parts = []
    heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')

    def identify_heading(text):
        match = heading_pattern.match(text.strip())
        if not match:
            return 0, text  # Not a heading (empty line)
        numbering, _, content = match.groups()
        if numbering:
            return numbering.count('.') + 1, f"{numbering}{content}"
        return 1, content  # Assume a top-level heading if there is no numbering

    for page_index, page in enumerate(extracted_content):
        # A page without extractable text is handled like an empty page.
        text = page.get('text') or ''

        # Assume the title is on the first page. Position, not dict
        # equality, decides what the first page is.
        if page_index == 0:
            title = next((line.strip() for line in text.split('\n') if line.strip()), "")

        # Process text
        for para in text.split('\n\n'):
            lines = para.strip().split('\n')
            if len(lines) == 1:  # Potential heading
                level, heading_text = identify_heading(lines[0])
                if level > 0:
                    parts.append(f"\n\n{'#' * level} {heading_text}\n\n")
                    if level == 1 and not title:
                        title = heading_text  # First top-level heading as fallback title
                else:
                    parts.append(f"{para}\n\n")  # Treat as normal paragraph
            else:
                parts.append(f"{para}\n\n")  # Multi-line paragraph

        # Process figures
        for figure in page.get('figures', []):
            parts.append(f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n")
            if figure['caption']:
                parts.append(f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n")

        # Add tables
        for table in page.get('tables', []):
            parts.append(f"\n{table}\n")

    structured_content = "".join(parts)

    if self.tuning:
        self._save_intermediate(structured_content, "structured_content.md")

    return structured_content, title or default_title
|
||||
|
||||
def _split_content_for_llm(self, content):
    """Split text into chunks sized for the LLM.

    Uses ``self.chunk_size`` and ``self.chunk_overlap`` with character
    length as the size measure, preferring to break on blank lines, then
    line breaks, then spaces.

    Args:
        content: Text to split.

    Returns:
        The chunks produced by the splitter's ``split_text``.
    """
    splitter_options = {
        "chunk_size": self.chunk_size,
        "chunk_overlap": self.chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(content)
|
||||
|
||||
def _process_chunks_with_llm(self, chunks):
    """Run every chunk through the ``pdf_parse`` prompt chain.

    Each chunk is sent on its own as ``pdf_content``; the model output is
    passed through ``self._clean_markdown`` before the results are joined.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        str: The cleaned results joined by blank lines.
    """
    template, llm = get_template('pdf_parse')
    prompt = ChatPromptTemplate.from_template(template)
    chain = RunnablePassthrough() | prompt | llm | StrOutputParser()

    cleaned_results = [
        self._clean_markdown(chain.invoke({"pdf_content": chunk}))
        for chunk in chunks
    ]
    return "\n\n".join(cleaned_results)
|
||||
|
||||
|
||||
# Register the processor
|
||||
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
|
||||
|
||||
@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
import traceback
|
||||
|
||||
from common.extensions import db, cache_manager
|
||||
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
|
||||
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
|
||||
from config.type_defs.processor_types import PROCESSOR_TYPES
|
||||
from eveai_workers.processors.processor_registry import ProcessorRegistry
|
||||
|
||||
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
|
||||
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
|
||||
EveAINoProcessorFound
|
||||
|
||||
from common.utils.config_field_types import json_to_pattern_list
|
||||
|
||||
@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
catalog = Catalog.query.get_or_404(catalog_id)
|
||||
|
||||
# Define processor related information
|
||||
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
|
||||
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
|
||||
processor_class = ProcessorRegistry.get_processor_class(processor.type)
|
||||
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Create Embeddings request received '
|
||||
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
delete_embeddings_for_document_version(document_version)
|
||||
|
||||
try:
|
||||
with current_event.create_span(f"{processor_type} Processing"):
|
||||
with current_event.create_span(f"{processor.type} Processing"):
|
||||
document_processor = processor_class(
|
||||
tenant=tenant,
|
||||
document_version=document_version,
|
||||
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
'markdown': markdown,
|
||||
'title': title
|
||||
})
|
||||
if not markdown or markdown.strip() == '':
|
||||
raise EveAINoContentFound(document_version.doc_id, document_version.id)
|
||||
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
|
||||
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
current_event.log("Finished Embedding Creation Task")
|
||||
|
||||
except Exception as e:
|
||||
stacktrace = traceback.format_exc()
|
||||
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
|
||||
f'on document version {document_version_id} '
|
||||
f'error: {e}')
|
||||
f'on document version {document_version_id} '
|
||||
f'error: {e}\n'
|
||||
f'Stacktrace: {stacktrace}')
|
||||
document_version.processing = False
|
||||
document_version.processing_finished_at = dt.now(tz.utc)
|
||||
document_version.processing_error = str(e)[:255]
|
||||
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
|
||||
ValueError: If no matching processor is found
|
||||
"""
|
||||
try:
|
||||
current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
|
||||
# Start with base query for catalog
|
||||
query = Processor.query.filter_by(catalog_id=catalog_id)
|
||||
|
||||
# Find processor type that handles this file type
|
||||
matching_processor_type = None
|
||||
for proc_type, config in PROCESSOR_TYPES.items():
|
||||
supported_types = config['file_types']
|
||||
if isinstance(supported_types, str):
|
||||
supported_types = [t.strip() for t in supported_types.split(',')]
|
||||
|
||||
if file_type in supported_types:
|
||||
matching_processor_type = proc_type
|
||||
break
|
||||
|
||||
if not matching_processor_type:
|
||||
raise ValueError(f"No processor type found for file type: {file_type}")
|
||||
|
||||
# Add processor type condition
|
||||
query = query.filter_by(type=matching_processor_type)
|
||||
query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)
|
||||
|
||||
# If sub_file_type is provided, add that condition
|
||||
if sub_file_type:
|
||||
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
|
||||
# If no sub_file_type, prefer processors without sub_file_type specification
|
||||
query = query.filter(or_(Processor.sub_file_type.is_(None),
|
||||
Processor.sub_file_type == ''))
|
||||
|
||||
available_processors = query.all()
|
||||
|
||||
# Get the first matching processor
|
||||
processor = query.first()
|
||||
if not available_processors:
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
available_processor_types = [processor.type for processor in available_processors]
|
||||
current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
|
||||
|
||||
# Find processor type that handles this file type
|
||||
matching_processor_type = None
|
||||
for proc_type, config in PROCESSOR_TYPES.items():
|
||||
# Only process if this processor type is available in the database
|
||||
if proc_type in available_processor_types:
|
||||
supported_types = config['file_types']
|
||||
if isinstance(supported_types, str):
|
||||
supported_types = [t.strip() for t in supported_types.split(',')]
|
||||
current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
|
||||
|
||||
if file_type in supported_types:
|
||||
matching_processor_type = proc_type
|
||||
break
|
||||
|
||||
current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
|
||||
if not matching_processor_type:
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
else:
|
||||
current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
|
||||
|
||||
processor = None
|
||||
for proc in available_processors:
|
||||
if proc.type == matching_processor_type:
|
||||
processor = proc
|
||||
break
|
||||
|
||||
if not processor:
|
||||
if sub_file_type:
|
||||
raise ValueError(
|
||||
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
|
||||
f"file type {file_type}, sub-type {sub_file_type}"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"No processor found for catalog {catalog_id}, "
|
||||
f"file type {file_type}"
|
||||
)
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
|
||||
current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
|
||||
return processor
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user