- Introduction of the Automatic HTML Processor

- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API-keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
This commit is contained in:
Josako
2025-06-26 14:38:40 +02:00
parent f5c9542a49
commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
import traceback
from common.extensions import db, cache_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
EveAINoProcessorFound
from common.utils.config_field_types import json_to_pattern_list
@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
catalog = Catalog.query.get_or_404(catalog_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
processor_class = ProcessorRegistry.get_processor_class(processor.type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
with current_event.create_span(f"{processor_type} Processing"):
with current_event.create_span(f"{processor.type} Processing"):
document_processor = processor_class(
tenant=tenant,
document_version=document_version,
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
'markdown': markdown,
'title': title
})
if not markdown or markdown.strip() == '':
raise EveAINoContentFound(document_version.doc_id, document_version.id)
with current_event.create_span("Embedding"):
embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
current_event.log("Finished Embedding Creation Task")
except Exception as e:
stacktrace = traceback.format_exc()
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version_id} '
f'error: {e}')
f'on document version {document_version_id} '
f'error: {e}\n'
f'Stacktrace: {stacktrace}')
document_version.processing = False
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing_error = str(e)[:255]
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
ValueError: If no matching processor is found
"""
try:
current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
# Start with base query for catalog
query = Processor.query.filter_by(catalog_id=catalog_id)
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
matching_processor_type = proc_type
break
if not matching_processor_type:
raise ValueError(f"No processor type found for file type: {file_type}")
# Add processor type condition
query = query.filter_by(type=matching_processor_type)
query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)
# If sub_file_type is provided, add that condition
if sub_file_type:
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
# If no sub_file_type, prefer processors without sub_file_type specification
query = query.filter(or_(Processor.sub_file_type.is_(None),
Processor.sub_file_type == ''))
available_processors = query.all()
# Get the first matching processor
processor = query.first()
if not available_processors:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
available_processor_types = [processor.type for processor in available_processors]
current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
# Only consider this processor type if it is available in the database
if proc_type in available_processor_types:
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
if file_type in supported_types:
matching_processor_type = proc_type
break
current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
if not matching_processor_type:
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
else:
current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
processor = None
for proc in available_processors:
if proc.type == matching_processor_type:
processor = proc
break
if not processor:
if sub_file_type:
raise ValueError(
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
f"file type {file_type}, sub-type {sub_file_type}"
)
else:
raise ValueError(
f"No processor found for catalog {catalog_id}, "
f"file type {file_type}"
)
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
return processor
except Exception as e: