- Introduction of the Automatic HTML Processor
- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
This commit is contained in:
@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
import traceback
|
||||
|
||||
from common.extensions import db, cache_manager
|
||||
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
|
||||
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
|
||||
from config.type_defs.processor_types import PROCESSOR_TYPES
|
||||
from eveai_workers.processors.processor_registry import ProcessorRegistry
|
||||
|
||||
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
|
||||
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
|
||||
EveAINoProcessorFound
|
||||
|
||||
from common.utils.config_field_types import json_to_pattern_list
|
||||
|
||||
@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
catalog = Catalog.query.get_or_404(catalog_id)
|
||||
|
||||
# Define processor related information
|
||||
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
|
||||
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
|
||||
processor_class = ProcessorRegistry.get_processor_class(processor.type)
|
||||
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Create Embeddings request received '
|
||||
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
delete_embeddings_for_document_version(document_version)
|
||||
|
||||
try:
|
||||
with current_event.create_span(f"{processor_type} Processing"):
|
||||
with current_event.create_span(f"{processor.type} Processing"):
|
||||
document_processor = processor_class(
|
||||
tenant=tenant,
|
||||
document_version=document_version,
|
||||
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
'markdown': markdown,
|
||||
'title': title
|
||||
})
|
||||
if not markdown or markdown.strip() == '':
|
||||
raise EveAINoContentFound(document_version.doc_id, document_version.id)
|
||||
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
|
||||
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
current_event.log("Finished Embedding Creation Task")
|
||||
|
||||
except Exception as e:
|
||||
stacktrace = traceback.format_exc()
|
||||
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
|
||||
f'on document version {document_version_id} '
|
||||
f'error: {e}')
|
||||
f'on document version {document_version_id} '
|
||||
f'error: {e}\n'
|
||||
f'Stacktrace: {stacktrace}')
|
||||
document_version.processing = False
|
||||
document_version.processing_finished_at = dt.now(tz.utc)
|
||||
document_version.processing_error = str(e)[:255]
|
||||
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
|
||||
ValueError: If no matching processor is found
|
||||
"""
|
||||
try:
|
||||
current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
|
||||
# Start with base query for catalog
|
||||
query = Processor.query.filter_by(catalog_id=catalog_id)
|
||||
|
||||
# Find processor type that handles this file type
|
||||
matching_processor_type = None
|
||||
for proc_type, config in PROCESSOR_TYPES.items():
|
||||
supported_types = config['file_types']
|
||||
if isinstance(supported_types, str):
|
||||
supported_types = [t.strip() for t in supported_types.split(',')]
|
||||
|
||||
if file_type in supported_types:
|
||||
matching_processor_type = proc_type
|
||||
break
|
||||
|
||||
if not matching_processor_type:
|
||||
raise ValueError(f"No processor type found for file type: {file_type}")
|
||||
|
||||
# Add processor type condition
|
||||
query = query.filter_by(type=matching_processor_type)
|
||||
query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)
|
||||
|
||||
# If sub_file_type is provided, add that condition
|
||||
if sub_file_type:
|
||||
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
|
||||
# If no sub_file_type, prefer processors without sub_file_type specification
|
||||
query = query.filter(or_(Processor.sub_file_type.is_(None),
|
||||
Processor.sub_file_type == ''))
|
||||
|
||||
available_processors = query.all()
|
||||
|
||||
# Get the first matching processor
|
||||
processor = query.first()
|
||||
if not available_processors:
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
available_processor_types = [processor.type for processor in available_processors]
|
||||
current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")
|
||||
|
||||
# Find processor type that handles this file type
|
||||
matching_processor_type = None
|
||||
for proc_type, config in PROCESSOR_TYPES.items():
|
||||
# Only process this processor type if it is available in the database
|
||||
if proc_type in available_processor_types:
|
||||
supported_types = config['file_types']
|
||||
if isinstance(supported_types, str):
|
||||
supported_types = [t.strip() for t in supported_types.split(',')]
|
||||
current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")
|
||||
|
||||
if file_type in supported_types:
|
||||
matching_processor_type = proc_type
|
||||
break
|
||||
|
||||
current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
|
||||
if not matching_processor_type:
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
else:
|
||||
current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")
|
||||
|
||||
processor = None
|
||||
for proc in available_processors:
|
||||
if proc.type == matching_processor_type:
|
||||
processor = proc
|
||||
break
|
||||
|
||||
if not processor:
|
||||
if sub_file_type:
|
||||
raise ValueError(
|
||||
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
|
||||
f"file type {file_type}, sub-type {sub_file_type}"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"No processor found for catalog {catalog_id}, "
|
||||
f"file type {file_type}"
|
||||
)
|
||||
raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
|
||||
|
||||
current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
|
||||
return processor
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user