- Introduction of the Automatic HTML Processor

- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
Josako
2025-06-26 14:38:40 +02:00
parent f5c9542a49
commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -3,7 +3,6 @@ from langchain.callbacks.base import BaseCallbackHandler
from typing import Dict, Any, List
from langchain.schema import LLMResult
from common.utils.business_event_context import current_event
from flask import current_app
class LLMMetricsHandler(BaseCallbackHandler):

View File

@@ -0,0 +1,47 @@
import time
from langchain.callbacks.base import BaseCallbackHandler
from typing import Dict, Any, List
from langchain.schema import LLMResult
from common.utils.business_event_context import current_event
class PersistentLLMMetricsHandler(BaseCallbackHandler):
"""Metrics handler that allows metrics to be retrieved from within any call. In case metrics are required for other
purposes than business event logging."""
def __init__(self):
self.total_tokens: int = 0
self.prompt_tokens: int = 0
self.completion_tokens: int = 0
self.start_time: float = 0
self.end_time: float = 0
self.total_time: float = 0
def reset(self):
self.total_tokens = 0
self.prompt_tokens = 0
self.completion_tokens = 0
self.start_time = 0
self.end_time = 0
self.total_time = 0
def on_llm_start(self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any) -> None:
self.start_time = time.time()
def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
self.end_time = time.time()
self.total_time = self.end_time - self.start_time
usage = (response.llm_output or {}).get('token_usage', {})  # llm_output can be None for some providers
self.prompt_tokens += usage.get('prompt_tokens', 0)
self.completion_tokens += usage.get('completion_tokens', 0)
self.total_tokens = self.prompt_tokens + self.completion_tokens
def get_metrics(self) -> Dict[str, int | float]:
return {
'total_tokens': self.total_tokens,
'prompt_tokens': self.prompt_tokens,
'completion_tokens': self.completion_tokens,
'time_elapsed': self.total_time,
'interaction_type': 'LLM',
}
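
A minimal usage sketch for the new handler, assuming any LangChain chat model that accepts callbacks (the model class and name below are illustrative, not part of this commit):

    from langchain_openai import ChatOpenAI  # illustrative model; any LangChain LLM works
    from common.langchain.persistent_llm_metrics_handler import PersistentLLMMetricsHandler

    handler = PersistentLLMMetricsHandler()
    llm = ChatOpenAI(model="gpt-4o-mini", callbacks=[handler])

    llm.invoke("Translate 'good morning' to Dutch.")
    # Metrics stay readable after the call, unlike a per-event handler:
    print(handler.get_metrics())  # {'total_tokens': ..., 'prompt_tokens': ..., 'completion_tokens': ..., 'time_elapsed': ..., 'interaction_type': 'LLM'}

    handler.reset()  # clear counters before reusing the handler for another call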

View File

@@ -34,6 +34,7 @@ class Processor(db.Model):
catalog_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
type = db.Column(db.String(50), nullable=False)
sub_file_type = db.Column(db.String(50), nullable=True)
active = db.Column(db.Boolean, nullable=True, default=True)
# Tuning enablers
tuning = db.Column(db.Boolean, nullable=True, default=False)
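
The new `active` flag enables activation/deactivation without deleting a processor. A hypothetical toggle helper, assuming the standard Flask-SQLAlchemy session (this helper is not part of the commit):

    from common.extensions import db
    from common.models.document import Processor

    def set_processor_active(processor_id: int, active: bool) -> None:
        """Hypothetical helper: activate or deactivate a single processor."""
        processor = db.session.get(Processor, processor_id)
        if processor is None:
            raise ValueError(f"Processor {processor_id} not found")
        processor.active = active
        db.session.commit()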

View File

@@ -331,8 +331,8 @@ class TranslationCache(db.Model):
context = db.Column(db.Text, nullable=True)
# Translation cost
input_tokens = db.Column(db.Integer, nullable=False)
output_tokens = db.Column(db.Integer, nullable=False)
prompt_tokens = db.Column(db.Integer, nullable=False)
completion_tokens = db.Column(db.Integer, nullable=False)
# Tracking
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
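
The rename from input/output tokens to prompt/completion tokens implies a schema migration. A hedged Alembic sketch; the table name `translation_cache` is an assumption based on the model's default naming:

    from alembic import op

    def upgrade():
        op.alter_column('translation_cache', 'input_tokens', new_column_name='prompt_tokens')
        op.alter_column('translation_cache', 'output_tokens', new_column_name='completion_tokens')

    def downgrade():
        op.alter_column('translation_cache', 'prompt_tokens', new_column_name='input_tokens')
        op.alter_column('translation_cache', 'completion_tokens', new_column_name='output_tokens')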

View File

@@ -0,0 +1,43 @@
import xxhash
import json
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.langchain.persistent_llm_metrics_handler import PersistentLLMMetricsHandler
from common.utils.model_utils import get_template, replace_variable_in_template
class TranslationService:
def __init__(self, tenant_id):
self.tenant_id = tenant_id
def translate_text(self, text_to_translate: str, target_lang: str, source_lang: str = None, context: str = None) -> tuple[str, dict[str, int | float]]:
prompt_params = {
"text_to_translate": text_to_translate,
"target_lang": target_lang,
}
if context:
template, llm = get_template("translation_with_context")
prompt_params["context"] = context
else:
template, llm = get_template("translation_without_context")
# Add a metrics handler to capture usage
metrics_handler = PersistentLLMMetricsHandler()
existing_callbacks = llm.callbacks or []  # callbacks may be None when none were configured
llm.callbacks = existing_callbacks + [metrics_handler]
translation_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
chain = (setup | translation_prompt | llm | StrOutputParser())
translation = chain.invoke(prompt_params)
metrics = metrics_handler.get_metrics()
return translation, metrics
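
A usage sketch for the service; the tenant id and texts are illustrative, and the import path is taken from the cache handler below:

    from common.services.utils.translation_services import TranslationService

    service = TranslationService(tenant_id=1)
    translation, metrics = service.translate_text(
        text_to_translate="Goedemorgen",
        target_lang="en",
        context="Greeting at the start of a support conversation",
    )
    print(translation)
    print(metrics["prompt_tokens"], metrics["completion_tokens"], metrics["time_elapsed"])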

common/utils/cache/translation_cache.py
View File

@@ -0,0 +1,156 @@
import json
from typing import Dict, Any, Optional
from datetime import datetime as dt, timezone as tz
import xxhash
from flask import current_app
from sqlalchemy import and_
from sqlalchemy.inspection import inspect
from common.utils.cache.base import CacheHandler, T
from common.extensions import db
from common.models.user import TranslationCache
from common.services.utils.translation_services import TranslationService
from flask_security import current_user
class TranslationCacheHandler(CacheHandler[TranslationCache]):
"""Handles caching of translations with fallback to database and external translation service"""
handler_name = 'translation_cache'
def __init__(self, region):
super().__init__(region, 'translation')
self.configure_keys('hash_key')
def _to_cache_data(self, instance: TranslationCache) -> Dict[str, Any]:
"""Convert TranslationCache instance to cache data using SQLAlchemy inspection"""
if not instance:
return {}
mapper = inspect(TranslationCache)
data = {}
for column in mapper.columns:
value = getattr(instance, column.name)
# Handle date serialization
if isinstance(value, dt):
data[column.name] = value.isoformat()
else:
data[column.name] = value
return data
def _from_cache_data(self, data: Dict[str, Any], **kwargs) -> Optional[TranslationCache]:
if not data:
return None
# Create a new TranslationCache instance
translation = TranslationCache()
mapper = inspect(TranslationCache)
# Set all attributes dynamically
for column in mapper.columns:
if column.name in data:
value = data[column.name]
# Handle date/datetime deserialization (values were serialized with isoformat above)
if isinstance(value, str) and column.name.endswith('_date'):
value = dt.fromisoformat(value).date()
elif isinstance(value, str) and column.name.endswith('_at'):
value = dt.fromisoformat(value)
setattr(translation, column.name, value)
return translation
def _should_cache(self, value: TranslationCache) -> bool:
"""Validate if the translation should be cached"""
return value is not None and value.cache_key is not None
def get_translation(self, text: str, target_lang: str, source_lang: str = None, context: str = None) -> Optional[TranslationCache]:
"""
Get the translation for a text in a specific language
Args:
text: The text to be translated
target_lang: The target language for the translation
source_lang: The source language of the text to be translated
context: Optional context for the translation
Returns:
TranslationCache instance if found, None otherwise
"""
def creator_func(text: str, target_lang: str, source_lang: str=None, context: str=None) -> Optional[TranslationCache]:
# Generate cache key based on inputs
cache_key = self._generate_cache_key(text, target_lang, source_lang, context)
# Check if translation already exists in database
existing_translation = db.session.query(TranslationCache).filter_by(cache_key=cache_key).first()
if existing_translation:
# Update last used timestamp
existing_translation.last_used_at = dt.now(tz=tz.utc)
db.session.commit()
return existing_translation
# Translation not found in DB, need to create it
# Initialize translation service
translation_service = TranslationService(getattr(current_app, 'tenant_id', None))
# Get the translation and metrics
translated_text, metrics = translation_service.translate_text(
text_to_translate=text,
target_lang=target_lang,
source_lang=source_lang,
context=context
)
# Create new translation cache record
new_translation = TranslationCache(
cache_key=cache_key,
source_text=text,
translated_text=translated_text,
source_language=source_lang or 'auto',
target_language=target_lang,
context=context,
prompt_tokens=metrics.get('prompt_tokens', 0),
completion_tokens=metrics.get('completion_tokens', 0),
created_at=dt.now(tz=tz.utc),
created_by=getattr(current_user, 'id', None),  # current_user is imported above; getattr guards unauthenticated contexts
updated_at=dt.now(tz=tz.utc),
updated_by=getattr(current_user, 'id', None),
last_used_at=dt.now(tz=tz.utc)
)
# Save to database
db.session.add(new_translation)
db.session.commit()
return new_translation
return self.get(creator_func, text=text, target_lang=target_lang, source_lang=source_lang, context=context)
def invalidate_tenant_translations(self, tenant_id: int):
"""Invalidate cached translations for specific tenant"""
self.invalidate(tenant_id=tenant_id)
def _generate_cache_key(self, text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
"""Generate cache key for a translation"""
cache_data = {
"text": text.strip(),
"target_lang": target_lang.lower(),
"source_lang": source_lang.lower() if source_lang else None,
"context": context.strip() if context else None,
}
cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
return xxhash.xxh64(cache_string.encode('utf-8')).hexdigest()
def register_translation_cache_handlers(cache_manager) -> None:
"""Register translation cache handlers with cache manager"""
cache_manager.register_handler(
TranslationCacheHandler,
'eveai_model' # Use existing eveai_model region
)
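
A hedged wiring sketch: registering the handler at startup, then retrieving a translation through it. Note that `cache_manager` as an export of `common.extensions` and its `get_handler` lookup are assumptions about the project's cache manager API; only `register_handler` appears in this diff.

    from common.extensions import cache_manager  # assumed export; not shown in this diff
    from common.utils.cache.translation_cache import register_translation_cache_handlers

    register_translation_cache_handlers(cache_manager)

    # Assumed lookup API: fetch the registered handler by its handler_name.
    handler = cache_manager.get_handler('translation_cache')
    # First call falls through cache and DB to TranslationService; later calls hit the cache.
    cached = handler.get_translation("Goedemorgen", target_lang="en")
    print(cached.translated_text, cached.prompt_tokens)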

View File

@@ -3,7 +3,7 @@ from datetime import datetime as dt, timezone as tz
from sqlalchemy import desc
from sqlalchemy.exc import SQLAlchemyError
from werkzeug.utils import secure_filename
from common.models.document import Document, DocumentVersion, Catalog
from common.models.document import Document, DocumentVersion, Catalog, Processor
from common.extensions import db, minio_client
from common.utils.celery_utils import current_celery
from flask import current_app
@@ -11,6 +11,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
import os
from config.type_defs.processor_types import PROCESSOR_TYPES
from .config_field_types import normalize_json_field
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
@@ -469,3 +470,15 @@ def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -
"Error during document lookup",
status_code=500
)
def is_file_type_supported_by_catalog(catalog_id, file_type):
processors = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True).all()
supported_file_types = []
for processor in processors:
processor_file_types = PROCESSOR_TYPES[processor.type]['file_types']
file_types = [f.strip() for f in processor_file_types.split(",")]
supported_file_types.extend(file_types)
if file_type not in supported_file_types:
raise EveAIUnsupportedFileType()
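
A small sketch of guarding an upload with the new check. The module path of the utility file is assumed (the diff does not show the filename); the exception path is inferred from the relative import above:

    from common.utils.document_utils import is_file_type_supported_by_catalog  # module path assumed
    from common.utils.eveai_exceptions import EveAIUnsupportedFileType  # path inferred from the relative import

    def validate_upload(catalog_id: int, file_type: str) -> bool:
        """Return True if some active processor on the catalog handles this file type."""
        try:
            is_file_type_supported_by_catalog(catalog_id, file_type)
            return True
        except EveAIUnsupportedFileType:
            return False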

View File

@@ -34,7 +34,25 @@ class EveAIDoubleURLException(EveAIException):
class EveAIUnsupportedFileType(EveAIException):
"""Raised when an invalid file type is provided"""
def __init__(self, message="Filetype is not supported", status_code=400, payload=None):
def __init__(self, message="Filetype is not supported by current active processors", status_code=400, payload=None):
super().__init__(message, status_code, payload)
class EveAINoProcessorFound(EveAIException):
"""Raised when no processor is found for a given file type"""
def __init__(self, catalog_id, file_type, file_subtype, status_code=400, payload=None):
self.catalog_id = catalog_id
self.file_type = file_type
self.file_subtype = file_subtype
message = f"No active processor found for catalog {catalog_id} with file type {file_type} and subtype {file_subtype}"
super().__init__(message, status_code, payload)
class EveAINoContentFound(EveAIException):
"""Raised when no content is found for a given document"""
def __init__(self, document_id, document_version_id, status_code=400, payload=None):
self.document_id = document_id
self.document_version_id = document_version_id
message = f"No content found while processing Document with ID {document_id} and version {document_version_id}."
super().__init__(message, status_code, payload)
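
A hypothetical resolver sketch showing where `EveAINoProcessorFound` would be raised, reusing the file-type matching logic from `is_file_type_supported_by_catalog` above (this function is not part of the commit):

    from common.models.document import Processor
    from config.type_defs.processor_types import PROCESSOR_TYPES
    # EveAINoProcessorFound is the exception defined in this file

    def resolve_processor(catalog_id: int, file_type: str, file_subtype: str = None) -> Processor:
        """Hypothetical: pick the first active processor whose declared file types match."""
        for processor in Processor.query.filter_by(catalog_id=catalog_id, active=True):
            supported = [f.strip() for f in PROCESSOR_TYPES[processor.type]['file_types'].split(',')]
            if file_type in supported and (file_subtype is None or processor.sub_file_type == file_subtype):
                return processor
        raise EveAINoProcessorFound(catalog_id, file_type, file_subtype)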

View File

@@ -1,21 +0,0 @@
import xxhash
import json
from common.utils.model_utils import get_template, replace_variable_in_template
def generate_cache_key(text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
cache_data = {
"text": text.strip(),
"target_lang": target_lang.lower(),
"source_lang": source_lang.lower() if source_lang else None,
"context": context.strip() if context else ""
}
cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
return xxhash.xxh64(cache_string.encode('utf-8')).hexdigest()
def translate_text(text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
if context:
prompt_text = get_template("translation_with_context")
prompt_text = replace_variable_in_template(prompt_text, "context", context)