diff --git a/common/langchain/llm_metrics_handler.py b/common/langchain/llm_metrics_handler.py
index 8ccfe73..e9073b8 100644
--- a/common/langchain/llm_metrics_handler.py
+++ b/common/langchain/llm_metrics_handler.py
@@ -3,7 +3,6 @@ from langchain.callbacks.base import BaseCallbackHandler
 from typing import Dict, Any, List
 from langchain.schema import LLMResult
 from common.utils.business_event_context import current_event
-from flask import current_app
 
 
 class LLMMetricsHandler(BaseCallbackHandler):
diff --git a/common/langchain/persistent_llm_metrics_handler.py b/common/langchain/persistent_llm_metrics_handler.py
new file mode 100644
index 0000000..ee11406
--- /dev/null
+++ b/common/langchain/persistent_llm_metrics_handler.py
@@ -0,0 +1,47 @@
+import time
+from langchain.callbacks.base import BaseCallbackHandler
+from typing import Dict, Any, List
+from langchain.schema import LLMResult
+from common.utils.business_event_context import current_event
+
+
+class PersistentLLMMetricsHandler(BaseCallbackHandler):
+    """Metrics handler that allows metrics to be retrieved from within any call. In case metrics are required for other
+    purposes than business event logging."""
+
+    def __init__(self):
+        self.total_tokens: int = 0
+        self.prompt_tokens: int = 0
+        self.completion_tokens: int = 0
+        self.start_time: float = 0
+        self.end_time: float = 0
+        self.total_time: float = 0
+
+    def reset(self):
+        self.total_tokens = 0
+        self.prompt_tokens = 0
+        self.completion_tokens = 0
+        self.start_time = 0
+        self.end_time = 0
+        self.total_time = 0
+
+    def on_llm_start(self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any) -> None:
+        self.start_time = time.time()
+
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        self.end_time = time.time()
+        self.total_time = self.end_time - self.start_time
+
+        usage = response.llm_output.get('token_usage', {})
+        self.prompt_tokens += usage.get('prompt_tokens', 0)
+        self.completion_tokens += usage.get('completion_tokens', 0)
+        self.total_tokens = self.prompt_tokens + self.completion_tokens
+
+    def get_metrics(self) -> Dict[str, int | float]:
+        return {
+            'total_tokens': self.total_tokens,
+            'prompt_tokens': self.prompt_tokens,
+            'completion_tokens': self.completion_tokens,
+            'time_elapsed': self.total_time,
+            'interaction_type': 'LLM',
+        }
diff --git a/common/models/document.py b/common/models/document.py
index 81fa2e0..9b7a2d8 100644
--- a/common/models/document.py
+++ b/common/models/document.py
@@ -34,6 +34,7 @@ class Processor(db.Model):
     catalog_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
     type = db.Column(db.String(50), nullable=False)
     sub_file_type = db.Column(db.String(50), nullable=True)
+    active = db.Column(db.Boolean, nullable=True, default=True)
 
     # Tuning enablers
     tuning = db.Column(db.Boolean, nullable=True, default=False)
diff --git a/common/models/user.py b/common/models/user.py
index ca1a9ab..a29ff5d 100644
--- a/common/models/user.py
+++ b/common/models/user.py
@@ -331,8 +331,8 @@ class TranslationCache(db.Model):
     context = db.Column(db.Text, nullable=True)
 
     # Translation cost
-    input_tokens = db.Column(db.Integer, nullable=False)
-    output_tokens = db.Column(db.Integer, nullable=False)
+    prompt_tokens = db.Column(db.Integer, nullable=False)
+    completion_tokens = db.Column(db.Integer, nullable=False)
 
     # Tracking
     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
diff --git a/common/services/utils/translation_services.py b/common/services/utils/translation_services.py
new file mode 100644
index 0000000..d35fae0
--- /dev/null
+++ b/common/services/utils/translation_services.py
@@ -0,0 +1,43 @@
+import xxhash
+import json
+
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+from common.langchain.persistent_llm_metrics_handler import PersistentLLMMetricsHandler
+from common.utils.model_utils import get_template, replace_variable_in_template
+
+class TranslationService:
+    def __init__(self, tenant_id):
+        self.tenant_id = tenant_id
+
+    def translate_text(self, text_to_translate: str, target_lang: str, source_lang: str = None, context: str = None) -> tuple[
+        str, dict[str, int | float]]:
+        prompt_params = {
+            "text_to_translate": text_to_translate,
+            "target_lang": target_lang,
+        }
+        if context:
+            template, llm = get_template("translation_with_context")
+            prompt_params["context"] = context
+        else:
+            template, llm = get_template("translation_without_context")
+
+        # Add a metrics handler to capture usage
+
+        metrics_handler = PersistentLLMMetricsHandler()
+        existing_callbacks = llm.callbacks
+        llm.callbacks = existing_callbacks + [metrics_handler]
+
+        translation_prompt = ChatPromptTemplate.from_template(template)
+
+        setup = RunnablePassthrough()
+
+        chain = (setup | translation_prompt | llm | StrOutputParser())
+
+        translation = chain.invoke(prompt_params)
+
+        metrics = metrics_handler.get_metrics()
+
+        return translation, metrics
\ No newline at end of file
diff --git a/common/utils/cache/translation_cache.py b/common/utils/cache/translation_cache.py
new file mode 100644
index 0000000..5e1e409
--- /dev/null
+++ b/common/utils/cache/translation_cache.py
@@ -0,0 +1,156 @@
+import json
+from typing import Dict, Any, Optional
+from datetime import datetime as dt, timezone as tz
+
+import xxhash
+from flask import current_app
+from sqlalchemy import and_
+from sqlalchemy.inspection import inspect
+
+from common.utils.cache.base import CacheHandler, T
+from common.extensions import db
+
+from common.models.user import TranslationCache
+from common.services.utils.translation_services import TranslationService
+from flask_security import current_user
+
+
+class TranslationCacheHandler(CacheHandler[TranslationCache]):
+    """Handles caching of translations with fallback to database and external translation service"""
+    handler_name = 'translation_cache'
+
+    def __init__(self, region):
+        super().__init__(region, 'translation')
+        self.configure_keys('hash_key')
+
+    def _to_cache_data(self, instance: TranslationCache) -> Dict[str, Any]:
+        """Convert TranslationCache instance to cache data using SQLAlchemy inspection"""
+        if not instance:
+            return {}
+
+        mapper = inspect(TranslationCache)
+        data = {}
+
+        for column in mapper.columns:
+            value = getattr(instance, column.name)
+
+            # Handle date serialization
+            if isinstance(value, dt):
+                data[column.name] = value.isoformat()
+            else:
+                data[column.name] = value
+
+        return data
+
+    def _from_cache_data(self, data: Dict[str, Any], **kwargs) -> TranslationCache:
+        if not data:
+            return None
+
+        # Create a new TranslationCache instance
+        translation = TranslationCache()
+        mapper = inspect(TranslationCache)
+
+        # Set all attributes dynamically
+        for column in mapper.columns:
+            if column.name in data:
+                value = data[column.name]
+
+                # Handle date deserialization
+                if column.name.endswith('_date') and value:
+                    if isinstance(value, str):
+                        value = dt.fromisoformat(value).date()
+
+                setattr(translation, column.name, value)
+
+        return translation
+
+    def _should_cache(self, value: TranslationCache) -> bool:
+        """Validate if the translation should be cached"""
+        return value is not None and value.cache_key is not None
+
+    def get_translation(self, text: str, target_lang: str, source_lang:str=None, context: str=None) -> Optional[TranslationCache]:
+        """
+        Get the translation for a text in a specific language
+
+        Args:
+            text: The text to be translated
+            target_lang: The target language for the translation
+            source_lang: The source language of the text to be translated
+            context: Optional context for the translation
+
+        Returns:
+            TranslationCache instance if found, None otherwise
+        """
+
+        def creator_func(text: str, target_lang: str, source_lang: str=None, context: str=None) -> Optional[TranslationCache]:
+            # Generate cache key based on inputs
+            cache_key = self._generate_cache_key(text, target_lang, source_lang, context)
+
+            # Check if translation already exists in database
+            existing_translation = db.session.query(TranslationCache).filter_by(cache_key=cache_key).first()
+
+            if existing_translation:
+                # Update last used timestamp
+                existing_translation.last_used_at = dt.now(tz=tz.utc)
+                db.session.commit()
+                return existing_translation
+
+            # Translation not found in DB, need to create it
+            # Initialize translation service
+            translation_service = TranslationService(getattr(current_app, 'tenant_id', None))
+
+            # Get the translation and metrics
+            translated_text, metrics = translation_service.translate_text(
+                text_to_translate=text,
+                target_lang=target_lang,
+                source_lang=source_lang,
+                context=context
+            )
+
+            # Create new translation cache record
+            new_translation = TranslationCache(
+                cache_key=cache_key,
+                source_text=text,
+                translated_text=translated_text,
+                source_language=source_lang or 'auto',
+                target_language=target_lang,
+                context=context,
+                prompt_tokens=metrics.get('prompt_tokens', 0),
+                completion_tokens=metrics.get('completion_tokens', 0),
+                created_at=dt.now(tz=tz.utc),
+                created_by=getattr(current_user, 'id', None) if 'current_user' in globals() else None,
+                updated_at=dt.now(tz=tz.utc),
+                updated_by=getattr(current_user, 'id', None) if 'current_user' in globals() else None,
+                last_used_at=dt.now(tz=tz.utc)
+            )
+
+            # Save to database
+            db.session.add(new_translation)
+            db.session.commit()
+
+            return new_translation
+
+        return self.get(creator_func, text=text, target_lang=target_lang, source_lang=source_lang, context=context)
+
+    def invalidate_tenant_translations(self, tenant_id: int):
+        """Invalidate cached translations for specific tenant"""
+        self.invalidate(tenant_id=tenant_id)
+
+    def _generate_cache_key(self, text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
+        """Generate cache key for a translation"""
+        cache_data = {
+            "text": text.strip(),
+            "target_lang": target_lang.lower(),
+            "source_lang": source_lang.lower() if source_lang else None,
+            "context": context.strip() if context else None,
+        }
+
+        cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
+        return xxhash.xxh64(cache_string.encode('utf-8')).hexdigest()
+
+def register_translation_cache_handlers(cache_manager) -> None:
+    """Register translation cache handlers with cache manager"""
+    cache_manager.register_handler(
+        TranslationCacheHandler,
+        'eveai_model'  # Use existing eveai_model region
+    )
diff --git a/common/utils/document_utils.py b/common/utils/document_utils.py
index a07b910..79d02b1 100644
--- a/common/utils/document_utils.py
+++ b/common/utils/document_utils.py
@@ -3,7 +3,7 @@ from datetime import datetime as dt, timezone as tz
 from sqlalchemy import desc
 from sqlalchemy.exc import SQLAlchemyError
 from werkzeug.utils import secure_filename
-from common.models.document import Document, DocumentVersion, Catalog
+from common.models.document import Document, DocumentVersion, Catalog, Processor
 from common.extensions import db, minio_client
 from common.utils.celery_utils import current_celery
 from flask import current_app
@@ -11,6 +11,7 @@ import requests
 from urllib.parse import urlparse, unquote, urlunparse, parse_qs
 import os
+from config.type_defs.processor_types import PROCESSOR_TYPES
 from .config_field_types import normalize_json_field
 from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
                                EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion,
                                EveAIException)
@@ -469,3 +470,15 @@ def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -
             "Error during document lookup",
             status_code=500
         )
+
+def is_file_type_supported_by_catalog(catalog_id, file_type):
+    processors = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True).all()
+
+    supported_file_types = []
+    for processor in processors:
+        processor_file_types = PROCESSOR_TYPES[processor.type]['file_types']
+        file_types = [f.strip() for f in processor_file_types.split(",")]
+        supported_file_types.extend(file_types)
+
+    if file_type not in supported_file_types:
+        raise EveAIUnsupportedFileType()
\ No newline at end of file
diff --git a/common/utils/eveai_exceptions.py b/common/utils/eveai_exceptions.py
index ff6793b..f12f66c 100644
--- a/common/utils/eveai_exceptions.py
+++ b/common/utils/eveai_exceptions.py
@@ -34,7 +34,25 @@ class EveAIDoubleURLException(EveAIException):
 
 
 class EveAIUnsupportedFileType(EveAIException):
     """Raised when an invalid file type is provided"""
 
-    def __init__(self, message="Filetype is not supported", status_code=400, payload=None):
+    def __init__(self, message="Filetype is not supported by current active processors", status_code=400, payload=None):
+        super().__init__(message, status_code, payload)
+
+
+class EveAINoProcessorFound(EveAIException):
+    """Raised when no processor is found for a given file type"""
+
+    def __init__(self, catalog_id, file_type, file_subtype, status_code=400, payload=None):
+        message = f"No active processor found for catalog {catalog_id} with file type {file_type} and subtype {file_subtype}"
+        super().__init__(message, status_code, payload)
+
+
+class EveAINoContentFound(EveAIException):
+    """Raised when no content is found for a given document"""
+
+    def __init__(self, document_id, document_version_id, status_code=400, payload=None):
+        self.document_id = document_id
+        self.document_version_id = document_version_id
+        message = f"No content found while processing Document with ID {document_id} and version {document_version_id}."
         super().__init__(message, status_code, payload)
diff --git a/common/utils/translation_utils.py b/common/utils/translation_utils.py
deleted file mode 100644
index 67ddafe..0000000
--- a/common/utils/translation_utils.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import xxhash
-import json
-
-from common.utils.model_utils import get_template, replace_variable_in_template
-
-
-def generate_cache_key(text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
-    cache_data = {
-        "text": text.strip(),
-        "target_lang": target_lang.lower(),
-        "source_lang": source_lang.lower() if source_lang else None,
-        "context": context.strip() if context else ""
-    }
-
-    cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
-    return xxhash.xxh64(cache_string.encode('utf-8')).hexdigest()
-
-def translate_text(text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
-    if context:
-        prompt_text = get_template("translation_with_context")
-        prompt_text = replace_variable_in_template(prompt_text, "context", context)
\ No newline at end of file
diff --git a/config/processors/globals/AUTOMAGIC_HTML_PROCESSOR/1.0.0.yaml b/config/processors/globals/AUTOMAGIC_HTML_PROCESSOR/1.0.0.yaml
new file mode 100644
index 0000000..a9c5d60
--- /dev/null
+++ b/config/processors/globals/AUTOMAGIC_HTML_PROCESSOR/1.0.0.yaml
@@ -0,0 +1,14 @@
+version: "1.0.0"
+name: "HTML Processor"
+file_types: "html"
+description: "A processor for HTML files, driven by AI"
+configuration:
+  custom_instructions:
+    name: "Custom Instructions"
+    description: "Some custom instruction to guide our AI agent in parsing your HTML file"
+    type: "text"
+    required: false
+metadata:
+  author: "Josako"
+  date_added: "2025-06-25"
+  description: "A processor for HTML files, driven by AI"
\ No newline at end of file
diff --git a/config/prompts/globals/automagic_html_parse/1.0.0.yaml b/config/prompts/globals/automagic_html_parse/1.0.0.yaml
new file mode 100644
index 0000000..68ddca7
--- /dev/null
+++ b/config/prompts/globals/automagic_html_parse/1.0.0.yaml
@@ -0,0 +1,30 @@
+version: "1.0.0"
+content: |
+  You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The
+  generated files will be used to generate embeddings in a RAG-system.
+
+  # Best practices are:
+  - Respect wordings and language(s) used in the HTML.
+  - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
+  - Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
+  - Be careful of encoding of the text. Everything needs to be human readable.
+
+  You only return relevant information, and filter out non-relevant information, such as:
+  - information found in menu bars, sidebars, footers or headers
+  - information in forms, buttons
+
+  Process the file or text carefully, and take a stepped approach. The resulting markdown should be the result of the
+  processing of the complete input html file. Answer with the pure markdown, without any other text.
+
+  {custom_instructions}
+
+  HTML to be processed is in between triple backquotes.
+
+  ```{html}```
+
+llm_model: "mistral.mistral-small-latest"
+metadata:
+  author: "Josako"
+  date_added: "2025-06-25"
+  description: "An aid in transforming HTML-based inputs to markdown, fully automatic"
+  changes: "Initial version"
\ No newline at end of file
diff --git a/config/prompts/globals/translation_with_context/1.0.0.yaml b/config/prompts/globals/translation_with_context/1.0.0.yaml
index 3fd5d8d..b7c5be5 100644
--- a/config/prompts/globals/translation_with_context/1.0.0.yaml
+++ b/config/prompts/globals/translation_with_context/1.0.0.yaml
@@ -7,7 +7,7 @@ content: >
   I only want you to return the translation. No explanation, no options. I need to be able to directly use your
   answer without further interpretation. If more than one option is available, present me with the most probable
   one.
-
+llm_model: "mistral.ministral-8b-latest"
 metadata:
   author: "Josako"
   date_added: "2025-06-23"
diff --git a/config/prompts/globals/translation_without_context/1.0.0.yaml b/config/prompts/globals/translation_without_context/1.0.0.yaml
index 1eece0b..08d2990 100644
--- a/config/prompts/globals/translation_without_context/1.0.0.yaml
+++ b/config/prompts/globals/translation_without_context/1.0.0.yaml
@@ -4,7 +4,7 @@ content: >
   I only want you to return the translation. No explanation, no options. I need to be able to directly use your
   answer without further interpretation. If more than one option is available, present me with the most probable
   one.
-
+llm_model: "mistral.ministral-8b-latest"
 metadata:
   author: "Josako"
   date_added: "2025-06-23"
diff --git a/config/type_defs/processor_types.py b/config/type_defs/processor_types.py
index 4ac0eac..c8cc479 100644
--- a/config/type_defs/processor_types.py
+++ b/config/type_defs/processor_types.py
@@ -24,5 +24,10 @@ PROCESSOR_TYPES = {
         "name": "DOCX Processor",
         "description": "A processor for DOCX files",
         "file_types": "docx",
-    }
+    },
+    "AUTOMAGIC_HTML_PROCESSOR": {
+        "name": "AutoMagic HTML Processor",
+        "description": "A processor for HTML files, driven by AI",
+        "file_types": "html, htm",
+    },
 }
diff --git a/docker/compose_dev.yaml b/docker/compose_dev.yaml
index 289bf3c..30f5c05 100644
--- a/docker/compose_dev.yaml
+++ b/docker/compose_dev.yaml
@@ -24,7 +24,7 @@ x-common-variables: &common-variables
   FLOWER_PASSWORD: 'Jungles'
   OPENAI_API_KEY: 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
   GROQ_API_KEY: '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'
-  MISTRAL_API_KEY: 'jGDc6fkCbt0iOC0jQsbuZhcjLWBPGc2b'
+  MISTRAL_API_KEY: '0f4ZiQ1kIpgIKTHX8d0a8GOD2vAgVqEn'
   ANTHROPIC_API_KEY: 'sk-ant-api03-c2TmkzbReeGhXBO5JxNH6BJNylRDonc9GmZd0eRbrvyekec2'
   JWT_SECRET_KEY: 'bsdMkmQ8ObfMD52yAFg4trrvjgjMhuIqg2fjDpD/JqvgY0ccCcmlsEnVFmR79WPiLKEA3i8a5zmejwLZKl4v9Q=='
   API_ENCRYPTION_KEY: 'xfF5369IsredSrlrYZqkM9ZNrfUASYYS6TCcAR9UKj4='
diff --git a/docker/compose_test.yaml b/docker/compose_test.yaml
index 03a5f80..7aedc10 100644
--- a/docker/compose_test.yaml
+++ b/docker/compose_test.yaml
@@ -26,7 +26,7 @@ x-common-variables: &common-variables
   REDIS_PORT: '6379'
   FLOWER_USER: 'Felucia'
   FLOWER_PASSWORD: 'Jungles'
-  MISTRAL_API_KEY: 'Vkwgr67vUs6ScKmcFF2QVw7uHKgq0WEN'
+  MISTRAL_API_KEY: 'qunKSaeOkFfLteNiUO77RCsXXSLK65Ec'
   JWT_SECRET_KEY: '7e9c8b3a215f4d6e90712c5d8f3b97a60e482c15f39a7d68bcd45910ef23a784'
   API_ENCRYPTION_KEY: 'kJ7N9p3IstyRGkluYTryM8ZMnfUBSXWR3TCfDG9VLc4='
   MINIO_ENDPOINT: minio:9000
diff --git a/eveai_app/__init__.py b/eveai_app/__init__.py
index d3c8f0a..54bb883 100644
--- a/eveai_app/__init__.py
+++ b/eveai_app/__init__.py
@@ -201,8 +201,3 @@ def register_cache_handlers(app):
     register_specialist_cache_handlers(cache_manager)
     from common.utils.cache.license_cache import register_license_cache_handlers
     register_license_cache_handlers(cache_manager)
-
-
-
-
-
diff --git a/eveai_app/templates/document/document_versions.html b/eveai_app/templates/document/document_versions.html
index 9ddc7f0..a29c563 100644
--- a/eveai_app/templates/document/document_versions.html
+++ b/eveai_app/templates/document/document_versions.html
@@ -4,13 +4,13 @@
 {% block title %}Document Versions{% endblock %}
 {% block content_title %}Document Versions{% endblock %}
-{% block content_description %}View Versions for {{ document }}{% endblock %}
+{% block content_description %}View Versions for Document {{ document }}{% endblock %}
 {% block content_class %}
{% endblock %} {% block content %}