Compare commits
4 Commits
v2.3.7-alf
...
v2.3.8-alf
| Author | SHA1 | Date |
|---|---|---|
| | 4338f09f5c | |
| | 53e32a67bd | |
| | fda267b479 | |
| | f5c9542a49 | |
@@ -3,7 +3,6 @@ from langchain.callbacks.base import BaseCallbackHandler
from typing import Dict, Any, List
from langchain.schema import LLMResult
from common.utils.business_event_context import current_event
from flask import current_app


class LLMMetricsHandler(BaseCallbackHandler):
47 common/langchain/persistent_llm_metrics_handler.py Normal file
@@ -0,0 +1,47 @@
import time
from langchain.callbacks.base import BaseCallbackHandler
from typing import Dict, Any, List
from langchain.schema import LLMResult
from common.utils.business_event_context import current_event


class PersistentLLMMetricsHandler(BaseCallbackHandler):
    """Metrics handler that allows metrics to be retrieved from within any call. In case metrics are required for other
    purposes than business event logging."""

    def __init__(self):
        self.total_tokens: int = 0
        self.prompt_tokens: int = 0
        self.completion_tokens: int = 0
        self.start_time: float = 0
        self.end_time: float = 0
        self.total_time: float = 0

    def reset(self):
        self.total_tokens = 0
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.start_time = 0
        self.end_time = 0
        self.total_time = 0

    def on_llm_start(self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any) -> None:
        self.start_time = time.time()

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        self.end_time = time.time()
        self.total_time = self.end_time - self.start_time

        usage = response.llm_output.get('token_usage', {})
        self.prompt_tokens += usage.get('prompt_tokens', 0)
        self.completion_tokens += usage.get('completion_tokens', 0)
        self.total_tokens = self.prompt_tokens + self.completion_tokens

    def get_metrics(self) -> Dict[str, int | float]:
        return {
            'total_tokens': self.total_tokens,
            'prompt_tokens': self.prompt_tokens,
            'completion_tokens': self.completion_tokens,
            'time_elapsed': self.total_time,
            'interaction_type': 'LLM',
        }
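A minimal usage sketch for the new handler (illustrative only: the model class and prompt are assumptions; any LangChain chat model that reports `token_usage` in `llm_output` should behave the same way):

```python
# Illustrative sketch: attach the handler as a callback, run one call, then read the metrics.
from common.langchain.persistent_llm_metrics_handler import PersistentLLMMetricsHandler
from langchain_openai import ChatOpenAI  # assumption: any LangChain chat model with token_usage works

metrics_handler = PersistentLLMMetricsHandler()
llm = ChatOpenAI(model="gpt-4o-mini", callbacks=[metrics_handler])

llm.invoke("Summarise this sentence in three words.")

# Unlike the business-event handler, the metrics remain available for any later caller.
print(metrics_handler.get_metrics())
# -> {'total_tokens': ..., 'prompt_tokens': ..., 'completion_tokens': ...,
#     'time_elapsed': ..., 'interaction_type': 'LLM'}

metrics_handler.reset()  # clear the counters before the next interaction
```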
@@ -34,6 +34,7 @@ class Processor(db.Model):
    catalog_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
    type = db.Column(db.String(50), nullable=False)
    sub_file_type = db.Column(db.String(50), nullable=True)
    active = db.Column(db.Boolean, nullable=True, default=True)

    # Tuning enablers
    tuning = db.Column(db.Boolean, nullable=True, default=False)

@@ -186,6 +186,7 @@ class TenantMake(db.Model):
    active = db.Column(db.Boolean, nullable=False, default=True)
    website = db.Column(db.String(255), nullable=True)
    logo_url = db.Column(db.String(255), nullable=True)
    allowed_languages = db.Column(ARRAY(sa.String(2)), nullable=True)

    # Chat customisation options
    chat_customisation_options = db.Column(JSONB, nullable=True)

@@ -317,3 +318,27 @@ class SpecialistMagicLinkTenant(db.Model):

    magic_link_code = db.Column(db.String(55), primary_key=True)
    tenant_id = db.Column(db.Integer, db.ForeignKey('public.tenant.id'), nullable=False)


class TranslationCache(db.Model):
    __bind_key__ = 'public'
    __table_args__ = {'schema': 'public'}

    cache_key = db.Column(db.String(16), primary_key=True)
    source_text = db.Column(db.Text, nullable=False)
    translated_text = db.Column(db.Text, nullable=False)
    source_language = db.Column(db.String(2), nullable=False)
    target_language = db.Column(db.String(2), nullable=False)
    context = db.Column(db.Text, nullable=True)

    # Translation cost
    prompt_tokens = db.Column(db.Integer, nullable=False)
    completion_tokens = db.Column(db.Integer, nullable=False)

    # Tracking
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey('public.user.id'), nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey('public.user.id'), nullable=True)

    last_used_at = db.Column(db.DateTime, nullable=True)
43 common/services/utils/translation_services.py Normal file
@@ -0,0 +1,43 @@
import xxhash
import json

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from common.langchain.persistent_llm_metrics_handler import PersistentLLMMetricsHandler
from common.utils.model_utils import get_template, replace_variable_in_template

class TranslationService:
    def __init__(self, tenant_id):
        self.tenant_id = tenant_id

    def translate_text(self, text_to_translate: str, target_lang: str, source_lang: str = None, context: str = None) -> tuple[
            str, dict[str, int | float]]:
        prompt_params = {
            "text_to_translate": text_to_translate,
            "target_lang": target_lang,
        }
        if context:
            template, llm = get_template("translation_with_context")
            prompt_params["context"] = context
        else:
            template, llm = get_template("translation_without_context")

        # Add a metrics handler to capture usage

        metrics_handler = PersistentLLMMetricsHandler()
        existing_callbacks = llm.callbacks
        llm.callbacks = existing_callbacks + [metrics_handler]

        translation_prompt = ChatPromptTemplate.from_template(template)

        setup = RunnablePassthrough()

        chain = (setup | translation_prompt | llm | StrOutputParser())

        translation = chain.invoke(prompt_params)

        metrics = metrics_handler.get_metrics()

        return translation, metrics
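A sketch of how the service is intended to be called (the tenant id, text and language code below are placeholders, and the call assumes an application context in which the translation prompt templates can be loaded):

```python
# Illustrative sketch with placeholder values.
from common.services.utils.translation_services import TranslationService

service = TranslationService(tenant_id=1)  # placeholder tenant id
translated, metrics = service.translate_text(
    text_to_translate="Bonjour tout le monde",
    target_lang="en",
    context="Greeting shown on the chat welcome screen",  # optional; selects the with-context prompt
)

print(translated)  # the raw string returned by the LLM chain
print(metrics["prompt_tokens"], metrics["completion_tokens"], metrics["time_elapsed"])
```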
156 common/utils/cache/translation_cache.py vendored Normal file
@@ -0,0 +1,156 @@
import json
from typing import Dict, Any, Optional
from datetime import datetime as dt, timezone as tz

import xxhash
from flask import current_app
from sqlalchemy import and_
from sqlalchemy.inspection import inspect

from common.utils.cache.base import CacheHandler, T
from common.extensions import db

from common.models.user import TranslationCache
from common.services.utils.translation_services import TranslationService
from flask_security import current_user


class TranslationCacheHandler(CacheHandler[TranslationCache]):
    """Handles caching of translations with fallback to database and external translation service"""
    handler_name = 'translation_cache'

    def __init__(self, region):
        super().__init__(region, 'translation')
        self.configure_keys('hash_key')

    def _to_cache_data(self, instance: TranslationCache) -> Dict[str, Any]:
        """Convert TranslationCache instance to cache data using SQLAlchemy inspection"""
        if not instance:
            return {}

        mapper = inspect(TranslationCache)
        data = {}

        for column in mapper.columns:
            value = getattr(instance, column.name)

            # Handle date serialization
            if isinstance(value, dt):
                data[column.name] = value.isoformat()
            else:
                data[column.name] = value

        return data

    def _from_cache_data(self, data: Dict[str, Any], **kwargs) -> TranslationCache:
        if not data:
            return None

        # Create a new TranslationCache instance
        translation = TranslationCache()
        mapper = inspect(TranslationCache)

        # Set all attributes dynamically
        for column in mapper.columns:
            if column.name in data:
                value = data[column.name]

                # Handle date deserialization
                if column.name.endswith('_date') and value:
                    if isinstance(value, str):
                        value = dt.fromisoformat(value).date()

                setattr(translation, column.name, value)

        return translation

    def _should_cache(self, value: TranslationCache) -> bool:
        """Validate if the translation should be cached"""
        return value is not None and value.cache_key is not None

    def get_translation(self, text: str, target_lang: str, source_lang: str = None, context: str = None) -> Optional[TranslationCache]:
        """
        Get the translation for a text in a specific language

        Args:
            text: The text to be translated
            target_lang: The target language for the translation
            source_lang: The source language of the text to be translated
            context: Optional context for the translation

        Returns:
            TranslationCache instance if found, None otherwise
        """

        def creator_func(text: str, target_lang: str, source_lang: str = None, context: str = None) -> Optional[TranslationCache]:
            # Generate cache key based on inputs
            cache_key = self._generate_cache_key(text, target_lang, source_lang, context)

            # Check if translation already exists in database
            existing_translation = db.session.query(TranslationCache).filter_by(cache_key=cache_key).first()

            if existing_translation:
                # Update last used timestamp
                existing_translation.last_used_at = dt.now(tz=tz.utc)
                db.session.commit()
                return existing_translation

            # Translation not found in DB, need to create it
            # Initialize translation service
            translation_service = TranslationService(getattr(current_app, 'tenant_id', None))

            # Get the translation and metrics
            translated_text, metrics = translation_service.translate_text(
                text_to_translate=text,
                target_lang=target_lang,
                source_lang=source_lang,
                context=context
            )

            # Create new translation cache record
            new_translation = TranslationCache(
                cache_key=cache_key,
                source_text=text,
                translated_text=translated_text,
                source_language=source_lang or 'auto',
                target_language=target_lang,
                context=context,
                prompt_tokens=metrics.get('prompt_tokens', 0),
                completion_tokens=metrics.get('completion_tokens', 0),
                created_at=dt.now(tz=tz.utc),
                created_by=getattr(current_user, 'id', None) if 'current_user' in globals() else None,
                updated_at=dt.now(tz=tz.utc),
                updated_by=getattr(current_user, 'id', None) if 'current_user' in globals() else None,
                last_used_at=dt.now(tz=tz.utc)
            )

            # Save to database
            db.session.add(new_translation)
            db.session.commit()

            return new_translation

        return self.get(creator_func, text=text, target_lang=target_lang, source_lang=source_lang, context=context)

    def invalidate_tenant_translations(self, tenant_id: int):
        """Invalidate cached translations for specific tenant"""
        self.invalidate(tenant_id=tenant_id)

    def _generate_cache_key(self, text: str, target_lang: str, source_lang: str = None, context: str = None) -> str:
        """Generate cache key for a translation"""
        cache_data = {
            "text": text.strip(),
            "target_lang": target_lang.lower(),
            "source_lang": source_lang.lower() if source_lang else None,
            "context": context.strip() if context else None,
        }

        cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
        return xxhash.xxh64(cache_string.encode('utf-8')).hexdigest()


def register_translation_cache_handlers(cache_manager) -> None:
    """Register translation cache handlers with cache manager"""
    cache_manager.register_handler(
        TranslationCacheHandler,
        'eveai_model'  # Use existing eveai_model region
    )
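The cache key is a deterministic 64-bit xxhash of the normalised inputs, so repeated requests for the same text/language/context combination hit the same row. A standalone sketch of the same scheme used by `_generate_cache_key`:

```python
# Illustrative sketch mirroring TranslationCacheHandler._generate_cache_key.
import json
import xxhash


def translation_cache_key(text, target_lang, source_lang=None, context=None) -> str:
    cache_data = {
        "text": text.strip(),
        "target_lang": target_lang.lower(),
        "source_lang": source_lang.lower() if source_lang else None,
        "context": context.strip() if context else None,
    }
    # sort_keys makes the JSON, and therefore the hash, independent of key order
    cache_string = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
    return xxhash.xxh64(cache_string.encode("utf-8")).hexdigest()


# Whitespace and case are normalised away, so these produce the same key.
assert translation_cache_key("Hello ", "EN") == translation_cache_key("Hello", "en")
# xxh64 hexdigests are 16 characters, which matches the String(16) cache_key column.
```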
@@ -1,14 +1,18 @@
import json

"""
Utility functions for chat customization.
"""
from flask import current_app


def get_default_chat_customisation(tenant_customisation=None):
    """
    Get chat customization options with default values for missing options.

    Args:
        tenant_customization (dict, optional): The tenant's customization options.
            Defaults to None.
        tenant_customisation (dict or str, optional): The tenant's customization options.
            Defaults to None. Can be a dict or a JSON string.

    Returns:
        dict: A dictionary containing all customization options with default values
@@ -37,9 +41,20 @@ def get_default_chat_customisation(tenant_customisation=None):
    # Start with the default customization
    customisation = default_customisation.copy()

    # Convert JSON string to dict if needed
    if isinstance(tenant_customisation, str):
        try:
            tenant_customisation = json.loads(tenant_customisation)
            current_app.logger.debug(f"Converted JSON string to dict: {tenant_customisation}")
        except json.JSONDecodeError as e:
            current_app.logger.error(f"Error parsing JSON customisation: {e}")
            return default_customisation

    # Update with tenant customization
    for key, value in tenant_customisation.items():
        if key in customisation:
            customisation[key] = value
    current_app.logger.debug(f"Tenant customisation - in default creation: {tenant_customisation}")
    if tenant_customisation:
        for key, value in tenant_customisation.items():
            if key in customisation:
                customisation[key] = value

    return customisation
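The net effect of this change is that `get_default_chat_customisation` now accepts either a dict or the JSON string stored on a Tenant Make, and falls back to the defaults when parsing fails. A hypothetical call (the `primary_colour` key is invented for illustration, and the function needs a Flask application context for its logging):

```python
# Illustrative sketch: "primary_colour" is an invented key; run inside a Flask app context.
from_json = get_default_chat_customisation('{"primary_colour": "#112233"}')
from_dict = get_default_chat_customisation({"primary_colour": "#112233"})
assert from_json == from_dict  # a JSON string and a dict now behave identically

# Invalid JSON no longer raises: the default customisation is returned instead.
fallback = get_default_chat_customisation('{not valid json')
```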
@@ -3,7 +3,7 @@ from datetime import datetime as dt, timezone as tz
from sqlalchemy import desc
from sqlalchemy.exc import SQLAlchemyError
from werkzeug.utils import secure_filename
from common.models.document import Document, DocumentVersion, Catalog
from common.models.document import Document, DocumentVersion, Catalog, Processor
from common.extensions import db, minio_client
from common.utils.celery_utils import current_celery
from flask import current_app
@@ -11,6 +11,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
import os

from config.type_defs.processor_types import PROCESSOR_TYPES
from .config_field_types import normalize_json_field
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
                               EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
@@ -469,3 +470,15 @@ def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -
            "Error during document lookup",
            status_code=500
        )


def is_file_type_supported_by_catalog(catalog_id, file_type):
    processors = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True).all()

    supported_file_types = []
    for processor in processors:
        processor_file_types = PROCESSOR_TYPES[processor.type]['file_types']
        file_types = [f.strip() for f in processor_file_types.split(",")]
        supported_file_types.extend(file_types)

    if file_type not in supported_file_types:
        raise EveAIUnsupportedFileType()
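The new guard collects the comma-separated `file_types` of every active processor on the catalog (for example `"html, htm"` from PROCESSOR_TYPES) and raises if the uploaded extension is not among them. An illustrative call site, with placeholder values:

```python
# Illustrative sketch: catalog id and extension are placeholders.
try:
    is_file_type_supported_by_catalog(catalog_id=42, file_type="htm")
except EveAIUnsupportedFileType:
    # No active processor on this catalog lists "htm" among its file types; reject the upload.
    ...
```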
@@ -34,7 +34,25 @@ class EveAIDoubleURLException(EveAIException):
class EveAIUnsupportedFileType(EveAIException):
    """Raised when an invalid file type is provided"""

    def __init__(self, message="Filetype is not supported", status_code=400, payload=None):
    def __init__(self, message="Filetype is not supported by current active processors", status_code=400, payload=None):
        super().__init__(message, status_code, payload)


class EveAINoProcessorFound(EveAIException):
    """Raised when no processor is found for a given file type"""

    def __init__(self, catalog_id, file_type, file_subtype, status_code=400, payload=None):
        message = f"No active processor found for catalog {catalog_id} with file type {file_type} and subtype {file_subtype}"
        super().__init__(message, status_code, payload)


class EveAINoContentFound(EveAIException):
    """Raised when no content is found for a given document"""

    def __init__(self, document_id, document_version_id, status_code=400, payload=None):
        self.document_id = document_id
        self.document_version_id = document_version_id
        message = f"No content found while processing Document with ID {document_id} and version {document_version_id}."
        super().__init__(message, status_code, payload)
@@ -148,7 +148,7 @@ class Config(object):
        },
    }

    SUPPORTED_LANGUAGES_Full = list(SUPPORTED_LANGUAGE_DETAILS.keys())
    SUPPORTED_LANGUAGES_FULL = list(SUPPORTED_LANGUAGE_DETAILS.keys())

    # supported currencies
    SUPPORTED_CURRENCIES = ['€', '$']
@@ -156,10 +156,7 @@ class Config(object):
    # supported LLMs
    # SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
    SUPPORTED_EMBEDDINGS = ['mistral.mistral-embed']
    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4o-mini',
                      'mistral.mistral-large-latest', 'mistral.mistral-medium_latest', 'mistral.mistral-small-latest']

    ANTHROPIC_LLM_VERSIONS = {'claude-3-5-sonnet': 'claude-3-5-sonnet-20240620', }
    SUPPORTED_LLMS = ['mistral.mistral-large-latest', 'mistral.mistral-medium_latest', 'mistral.mistral-small-latest']

    # Annotation text chunk length
    ANNOTATION_TEXT_CHUNK_LENGTH = 10000
@@ -56,11 +56,6 @@ configuration:
    description: "Sidebar Markdown-formatted Text"
    type: "text"
    required: false
  "welcome_message":
    name: "Welcome Message"
    description: "Text to be shown as Welcome"
    type: "text"
    required: false
metadata:
  author: "Josako"
  date_added: "2024-06-06"
@@ -0,0 +1,14 @@
version: "1.0.0"
name: "HTML Processor"
file_types: "html"
description: "A processor for HTML files, driven by AI"
configuration:
  custom_instructions:
    name: "Custom Instructions"
    description: "Some custom instruction to guide our AI agent in parsing your HTML file"
    type: "text"
    required: false
metadata:
  author: "Josako"
  date_added: "2025-06-25"
  description: "A processor for HTML files, driven by AI"
30 config/prompts/globals/automagic_html_parse/1.0.0.yaml Normal file
@@ -0,0 +1,30 @@
version: "1.0.0"
content: |
  You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The
  generated files will be used to generate embeddings in a RAG-system.

  # Best practices are:
  - Respect wordings and language(s) used in the HTML.
  - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
  - Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
  - Be careful of encoding of the text. Everything needs to be human readable.

  You only return relevant information, and filter out non-relevant information, such as:
  - information found in menu bars, sidebars, footers or headers
  - information in forms, buttons

  Process the file or text carefully, and take a stepped approach. The resulting markdown should be the result of the
  processing of the complete input html file. Answer with the pure markdown, without any other text.

  {custom_instructions}

  HTML to be processed is in between triple backquotes.

  ```{html}```

llm_model: "mistral.mistral-small-latest"
metadata:
  author: "Josako"
  date_added: "2025-06-25"
  description: "An aid in transforming HTML-based inputs to markdown, fully automatic"
  changes: "Initial version"
15 config/prompts/globals/translation_with_context/1.0.0.yaml Normal file
@@ -0,0 +1,15 @@
version: "1.0.0"
content: >
  You are a top translator. We need you to translate {text_to_translate} into {target_language}, taking into account
  this context:

  {context}

  I only want you to return the translation. No explanation, no options. I need to be able to directly use your answer
  without further interpretation. If more than one option is available, present me with the most probable one.
llm_model: "mistral.ministral-8b-latest"
metadata:
  author: "Josako"
  date_added: "2025-06-23"
  description: "An assistant to translate given a context."
  changes: "Initial version"
@@ -0,0 +1,12 @@
version: "1.0.0"
content: >
  You are a top translator. We need you to translate {text_to_translate} into {target_language}.

  I only want you to return the translation. No explanation, no options. I need to be able to directly use your answer
  without further interpretation. If more than one option is available, present me with the most probable one.
llm_model: "mistral.ministral-8b-latest"
metadata:
  author: "Josako"
  date_added: "2025-06-23"
  description: "An assistant to translate without context."
  changes: "Initial version"
@@ -24,5 +24,10 @@ PROCESSOR_TYPES = {
        "name": "DOCX Processor",
        "description": "A processor for DOCX files",
        "file_types": "docx",
    }
    },
    "AUTOMAGIC_HTML_PROCESSOR": {
        "name": "AutoMagic HTML Processor",
        "description": "A processor for HTML files, driven by AI",
        "file_types": "html, htm",
    },
}
@@ -28,4 +28,12 @@ PROMPT_TYPES = {
        "name": "transcript",
        "description": "An assistant to transform a transcript to markdown.",
    },
    "translation_with_context": {
        "name": "translation_with_context",
        "description": "An assistant to translate text with context",
    },
    "translation_without_context": {
        "name": "translation_without_context",
        "description": "An assistant to translate text without context",
    },
}
@@ -5,6 +5,36 @@ All notable changes to EveAI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [2.3.8-alfa]

### Added
- Translation Service
- Automagic HTML Processor
- Allowed languages defined at level of Tenant Make

### Changed
- For changes in existing functionality.
- Allow to activate / de-activate Processors
- Align all document views with session catalog
- Allow different processor types to handle the same file types
- Remove welcome message from tenant_make customisation, add to specialist configuration

### Deprecated
- For soon-to-be removed features.

### Removed
- For now removed features.

### Fixed
- Adapt TRAICIE_ROLE_DEFINITION_SPECIALIST to latest requirements
- Allow for empty historical messages
- Ensure client can cope with empty customisation options
- Ensure only tenant-defined makes are selectable throughout the application
- Refresh partner info when adding Partner Services

### Security
- In case of vulnerabilities.

## [2.3.7-alfa]

### Added
@@ -24,7 +24,7 @@ x-common-variables: &common-variables
  FLOWER_PASSWORD: 'Jungles'
  OPENAI_API_KEY: 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
  GROQ_API_KEY: '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'
  MISTRAL_API_KEY: 'jGDc6fkCbt0iOC0jQsbuZhcjLWBPGc2b'
  MISTRAL_API_KEY: '0f4ZiQ1kIpgIKTHX8d0a8GOD2vAgVqEn'
  ANTHROPIC_API_KEY: 'sk-ant-api03-c2TmkzbReeGhXBO5JxNH6BJNylRDonc9GmZd0eRbrvyekec2'
  JWT_SECRET_KEY: 'bsdMkmQ8ObfMD52yAFg4trrvjgjMhuIqg2fjDpD/JqvgY0ccCcmlsEnVFmR79WPiLKEA3i8a5zmejwLZKl4v9Q=='
  API_ENCRYPTION_KEY: 'xfF5369IsredSrlrYZqkM9ZNrfUASYYS6TCcAR9UKj4='
@@ -26,7 +26,7 @@ x-common-variables: &common-variables
  REDIS_PORT: '6379'
  FLOWER_USER: 'Felucia'
  FLOWER_PASSWORD: 'Jungles'
  MISTRAL_API_KEY: 'Vkwgr67vUs6ScKmcFF2QVw7uHKgq0WEN'
  MISTRAL_API_KEY: 'qunKSaeOkFfLteNiUO77RCsXXSLK65Ec'
  JWT_SECRET_KEY: '7e9c8b3a215f4d6e90712c5d8f3b97a60e482c15f39a7d68bcd45910ef23a784'
  API_ENCRYPTION_KEY: 'kJ7N9p3IstyRGkluYTryM8ZMnfUBSXWR3TCfDG9VLc4='
  MINIO_ENDPOINT: minio:9000
@@ -201,8 +201,3 @@ def register_cache_handlers(app):
    register_specialist_cache_handlers(cache_manager)
    from common.utils.cache.license_cache import register_license_cache_handlers
    register_license_cache_handlers(cache_manager)
@@ -4,13 +4,13 @@
{% block title %}Document Versions{% endblock %}

{% block content_title %}Document Versions{% endblock %}
{% block content_description %}View Versions for {{ document }}{% endblock %}
{% block content_description %}View Versions for Document <b>{{ document }}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}

{% block content %}
<div class="container">
    <form method="POST" action="{{ url_for('document_bp.handle_document_version_selection') }}" id="documentVersionsForm">
        {{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
        {{ render_selectable_table(headers=["ID", "File Type", "File Size", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
        <div class="form-group mt-3 d-flex justify-content-between">
            <div>
                <button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>
@@ -4,14 +4,13 @@
{% block title %}Documents{% endblock %}

{% block content_title %}Documents{% endblock %}
{% block content_description %}View Documents for Tenant{% endblock %}
{% block content_description %}View Documents for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}

{% block content %}
<!-- Filter Form -->
{% set filter_form %}
    <form method="GET" action="{{ url_for('document_bp.documents') }}">
        {{ render_filter_field('catalog_id', 'Catalog', filter_options['catalog_id'], filters.get('catalog_id', [])) }}
        {{ render_filter_field('validity', 'Validity', filter_options['validity'], filters.get('validity', [])) }}

        <button type="submit" class="btn btn-primary">Apply Filters</button>
@@ -27,7 +26,6 @@
        headers=[
            {"text": "ID", "sort": "id"},
            {"text": "Name", "sort": "name"},
            {"text": "Catalog", "sort": "catalog_name"},
            {"text": "Valid From", "sort": "valid_from"},
            {"text": "Valid To", "sort": "valid_to"}
        ],
@@ -4,7 +4,7 @@
{% block title %}Edit Processor{% endblock %}

{% block content_title %}Edit Processor{% endblock %}
{% block content_description %}Edit a Processor (for a Catalog){% endblock %}
{% block content_description %}Edit Processor for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}

{% block content %}
<form method="post">
@@ -4,7 +4,7 @@
{% block title %}Edit Retriever{% endblock %}

{% block content_title %}Edit Retriever{% endblock %}
{% block content_description %}Edit a Retriever (for a Catalog){% endblock %}
{% block content_description %}Edit a Retriever for catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}

{% block content %}
<form method="post">
@@ -4,7 +4,7 @@
{% block title %}Processor Registration{% endblock %}

{% block content_title %}Register Processor{% endblock %}
{% block content_description %}Define a new processor (for a catalog){% endblock %}
{% block content_description %}Define a new processor for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}

{% block content %}
<form method="post">
@@ -4,13 +4,13 @@
{% block title %}Processors{% endblock %}

{% block content_title %}Processors{% endblock %}
{% block content_description %}View Processors for Tenant{% endblock %}
{% block content_description %}View Processors for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}

{% block content %}
<div class="container">
    <form method="POST" action="{{ url_for('document_bp.handle_processor_selection') }}" id="processorsForm">
        {{ render_selectable_table(headers=["Processor ID", "Name", "Type", "Catalog ID"], rows=rows, selectable=True, id="retrieversTable") }}
        {{ render_selectable_table(headers=["Processor ID", "Name", "Type", "Active"], rows=rows, selectable=True, id="retrieversTable") }}
        <div class="form-group mt-3 d-flex justify-content-between">
            <div>
                <button type="submit" name="action" value="edit_processor" class="btn btn-primary" onclick="return validateTableSelection('processorsForm')">Edit Processor</button>
@@ -4,7 +4,7 @@
{% block title %}Retriever Registration{% endblock %}

{% block content_title %}Register Retriever{% endblock %}
{% block content_description %}Define a new retriever (for a catalog){% endblock %}
{% block content_description %}Define a new retriever for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}

{% block content %}
<form method="post">
@@ -4,13 +4,13 @@
{% block title %}Retrievers{% endblock %}

{% block content_title %}Retrievers{% endblock %}
{% block content_description %}View Retrievers for Tenant{% endblock %}
{% block content_description %}View Retrievers for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}

{% block content %}
<div class="container">
    <form method="POST" action="{{ url_for('document_bp.handle_retriever_selection') }}" id="retrieversForm">
        {{ render_selectable_table(headers=["Retriever ID", "Name", "Type", "Catalog ID"], rows=rows, selectable=True, id="retrieversTable") }}
        {{ render_selectable_table(headers=["Retriever ID", "Name", "Type"], rows=rows, selectable=True, id="retrieversTable") }}
        <div class="form-group mt-3 d-flex justify-content-between">
            <div>
                <button type="submit" name="action" value="edit_retriever" class="btn btn-primary" onclick="return validateTableSelection('retrieversForm')">Edit Retriever</button>
@@ -71,15 +71,6 @@ class ProcessorForm(FlaskForm):
    name = StringField('Name', validators=[DataRequired(), Length(max=50)])
    description = TextAreaField('Description', validators=[Optional()])

    # Catalog for the Retriever
    catalog = QuerySelectField(
        'Catalog ID',
        query_factory=lambda: Catalog.query.all(),
        allow_blank=True,
        get_label='name',
        validators=[DataRequired()],
    )

    # Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
    type = SelectField('Processor Type', validators=[DataRequired()])

@@ -89,6 +80,7 @@ class ProcessorForm(FlaskForm):
                                   default=2000)
    max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
                                  default=3000)
    active = BooleanField('Active', default=True)
    tuning = BooleanField('Enable Embedding Tuning', default=False)

    # Metadata fields
@@ -108,14 +100,6 @@ class EditProcessorForm(DynamicFormBase):
    name = StringField('Name', validators=[DataRequired(), Length(max=50)])
    description = TextAreaField('Description', validators=[Optional()])

    # Catalog for the Retriever
    catalog = QuerySelectField(
        'Catalog ID',
        query_factory=lambda: Catalog.query.all(),
        allow_blank=True,
        get_label='name',
        validators=[Optional()],
    )
    type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})

    sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
@@ -124,6 +108,7 @@ class EditProcessorForm(DynamicFormBase):
                                   default=2000)
    max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
                                  default=3000)
    active = BooleanField('Active', default=True)
    tuning = BooleanField('Enable Embedding Tuning', default=False)

    # Metadata fields
@@ -134,14 +119,7 @@ class EditProcessorForm(DynamicFormBase):
class RetrieverForm(FlaskForm):
    name = StringField('Name', validators=[DataRequired(), Length(max=50)])
    description = TextAreaField('Description', validators=[Optional()])
    # Catalog for the Retriever
    catalog = QuerySelectField(
        'Catalog ID',
        query_factory=lambda: Catalog.query.all(),
        allow_blank=True,
        get_label='name',
        validators=[Optional()],
    )

    # Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
    type = SelectField('Retriever Type', validators=[DataRequired()])
    tuning = BooleanField('Enable Tuning', default=False)
@@ -160,14 +138,7 @@ class RetrieverForm(FlaskForm):
class EditRetrieverForm(DynamicFormBase):
    name = StringField('Name', validators=[DataRequired(), Length(max=50)])
    description = TextAreaField('Description', validators=[Optional()])
    # Catalog for the Retriever
    catalog = QuerySelectField(
        'Catalog ID',
        query_factory=lambda: Catalog.query.all(),
        allow_blank=True,
        get_label='name',
        validators=[Optional()],
    )

    # Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
    type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})
    tuning = BooleanField('Enable Tuning', default=False)
@@ -1,5 +1,5 @@
from datetime import datetime
from flask import request, render_template, session
from datetime import datetime as dt, timezone as tz
from flask import request, render_template, session, current_app
from sqlalchemy import desc, asc, or_, and_, cast, Integer
from common.models.document import Document, Catalog
from common.utils.filtered_list_view import FilteredListView
@@ -7,31 +7,19 @@ from common.utils.view_assistants import prepare_table_for_macro


class DocumentListView(FilteredListView):
    allowed_filters = ['catalog_id', 'validity']
    allowed_sorts = ['id', 'name', 'catalog_name', 'valid_from', 'valid_to']
    allowed_filters = ['validity']
    allowed_sorts = ['id', 'name', 'valid_from', 'valid_to']

    def get_query(self):
        return Document.query.join(Catalog).add_columns(
            Document.id,
            Document.name,
            Catalog.name.label('catalog_name'),
            Document.valid_from,
            Document.valid_to
        )
        catalog_id = session.get('catalog_id')
        current_app.logger.debug(f"Catalog ID: {catalog_id}")
        return Document.query.filter_by(catalog_id=catalog_id)

    def apply_filters(self, query):
        filters = request.args.to_dict(flat=False)

        if 'catalog_id' in filters:
            catalog_ids = filters['catalog_id']
            if catalog_ids:
                # Convert catalog_ids to a list of integers
                catalog_ids = [int(cid) for cid in catalog_ids if cid.isdigit()]
                if catalog_ids:
                    query = query.filter(Document.catalog_id.in_(catalog_ids))

        if 'validity' in filters:
            now = datetime.utcnow().date()
            now = dt.now(tz.utc).date()
            if 'valid' in filters['validity']:
                query = query.filter(
                    and_(
@@ -47,10 +35,7 @@ class DocumentListView(FilteredListView):
        sort_order = request.args.get('sort_order', 'asc')

        if sort_by in self.allowed_sorts:
            if sort_by == 'catalog_name':
                column = Catalog.name
            else:
                column = getattr(Document, sort_by)
            column = getattr(Document, sort_by)

            if sort_order == 'asc':
                query = query.order_by(asc(column))
@@ -61,42 +46,39 @@ class DocumentListView(FilteredListView):

    def get(self):
        query = self.get_query()
        query = self.apply_filters(query)
        query = self.apply_sorting(query)
        # query = self.apply_filters(query)
        # query = self.apply_sorting(query)
        pagination = self.paginate(query)

        def format_date(date):
            if isinstance(date, datetime):
            if isinstance(date, dt):
                return date.strftime('%Y-%m-%d')
            elif isinstance(date, str):
                return date
            else:
                return ''

        current_app.logger.debug(f"Items retrieved: {pagination.items}")
        rows = [
            [
                {'value': item.id, 'class': '', 'type': 'text'},
                {'value': item.name, 'class': '', 'type': 'text'},
                {'value': item.catalog_name, 'class': '', 'type': 'text'},
                {'value': format_date(item.valid_from), 'class': '', 'type': 'text'},
                {'value': format_date(item.valid_to), 'class': '', 'type': 'text'}
            ] for item in pagination.items
        ]

        catalogs = Catalog.query.all()

        context = {
            'rows': rows,
            'pagination': pagination,
            'filters': request.args.to_dict(flat=False),
            'sort_by': request.args.get('sort_by', 'id'),
            'sort_order': request.args.get('sort_order', 'asc'),
            'filter_options': self.get_filter_options(catalogs)
            'filter_options': self.get_filter_options()
        }
        return render_template(self.template, **context)

    def get_filter_options(self, catalogs):
    def get_filter_options(self):
        return {
            'catalog_id': [(str(cat.id), cat.name) for cat in catalogs],
            'validity': [('valid', 'Valid'), ('all', 'All')]
        }
        }
@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager, minio_client
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
    edit_document, \
    edit_document_version, refresh_document, clean_url
    edit_document_version, refresh_document, clean_url, is_file_type_supported_by_catalog
from common.utils.dynamic_field_utils import create_default_config_from_type_config
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
    EveAIDoubleURLException, EveAIException
@@ -110,7 +110,6 @@ def handle_catalog_selection():
        current_app.logger.info(f'Setting session catalog to {catalog.name}')
        session['catalog_id'] = catalog_id
        session['catalog_name'] = catalog.name
        current_app.logger.info(f'Finished setting session catalog to {catalog.name}')
    elif action == 'edit_catalog':
        return redirect(prefixed_url_for('document_bp.edit_catalog', catalog_id=catalog_id))

@@ -157,7 +156,7 @@ def processor():
        tenant_id = session.get('tenant').get('id')
        new_processor = Processor()
        form.populate_obj(new_processor)
        new_processor.catalog_id = form.catalog.data.id
        new_processor.catalog_id = session.get('catalog_id')
        processor_config = cache_manager.processors_config_cache.get_config(new_processor.type)
        new_processor.configuration = create_default_config_from_type_config(
            processor_config["configuration"])
@@ -204,9 +203,6 @@ def edit_processor(processor_id):
        form.populate_obj(processor)
        processor.configuration = form.get_dynamic_data('configuration')

        # Update catalog relationship
        processor.catalog_id = form.catalog.data.id if form.catalog.data else None

        # Update logging information
        update_logging_information(processor, dt.now(tz.utc))
@@ -235,14 +231,19 @@ def processors():
    page = request.args.get('page', 1, type=int)
    per_page = request.args.get('per_page', 10, type=int)

    query = Processor.query.order_by(Processor.id)
    catalog_id = session.get('catalog_id', None)
    if not catalog_id:
        flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
        return redirect(prefixed_url_for('document_bp.catalogs'))

    query = Processor.query.filter_by(catalog_id=catalog_id).order_by(Processor.id)

    pagination = query.paginate(page=page, per_page=per_page)
    the_processors = pagination.items

    # prepare table data
    rows = prepare_table_for_macro(the_processors,
                                   [('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
                                   [('id', ''), ('name', ''), ('type', ''), ('active', '')])

    # Render the catalogs in a template
    return render_template('document/processors.html', rows=rows, pagination=pagination)
@@ -272,7 +273,7 @@ def retriever():
        tenant_id = session.get('tenant').get('id')
        new_retriever = Retriever()
        form.populate_obj(new_retriever)
        new_retriever.catalog_id = form.catalog.data.id
        new_retriever.catalog_id = session.get('catalog_id')
        new_retriever.type_version = cache_manager.retrievers_version_tree_cache.get_latest_version(
            new_retriever.type)

@@ -301,12 +302,6 @@ def edit_retriever(retriever_id):
    # Get the retriever or return 404
    retriever = Retriever.query.get_or_404(retriever_id)

    if retriever.catalog_id:
        # If catalog_id is just an ID, fetch the Catalog object
        retriever.catalog = Catalog.query.get(retriever.catalog_id)
    else:
        retriever.catalog = None

    # Create form instance with the retriever
    form = EditRetrieverForm(request.form, obj=retriever)

@@ -319,9 +314,6 @@ def edit_retriever(retriever_id):
        form.populate_obj(retriever)
        retriever.configuration = form.get_dynamic_data('configuration')

        # Update catalog relationship
        retriever.catalog_id = form.catalog.data.id if form.catalog.data else None

        # Update logging information
        update_logging_information(retriever, dt.now(tz.utc))
@@ -350,14 +342,19 @@ def retrievers():
    page = request.args.get('page', 1, type=int)
    per_page = request.args.get('per_page', 10, type=int)

    query = Retriever.query.order_by(Retriever.id)
    catalog_id = session.get('catalog_id', None)
    if not catalog_id:
        flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
        return redirect(prefixed_url_for('document_bp.catalogs'))

    query = Retriever.query.filter_by(catalog_id=catalog_id).order_by(Retriever.id)

    pagination = query.paginate(page=page, per_page=per_page)
    the_retrievers = pagination.items

    # prepare table data
    rows = prepare_table_for_macro(the_retrievers,
                                   [('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
                                   [('id', ''), ('name', ''), ('type', '')])

    # Render the catalogs in a template
    return render_template('document/retrievers.html', rows=rows, pagination=pagination)
@@ -400,6 +397,8 @@ def add_document():
        filename = secure_filename(file.filename)
        extension = filename.rsplit('.', 1)[1].lower()

        is_file_type_supported_by_catalog(catalog_id, extension)

        catalog_properties = form.get_dynamic_data("tagging_fields")

        api_input = {
@@ -451,6 +450,8 @@ def add_url():

        file_content, filename, extension = process_url(url, tenant_id)

        is_file_type_supported_by_catalog(catalog_id, extension)

        catalog_properties = {}
        full_config = cache_manager.catalogs_config_cache.get_config(catalog.type)
        document_version_configurations = full_config['document_version_configurations']
@@ -489,6 +490,11 @@ def add_url():
@document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def documents():
    catalog_id = session.get('catalog_id', None)
    if not catalog_id:
        flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
        return redirect(prefixed_url_for('document_bp.catalogs'))

    view = DocumentListView(Document, 'document/documents.html', per_page=10)
    return view.get()
@@ -609,7 +615,7 @@ def edit_document_version_view(document_version_id):
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def document_versions(document_id):
    doc = Document.query.get_or_404(document_id)
    doc_desc = f'Document {doc.name}'
    doc_desc = f'{doc.name}'

    page = request.args.get('page', 1, type=int)
    per_page = request.args.get('per_page', 10, type=int)
@@ -621,8 +627,7 @@ def document_versions(document_id):
    pagination = query.paginate(page=page, per_page=per_page, error_out=False)
    doc_langs = pagination.items

    rows = prepare_table_for_macro(doc_langs, [('id', ''), ('url', ''),
                                               ('object_name', ''), ('file_type', ''),
    rows = prepare_table_for_macro(doc_langs, [('id', ''), ('file_type', ''), ('file_size', ''),
                                               ('processing', ''), ('processing_started_at', ''),
                                               ('processing_finished_at', ''), ('processing_error', '')])
@@ -312,7 +312,7 @@ class DynamicFormBase(FlaskForm):
            field_class = SelectField
            tenant_id = session.get('tenant').get('id')
            makes = TenantMake.query.filter_by(tenant_id=tenant_id).all()
            choices = [(make.name, make.name) for make in makes]
            choices = [(make.id, make.name) for make in makes]
            extra_classes = ''
            field_kwargs = {'choices': choices}

@@ -328,6 +328,16 @@ class DynamicFormBase(FlaskForm):
            initial_data: Optional initial data for the fields
        """
        current_app.logger.debug(f"Adding dynamic fields for collection {collection_name} with config: {config}")

        if isinstance(initial_data, str):
            try:
                initial_data = json.loads(initial_data)
            except (json.JSONDecodeError, TypeError):
                current_app.logger.error(f"Invalid JSON in initial_data: {initial_data}")
                initial_data = {}
        elif initial_data is None:
            initial_data = {}

        # Store the full configuration for later use in get_list_type_configs_js
        if not hasattr(self, '_full_configs'):
            self._full_configs = {}
@@ -581,7 +591,10 @@ class DynamicFormBase(FlaskForm):
            except Exception as e:
                current_app.logger.error(f"Error converting initial data to patterns: {e}")
            elif isinstance(field, DateField):
                data[original_field_name] = field.data.isoformat()
                if field.data:
                    data[original_field_name] = field.data.isoformat()
                else:
                    data[original_field_name] = None
            else:
                data[original_field_name] = field.data
        return data
@@ -1,3 +1,4 @@
from flask import session
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SelectField, TextAreaField)
from wtforms.fields.datetime import DateField
@@ -181,7 +182,8 @@ class EditSpecialistMagicLinkForm(DynamicFormBase):
            self.specialist_name.data = ''

        # Dynamically populate the tenant_make field with None as first option
        tenant_makes = TenantMake.query.all()
        tenant_id = session.get('tenant').get('id')
        tenant_makes = TenantMake.query.filter_by(tenant_id=tenant_id).all()
        self.tenant_make_id.choices = [(0, 'None')] + [(make.id, make.name) for make in tenant_makes]
@@ -702,12 +702,13 @@ def specialist_magic_link():
        new_spec_ml_tenant.tenant_id = tenant_id

        # Define the make valid for this magic link
        make_id = SpecialistServices.get_specialist_system_field(new_specialist_magic_link.specialist_id,
                                                                 "make", "tenant_make")
        specialist = Specialist.query.get(new_specialist_magic_link.specialist_id)
        make_id = specialist.configuration.get('make', None)
        current_app.logger.debug(f"make_id defined in specialist: {make_id}")
        if make_id:
            new_spec_ml_tenant.tenant_make_id = make_id
            new_specialist_magic_link.tenant_make_id = make_id
        elif session.get('tenant').get('default_tenant_make_id'):
            new_spec_ml_tenant.tenant_make_id = session.get('tenant').get('default_tenant_make_id')
            new_specialist_magic_link.tenant_make_id = session.get('tenant').get('default_tenant_make_id')

        db.session.add(new_specialist_magic_link)
        db.session.add(new_spec_ml_tenant)
@@ -62,6 +62,7 @@ def edit_partner(partner_id):
            update_logging_information(partner, dt.now(tz.utc))
            db.session.commit()
            flash('Partner updated successfully.', 'success')
            refresh_session_partner(partner.id)
            return redirect(
                prefixed_url_for('partner_bp.edit_partner',
                                 partner_id=partner.id))  # Assuming there's a user profile view to redirect to
@@ -197,6 +198,7 @@ def edit_partner_service(partner_service_id):
            db.session.commit()
            flash('Partner Service updated successfully.', 'success')
            current_app.logger.info(f"Partner Service {partner_service.name} updated successfully! ")
            refresh_session_partner(partner_id)
        except SQLAlchemyError as e:
            db.session.rollback()
            flash(f'Failed to update Partner Service: {str(e)}', 'danger')
@@ -339,4 +341,7 @@ def add_partner_service_for_tenant(partner_service_id):
    return redirect(prefixed_url_for('partner_bp.partner_services'))


def refresh_session_partner(partner_id):
    if session.get('partner', None):
        if partner_id == session['partner']['id']:
            session['partner'] = Partner.query.get_or_404(partner_id).to_dict()
@@ -196,6 +196,14 @@ class TenantMakeForm(DynamicFormBase):
    active = BooleanField('Active', validators=[Optional()], default=True)
    website = StringField('Website', validators=[DataRequired(), Length(max=255)])
    logo_url = StringField('Logo URL', validators=[Optional(), Length(max=255)])
    allowed_languages = SelectMultipleField('Allowed Languages', choices=[], validators=[Optional()])

    def __init__(self, *args, **kwargs):
        super(TenantMakeForm, self).__init__(*args, **kwargs)
        # Initialise the language options with language codes and flags
        lang_details = current_app.config['SUPPORTED_LANGUAGE_DETAILS']
        self.allowed_languages.choices = [(details['iso 639-1'], f"{details['flag']} {details['iso 639-1']}")
                                          for name, details in lang_details.items()]


class EditTenantMakeForm(DynamicFormBase):
    id = IntegerField('ID', widget=HiddenInput())
@@ -204,6 +212,14 @@ class EditTenantMakeForm(DynamicFormBase):
    active = BooleanField('Active', validators=[Optional()], default=True)
    website = StringField('Website', validators=[DataRequired(), Length(max=255)])
    logo_url = StringField('Logo URL', validators=[Optional(), Length(max=255)])
    allowed_languages = SelectMultipleField('Allowed Languages', choices=[], validators=[Optional()])

    def __init__(self, *args, **kwargs):
        super(EditTenantMakeForm, self).__init__(*args, **kwargs)
        # Initialise the language options with language codes and flags
        lang_details = current_app.config['SUPPORTED_LANGUAGE_DETAILS']
        self.allowed_languages.choices = [(details['iso 639-1'], f"{details['flag']} {details['iso 639-1']}")
                                          for name, details in lang_details.items()]
@@ -655,6 +655,8 @@ def tenant_make():
        new_tenant_make.tenant_id = tenant_id
        customisation_options = form.get_dynamic_data("configuration")
        new_tenant_make.chat_customisation_options = json.dumps(customisation_options)
        # Process allowed_languages as an array
        new_tenant_make.allowed_languages = form.allowed_languages.data if form.allowed_languages.data else None
        set_logging_information(new_tenant_make, dt.now(tz.utc))

        try:
@@ -703,6 +705,10 @@ def edit_tenant_make(tenant_make_id):
    # Create form instance with the tenant make
    form = EditTenantMakeForm(request.form, obj=tenant_make)

    # Initialise the allowed_languages selection with the current values
    if tenant_make.allowed_languages:
        form.allowed_languages.data = tenant_make.allowed_languages

    customisation_config = cache_manager.customisations_config_cache.get_config("CHAT_CLIENT_CUSTOMISATION")
    form.add_dynamic_fields("configuration", customisation_config, tenant_make.chat_customisation_options)

@@ -710,6 +716,8 @@ def edit_tenant_make(tenant_make_id):
        # Update basic fields
        form.populate_obj(tenant_make)
        tenant_make.chat_customisation_options = form.get_dynamic_data("configuration")
        # Process allowed_languages as an array
        tenant_make.allowed_languages = form.allowed_languages.data if form.allowed_languages.data else None

        # Update logging information
        update_logging_information(tenant_make, dt.now(tz.utc))
@@ -131,18 +131,20 @@ export const ChatApp = {
        const historicalMessages = chatConfig.messages || [];

        if (historicalMessages.length > 0) {
            this.allMessages = historicalMessages.map(msg => {
                // Ensure a correctly formatted message object
                return {
                    id: this.messageIdCounter++,
                    content: typeof msg === 'string' ? msg : msg.content || '',
                    sender: msg.sender || 'ai',
                    type: msg.type || 'text',
                    timestamp: msg.timestamp || new Date().toISOString(),
                    formData: msg.formData || null,
                    status: msg.status || 'delivered'
                };
            });
            this.allMessages = historicalMessages
                .filter(msg => msg !== null && msg !== undefined) // Filter out null/undefined messages
                .map(msg => {
                    // Ensure a correctly formatted message object
                    return {
                        id: this.messageIdCounter++,
                        content: typeof msg === 'string' ? msg : (msg.content || ''),
                        sender: msg.sender || 'ai',
                        type: msg.type || 'text',
                        timestamp: msg.timestamp || new Date().toISOString(),
                        formData: msg.formData || null,
                        status: msg.status || 'delivered'
                    };
                });

            console.log(`Loaded ${this.allMessages.length} historical messages`);
        }
@@ -86,7 +86,13 @@ def chat(magic_link_code):
        session['chat_session_id'] = SpecialistServices.start_session()

    # Get customisation options with defaults
    customisation = get_default_chat_customisation(tenant_make.chat_customisation_options)
    current_app.logger.debug(f"Make Customisation Options: {tenant_make.chat_customisation_options}")
    try:
        customisation = get_default_chat_customisation(tenant_make.chat_customisation_options)
    except Exception as e:
        current_app.logger.error(f"Error processing customisation options: {str(e)}")
        # Fallback to default customisation
        customisation = get_default_chat_customisation(None)

    # Start a new chat session
    session['chat_session_id'] = SpecialistServices.start_session()
@@ -69,6 +69,9 @@ class SpecialistExecutor(CrewAIBaseSpecialistExecutor):
            self.role_definition_crew
        )

    def _config_state_result_relations(self):
        pass

    def execute(self, arguments: SpecialistArguments, formatted_context, citations) -> SpecialistResult:
        self.log_tuning("Traicie Role Definition Specialist execution started", {})
@@ -1,5 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor
from . import audio_processor, html_processor, pdf_processor, markdown_processor, docx_processor, automagic_html_processor

# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor']
__all__ = ['audio_processor', 'html_processor', 'pdf_processor', 'markdown_processor', 'docx_processor', 'automagic_html_processor']
65
eveai_workers/processors/automagic_html_processor.py
Normal file
@@ -0,0 +1,65 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough

from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm, get_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry


class AutomagicHTMLProcessor(BaseProcessor):
    def __init__(self, tenant, document_version, catalog, processor):
        super().__init__(tenant, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

        self.prompt_params = {
            "custom_instructions": self.processor.configuration.get("custom_instructions", ""),
        }
        template, llm = get_template("automagic_html_parse")

        translation_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        self.chain = (setup | translation_prompt | llm | output_parser)

    def process(self):
        self._log("Starting Automagic HTML processing")
        try:
            # Get HTML-file data
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            # Invoke HTML Processing Agent
            self.prompt_params["html"] = file_data
            with current_event.create_span("Markdown Generation"):
                markdown = self.chain.invoke(self.prompt_params)
            self._save_markdown(markdown)

            # Retrieve Title
            match = re.search(r'^# (.+)', markdown, re.MULTILINE)
            title = match.group(1).strip() if match else None

            self._log("Finished Automagic HTML Processing")
            return markdown, title
        except Exception as e:
            self._log(f"Error automagically processing HTML: {str(e)}", level='error')
            raise


# Register the processor
ProcessorRegistry.register("AUTOMAGIC_HTML_PROCESSOR", AutomagicHTMLProcessor)
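For orientation, a minimal sketch of how this new processor is resolved and invoked at embedding time, based on the registry and task wiring visible elsewhere in this changeset (the helper name run_processor is hypothetical):

# Sketch only: 'processor' is the Processor row selected by get_processor_for_document();
# for an HTML document handled by this class, processor.type would be "AUTOMAGIC_HTML_PROCESSOR".
from eveai_workers.processors.processor_registry import ProcessorRegistry

def run_processor(tenant, document_version, catalog, processor):
    processor_class = ProcessorRegistry.get_processor_class(processor.type)
    document_processor = processor_class(
        tenant=tenant,
        document_version=document_version,
        catalog=catalog,
        processor=processor,
    )
    # process() returns the generated markdown plus the extracted title
    markdown, title = document_processor.process()
    return markdown, title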
@@ -44,185 +44,6 @@ class PDFProcessor(BaseProcessor):
            self._log(f"Error processing PDF: {str(e)}", level='error')
            raise

    def _extract_content(self, file_data):
        extracted_content = []
        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
            figure_counter = 1
            for page_num, page in enumerate(pdf.pages):
                self._log(f"Extracting content from page {page_num + 1}")
                page_content = {
                    'text': page.extract_text(),
                    'figures': self._extract_figures(page, page_num, figure_counter),
                    'tables': self._extract_tables(page)
                }
                self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
                figure_counter += len(page_content['figures'])
                extracted_content.append(page_content)

        return extracted_content

    def _extract_figures(self, page, page_num, figure_counter):
        figures = []
        # Omit figure processing for now!
        # for img in page.images:
        #     try:
        #         # Try to get the bbox, use full page dimensions if not available
        #         bbox = img.get('bbox', (0, 0, page.width, page.height))
        #
        #         figure = {
        #             'figure_number': figure_counter,
        #             'filename': f"figure_{page_num + 1}_{figure_counter}.png",
        #             'caption': self._find_figure_caption(page, bbox)
        #         }
        #
        #         # Extract the figure as an image
        #         figure_image = page.within_bbox(bbox).to_image()
        #
        #         # Save the figure using MinIO
        #         with io.BytesIO() as output:
        #             figure_image.save(output, format='PNG')
        #             output.seek(0)
        #             minio_client.upload_document_file(
        #                 self.tenant.id,
        #                 self.document_version.doc_id,
        #                 self.document_version.language,
        #                 self.document_version.id,
        #                 figure['filename'],
        #                 output.getvalue()
        #             )
        #
        #         figures.append(figure)
        #         figure_counter += 1
        #     except Exception as e:
        #         self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')

        return figures

    def _find_figure_caption(self, page, bbox):
        try:
            # Look for text below the figure
            caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
            caption_text = page.crop(caption_bbox).extract_text()
            if caption_text and caption_text.lower().startswith('figure'):
                return caption_text
        except Exception as e:
            self._log(f"Error finding figure caption: {str(e)}", level='error')
        return None

    def _extract_tables(self, page):
        tables = []
        try:
            for table in page.extract_tables():
                if table:
                    markdown_table = self._table_to_markdown(table)
                    if markdown_table:  # Only add non-empty tables
                        tables.append(markdown_table)
                        self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
        except Exception as e:
            self._log(f"Error extracting tables from page: {str(e)}", level='error')
        return tables

    def _table_to_markdown(self, table):
        if not table or not table[0]:  # Check if table is empty or first row is empty
            return ""  # Return empty string for empty tables

        def clean_cell(cell):
            if cell is None:
                return ""  # Convert None to empty string
            return str(cell).replace("|", "\\|")  # Escape pipe characters and convert to string

        header = [clean_cell(cell) for cell in table[0]]
        markdown = "| " + " | ".join(header) + " |\n"
        markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"

        for row in table[1:]:
            cleaned_row = [clean_cell(cell) for cell in row]
            markdown += "| " + " | ".join(cleaned_row) + " |\n"

        return markdown

    def _structure_content(self, extracted_content):
        structured_content = ""
        title = "Untitled Document"
        current_heading_level = 0
        heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')

        def identify_heading(text):
            match = heading_pattern.match(text.strip())
            if match:
                numbering, _, content = match.groups()
                if numbering:
                    level = numbering.count('.') + 1
                    return level, f"{numbering}{content}"
                else:
                    return 1, content  # Assume it's a top-level heading if no numbering
            return 0, text  # Not a heading

        for page in extracted_content:
            # Assume the title is on the first page
            if page == extracted_content[0]:
                lines = page.get('text', '').split('\n')
                if lines:
                    title = lines[0].strip()  # Use the first non-empty line as the title

            # Process text
            paragraphs = page['text'].split('\n\n')

            for para in paragraphs:
                lines = para.strip().split('\n')
                if len(lines) == 1:  # Potential heading
                    level, text = identify_heading(lines[0])
                    if level > 0:
                        heading_marks = '#' * level
                        structured_content += f"\n\n{heading_marks} {text}\n\n"
                        if level == 1 and not title:
                            title = text  # Use the first top-level heading as the title if not set
                    else:
                        structured_content += f"{para}\n\n"  # Treat as normal paragraph
                else:
                    structured_content += f"{para}\n\n"  # Multi-line paragraph

            # Process figures
            for figure in page.get('figures', []):
                structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
                if figure['caption']:
                    structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"

            # Add tables
            if 'tables' in page:
                for table in page['tables']:
                    structured_content += f"\n{table}\n"

        if self.tuning:
            self._save_intermediate(structured_content, "structured_content.md")

        return structured_content, title

    def _split_content_for_llm(self, content):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(content)

    def _process_chunks_with_llm(self, chunks):
        template, llm = get_template('pdf_parse')
        pdf_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | pdf_prompt | llm | output_parser

        markdown_chunks = []
        for chunk in chunks:
            input = {"pdf_content": chunk}
            result = chain.invoke(input)
            result = self._clean_markdown(result)
            markdown_chunks.append(result)

        return "\n\n".join(markdown_chunks)


# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
@@ -11,6 +11,7 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
import traceback

from common.extensions import db, cache_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
@@ -24,7 +25,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry

from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel, EveAINoContentFound, EveAIUnsupportedFileType, \
    EveAINoProcessorFound

from common.utils.config_field_types import json_to_pattern_list

@@ -58,8 +60,8 @@ def create_embeddings(tenant_id, document_version_id):
        catalog = Catalog.query.get_or_404(catalog_id)

        # Define processor related information
        processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
        processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
        processor_class = ProcessorRegistry.get_processor_class(processor.type)

    except Exception as e:
        current_app.logger.error(f'Create Embeddings request received '
@@ -95,7 +97,7 @@ def create_embeddings(tenant_id, document_version_id):
    delete_embeddings_for_document_version(document_version)

    try:
        with current_event.create_span(f"{processor_type} Processing"):
        with current_event.create_span(f"{processor.type} Processing"):
            document_processor = processor_class(
                tenant=tenant,
                document_version=document_version,
@@ -107,6 +109,8 @@ def create_embeddings(tenant_id, document_version_id):
                'markdown': markdown,
                'title': title
            })
            if not markdown or markdown.strip() == '':
                raise EveAINoContentFound(document_version.doc_id, document_version.id)

        with current_event.create_span("Embedding"):
            embed_markdown(tenant, document_version, catalog, document_processor, markdown, title)
@@ -114,9 +118,11 @@ def create_embeddings(tenant_id, document_version_id):
        current_event.log("Finished Embedding Creation Task")

    except Exception as e:
        stacktrace = traceback.format_exc()
        current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
                                 f'on document version {document_version_id} '
                                 f'error: {e}')
                                 f'on document version {document_version_id} '
                                 f'error: {e}\n'
                                 f'Stacktrace: {stacktrace}')
        document_version.processing = False
        document_version.processing_finished_at = dt.now(tz.utc)
        document_version.processing_error = str(e)[:255]
@@ -624,25 +630,9 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
        ValueError: If no matching processor is found
    """
    try:
        current_app.logger.debug(f"Getting processor for catalog {catalog_id}, file type {file_type}, file sub_type {sub_file_type} ")
        # Start with base query for catalog
        query = Processor.query.filter_by(catalog_id=catalog_id)

        # Find processor type that handles this file type
        matching_processor_type = None
        for proc_type, config in PROCESSOR_TYPES.items():
            supported_types = config['file_types']
            if isinstance(supported_types, str):
                supported_types = [t.strip() for t in supported_types.split(',')]

            if file_type in supported_types:
                matching_processor_type = proc_type
                break

        if not matching_processor_type:
            raise ValueError(f"No processor type found for file type: {file_type}")

        # Add processor type condition
        query = query.filter_by(type=matching_processor_type)
        query = Processor.query.filter_by(catalog_id=catalog_id).filter_by(active=True)

        # If sub_file_type is provided, add that condition
        if sub_file_type:
@@ -651,22 +641,44 @@ def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: s
            # If no sub_file_type, prefer processors without sub_file_type specification
            query = query.filter(or_(Processor.sub_file_type.is_(None),
                                     Processor.sub_file_type == ''))

        available_processors = query.all()

        # Get the first matching processor
        processor = query.first()
        if not available_processors:
            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
        available_processor_types = [processor.type for processor in available_processors]
        current_app.logger.debug(f"Available processors for catalog {catalog_id}: {available_processor_types}")

        # Find processor type that handles this file type
        matching_processor_type = None
        for proc_type, config in PROCESSOR_TYPES.items():
            # Only consider this processor type if it is available in the database
            if proc_type in available_processor_types:
                supported_types = config['file_types']
                if isinstance(supported_types, str):
                    supported_types = [t.strip() for t in supported_types.split(',')]
                current_app.logger.debug(f"Supported types for processor type {proc_type}: {supported_types}")

                if file_type in supported_types:
                    matching_processor_type = proc_type
                    break

        current_app.logger.debug(f"Processor type found for catalog {catalog_id}, file type {file_type}: {matching_processor_type}")
        if not matching_processor_type:
            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)
        else:
            current_app.logger.debug(f"Processor type found for file type: {file_type}: {matching_processor_type}")

        processor = None
        for proc in available_processors:
            if proc.type == matching_processor_type:
                processor = proc
                break

        if not processor:
            if sub_file_type:
                raise ValueError(
                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
                    f"file type {file_type}, sub-type {sub_file_type}"
                )
            else:
                raise ValueError(
                    f"No processor found for catalog {catalog_id}, "
                    f"file type {file_type}"
                )
            raise EveAINoProcessorFound(catalog_id, file_type, sub_file_type)

        current_app.logger.debug(f"Processor found for catalog {catalog_id}, file type {file_type}: {processor}")
        return processor

    except Exception as e:
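The selection above keys off PROCESSOR_TYPES; a hypothetical illustration of the shape that lookup relies on (the real entries live in config/type_defs/processor_types.py and are not part of this compare view):

# Hypothetical PROCESSOR_TYPES shape, inferred from the lookup code above;
# 'file_types' may be either a comma-separated string or a list.
PROCESSOR_TYPES = {
    "PDF_PROCESSOR": {"file_types": "pdf"},
    "HTML_PROCESSOR": {"file_types": "html, htm"},
    "AUTOMAGIC_HTML_PROCESSOR": {"file_types": ["html"]},
}
# With the new active flag, a catalog whose only active HTML-capable row is of type
# AUTOMAGIC_HTML_PROCESSOR will now resolve "html" documents to that processor.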
@@ -0,0 +1,53 @@
"""Add allowed_languages to TenantMake, introduce TranslationCache

Revision ID: e47dc002b678
Revises: 83d4e90f87c6
Create Date: 2025-06-26 13:43:43.719865

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'e47dc002b678'
down_revision = '83d4e90f87c6'
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('translation_cache',
                    sa.Column('cache_key', sa.String(length=16), nullable=False),
                    sa.Column('source_text', sa.Text(), nullable=False),
                    sa.Column('translated_text', sa.Text(), nullable=False),
                    sa.Column('source_language', sa.String(length=2), nullable=False),
                    sa.Column('target_language', sa.String(length=2), nullable=False),
                    sa.Column('context', sa.Text(), nullable=True),
                    sa.Column('prompt_tokens', sa.Integer(), nullable=False),
                    sa.Column('completion_tokens', sa.Integer(), nullable=False),
                    sa.Column('created_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False),
                    sa.Column('created_by', sa.Integer(), nullable=True),
                    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('now()'), nullable=False),
                    sa.Column('updated_by', sa.Integer(), nullable=True),
                    sa.Column('last_used_at', sa.DateTime(), nullable=True),
                    sa.ForeignKeyConstraint(['created_by'], ['public.user.id'], ),
                    sa.ForeignKeyConstraint(['updated_by'], ['public.user.id'], ),
                    sa.PrimaryKeyConstraint('cache_key'),
                    schema='public'
                    )

    with op.batch_alter_table('tenant_make', schema=None) as batch_op:
        batch_op.add_column(sa.Column('allowed_languages', postgresql.ARRAY(sa.String(length=2)), nullable=True))

    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('tenant_make', schema=None) as batch_op:
        batch_op.drop_column('allowed_languages')

    op.drop_table('translation_cache', schema='public')
    # ### end Alembic commands ###
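For reference, a sketch of a SQLAlchemy model matching the translation_cache table created above. The actual TranslationCache model is not included in this compare view, so the class below is an assumption derived purely from the migration; the 16-character cache_key would fit, for example, a 64-bit xxhash hex digest of the source text, languages and context (xxhash is added to requirements.txt in this changeset), but the real key derivation is not shown.

# Assumed model mirroring the migration above; names and defaults are inferred, not confirmed.
class TranslationCache(db.Model):
    __tablename__ = 'translation_cache'
    __table_args__ = {'schema': 'public'}

    cache_key = db.Column(db.String(16), primary_key=True)
    source_text = db.Column(db.Text, nullable=False)
    translated_text = db.Column(db.Text, nullable=False)
    source_language = db.Column(db.String(2), nullable=False)
    target_language = db.Column(db.String(2), nullable=False)
    context = db.Column(db.Text, nullable=True)
    prompt_tokens = db.Column(db.Integer, nullable=False)
    completion_tokens = db.Column(db.Integer, nullable=False)
    created_at = db.Column(db.DateTime, server_default=db.text('now()'), nullable=False)
    created_by = db.Column(db.Integer, db.ForeignKey('public.user.id'), nullable=True)
    updated_at = db.Column(db.DateTime, server_default=db.text('now()'), nullable=False)
    updated_by = db.Column(db.Integer, db.ForeignKey('public.user.id'), nullable=True)
    last_used_at = db.Column(db.DateTime, nullable=True)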
@@ -72,7 +72,8 @@ def get_public_table_names():
    # TODO: This function should include the necessary functionality to automatically retrieve table names
    return ['role', 'roles_users', 'tenant', 'user', 'tenant_domain','license_tier', 'license', 'license_usage',
            'business_event_log', 'tenant_project', 'partner', 'partner_service', 'invoice', 'license_period',
            'license_change_log', 'partner_service_license_tier', 'payment', 'partner_tenant']
            'license_change_log', 'partner_service_license_tier', 'payment', 'partner_tenant', 'tenant_make',
            'specialist_magic_link_tenant']


PUBLIC_TABLES = get_public_table_names()
logger.info(f"Public tables: {PUBLIC_TABLES}")
@@ -0,0 +1,30 @@
"""Add Active Flag to Processor

Revision ID: b1647f31339a
Revises: 2b6ae6cc923e
Create Date: 2025-06-25 12:34:35.391516

"""
from alembic import op
import sqlalchemy as sa
import pgvector
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'b1647f31339a'
down_revision = '2b6ae6cc923e'
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column('processor', sa.Column('active', sa.Boolean(), nullable=True))
    op.execute("UPDATE processor SET active = true")
    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column('processor', 'active')
    # ### end Alembic commands ###
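Side note on the backfill above: the column is added as nullable and existing rows are set to true with an explicit UPDATE. An equivalent single-step alternative (a sketch only, not what this changeset does) would rely on a server-side default:

# Alternative sketch: add the column with a server default so existing and
# new rows receive true without a separate UPDATE statement.
op.add_column('processor', sa.Column('active', sa.Boolean(), nullable=True, server_default=sa.true()))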
@@ -94,4 +94,5 @@ scaleway~=2.9.0
html2text~=2025.4.15
markdown~=3.8
python-json-logger~=2.0.7
qrcode[pil]==8.2
xxhash~=3.5.0