- Introduction of the Automatic HTML Processor

- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API-keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
This commit is contained in:
Josako
2025-06-26 14:38:40 +02:00
parent f5c9542a49
commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -201,8 +201,3 @@ def register_cache_handlers(app):
register_specialist_cache_handlers(cache_manager)
from common.utils.cache.license_cache import register_license_cache_handlers
register_license_cache_handlers(cache_manager)

View File

@@ -4,13 +4,13 @@
{% block title %}Document Versions{% endblock %}
{% block content_title %}Document Versions{% endblock %}
{% block content_description %}View Versions for {{ document }}{% endblock %}
{% block content_description %}View Versions for Document <b>{{ document }}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_document_version_selection') }}" id="documentVersionsForm">
{{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
{{ render_selectable_table(headers=["ID", "File Type", "File Size", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
<div class="form-group mt-3 d-flex justify-content-between">
<div>
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>

View File

@@ -4,14 +4,13 @@
{% block title %}Documents{% endblock %}
{% block content_title %}Documents{% endblock %}
{% block content_description %}View Documents for Tenant{% endblock %}
{% block content_description %}View Documents for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}
{% block content %}
<!-- Filter Form -->
{% set filter_form %}
<form method="GET" action="{{ url_for('document_bp.documents') }}">
{{ render_filter_field('catalog_id', 'Catalog', filter_options['catalog_id'], filters.get('catalog_id', [])) }}
{{ render_filter_field('validity', 'Validity', filter_options['validity'], filters.get('validity', [])) }}
<button type="submit" class="btn btn-primary">Apply Filters</button>
@@ -27,7 +26,6 @@
headers=[
{"text": "ID", "sort": "id"},
{"text": "Name", "sort": "name"},
{"text": "Catalog", "sort": "catalog_name"},
{"text": "Valid From", "sort": "valid_from"},
{"text": "Valid To", "sort": "valid_to"}
],

View File

@@ -4,7 +4,7 @@
{% block title %}Edit Processor{% endblock %}
{% block content_title %}Edit Processor{% endblock %}
{% block content_description %}Edit a Processor (for a Catalog){% endblock %}
{% block content_description %}Edit Processor for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content %}
<form method="post">

View File

@@ -4,7 +4,7 @@
{% block title %}Edit Retriever{% endblock %}
{% block content_title %}Edit Retriever{% endblock %}
{% block content_description %}Edit a Retriever (for a Catalog){% endblock %}
{% block content_description %}Edit a Retriever for catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content %}
<form method="post">

View File

@@ -4,7 +4,7 @@
{% block title %}Processor Registration{% endblock %}
{% block content_title %}Register Processor{% endblock %}
{% block content_description %}Define a new processor (for a catalog){% endblock %}
{% block content_description %}Define a new processor for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content %}
<form method="post">

View File

@@ -4,13 +4,13 @@
{% block title %}Processors{% endblock %}
{% block content_title %}Processors{% endblock %}
{% block content_description %}View Processors for Tenant{% endblock %}
{% block content_description %}View Processors for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_processor_selection') }}" id="processorsForm">
{{ render_selectable_table(headers=["Processor ID", "Name", "Type", "Catalog ID"], rows=rows, selectable=True, id="retrieversTable") }}
{{ render_selectable_table(headers=["Processor ID", "Name", "Type", "Active"], rows=rows, selectable=True, id="retrieversTable") }}
<div class="form-group mt-3 d-flex justify-content-between">
<div>
<button type="submit" name="action" value="edit_processor" class="btn btn-primary" onclick="return validateTableSelection('processorsForm')">Edit Processor</button>

View File

@@ -4,7 +4,7 @@
{% block title %}Retriever Registration{% endblock %}
{% block content_title %}Register Retriever{% endblock %}
{% block content_description %}Define a new retriever (for a catalog){% endblock %}
{% block content_description %}Define a new retriever for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content %}
<form method="post">

View File

@@ -4,13 +4,13 @@
{% block title %}Retrievers{% endblock %}
{% block content_title %}Retrievers{% endblock %}
{% block content_description %}View Retrievers for Tenant{% endblock %}
{% block content_description %}View Retrievers for Catalog <b>{% if session.catalog_name %}{{ session.catalog_name }}{% else %}No Catalog{% endif %}</b>{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_retriever_selection') }}" id="retrieversForm">
{{ render_selectable_table(headers=["Retriever ID", "Name", "Type", "Catalog ID"], rows=rows, selectable=True, id="retrieversTable") }}
{{ render_selectable_table(headers=["Retriever ID", "Name", "Type"], rows=rows, selectable=True, id="retrieversTable") }}
<div class="form-group mt-3 d-flex justify-content-between">
<div>
<button type="submit" name="action" value="edit_retriever" class="btn btn-primary" onclick="return validateTableSelection('retrieversForm')">Edit Retriever</button>

View File

@@ -71,15 +71,6 @@ class ProcessorForm(FlaskForm):
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
description = TextAreaField('Description', validators=[Optional()])
# Catalog for the Retriever
catalog = QuerySelectField(
'Catalog ID',
query_factory=lambda: Catalog.query.all(),
allow_blank=True,
get_label='name',
validators=[DataRequired()],
)
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
type = SelectField('Processor Type', validators=[DataRequired()])
@@ -89,6 +80,7 @@ class ProcessorForm(FlaskForm):
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
default=3000)
active = BooleanField('Active', default=True)
tuning = BooleanField('Enable Embedding Tuning', default=False)
# Metadata fields
@@ -108,14 +100,6 @@ class EditProcessorForm(DynamicFormBase):
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
description = TextAreaField('Description', validators=[Optional()])
# Catalog for the Retriever
catalog = QuerySelectField(
'Catalog ID',
query_factory=lambda: Catalog.query.all(),
allow_blank=True,
get_label='name',
validators=[Optional()],
)
type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
@@ -124,6 +108,7 @@ class EditProcessorForm(DynamicFormBase):
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
default=3000)
active = BooleanField('Active', default=True)
tuning = BooleanField('Enable Embedding Tuning', default=False)
# Metadata fields
@@ -134,14 +119,7 @@ class EditProcessorForm(DynamicFormBase):
class RetrieverForm(FlaskForm):
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
description = TextAreaField('Description', validators=[Optional()])
# Catalog for the Retriever
catalog = QuerySelectField(
'Catalog ID',
query_factory=lambda: Catalog.query.all(),
allow_blank=True,
get_label='name',
validators=[Optional()],
)
# Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
type = SelectField('Retriever Type', validators=[DataRequired()])
tuning = BooleanField('Enable Tuning', default=False)
@@ -160,14 +138,7 @@ class RetrieverForm(FlaskForm):
class EditRetrieverForm(DynamicFormBase):
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
description = TextAreaField('Description', validators=[Optional()])
# Catalog for the Retriever
catalog = QuerySelectField(
'Catalog ID',
query_factory=lambda: Catalog.query.all(),
allow_blank=True,
get_label='name',
validators=[Optional()],
)
# Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})
tuning = BooleanField('Enable Tuning', default=False)

View File

@@ -1,5 +1,5 @@
from datetime import datetime
from flask import request, render_template, session
from datetime import datetime as dt, timezone as tz
from flask import request, render_template, session, current_app
from sqlalchemy import desc, asc, or_, and_, cast, Integer
from common.models.document import Document, Catalog
from common.utils.filtered_list_view import FilteredListView
@@ -7,31 +7,19 @@ from common.utils.view_assistants import prepare_table_for_macro
class DocumentListView(FilteredListView):
allowed_filters = ['catalog_id', 'validity']
allowed_sorts = ['id', 'name', 'catalog_name', 'valid_from', 'valid_to']
allowed_filters = ['validity']
allowed_sorts = ['id', 'name', 'valid_from', 'valid_to']
def get_query(self):
return Document.query.join(Catalog).add_columns(
Document.id,
Document.name,
Catalog.name.label('catalog_name'),
Document.valid_from,
Document.valid_to
)
catalog_id = session.get('catalog_id')
current_app.logger.debug(f"Catalog ID: {catalog_id}")
return Document.query.filter_by(catalog_id=catalog_id)
def apply_filters(self, query):
filters = request.args.to_dict(flat=False)
if 'catalog_id' in filters:
catalog_ids = filters['catalog_id']
if catalog_ids:
# Convert catalog_ids to a list of integers
catalog_ids = [int(cid) for cid in catalog_ids if cid.isdigit()]
if catalog_ids:
query = query.filter(Document.catalog_id.in_(catalog_ids))
if 'validity' in filters:
now = datetime.utcnow().date()
now = dt.now(tz.utc).date()
if 'valid' in filters['validity']:
query = query.filter(
and_(
@@ -47,10 +35,7 @@ class DocumentListView(FilteredListView):
sort_order = request.args.get('sort_order', 'asc')
if sort_by in self.allowed_sorts:
if sort_by == 'catalog_name':
column = Catalog.name
else:
column = getattr(Document, sort_by)
column = getattr(Document, sort_by)
if sort_order == 'asc':
query = query.order_by(asc(column))
@@ -61,42 +46,39 @@ class DocumentListView(FilteredListView):
def get(self):
query = self.get_query()
query = self.apply_filters(query)
query = self.apply_sorting(query)
# query = self.apply_filters(query)
# query = self.apply_sorting(query)
pagination = self.paginate(query)
def format_date(date):
if isinstance(date, datetime):
if isinstance(date, dt):
return date.strftime('%Y-%m-%d')
elif isinstance(date, str):
return date
else:
return ''
current_app.logger.debug(f"Items retrieved: {pagination.items}")
rows = [
[
{'value': item.id, 'class': '', 'type': 'text'},
{'value': item.name, 'class': '', 'type': 'text'},
{'value': item.catalog_name, 'class': '', 'type': 'text'},
{'value': format_date(item.valid_from), 'class': '', 'type': 'text'},
{'value': format_date(item.valid_to), 'class': '', 'type': 'text'}
] for item in pagination.items
]
catalogs = Catalog.query.all()
context = {
'rows': rows,
'pagination': pagination,
'filters': request.args.to_dict(flat=False),
'sort_by': request.args.get('sort_by', 'id'),
'sort_order': request.args.get('sort_order', 'asc'),
'filter_options': self.get_filter_options(catalogs)
'filter_options': self.get_filter_options()
}
return render_template(self.template, **context)
def get_filter_options(self, catalogs):
def get_filter_options(self):
return {
'catalog_id': [(str(cat.id), cat.name) for cat in catalogs],
'validity': [('valid', 'Valid'), ('all', 'All')]
}
}

View File

@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager, minio_client
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document, clean_url
edit_document_version, refresh_document, clean_url, is_file_type_supported_by_catalog
from common.utils.dynamic_field_utils import create_default_config_from_type_config
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
EveAIDoubleURLException, EveAIException
@@ -110,7 +110,6 @@ def handle_catalog_selection():
current_app.logger.info(f'Setting session catalog to {catalog.name}')
session['catalog_id'] = catalog_id
session['catalog_name'] = catalog.name
current_app.logger.info(f'Finished setting session catalog to {catalog.name}')
elif action == 'edit_catalog':
return redirect(prefixed_url_for('document_bp.edit_catalog', catalog_id=catalog_id))
@@ -157,7 +156,7 @@ def processor():
tenant_id = session.get('tenant').get('id')
new_processor = Processor()
form.populate_obj(new_processor)
new_processor.catalog_id = form.catalog.data.id
new_processor.catalog_id = session.get('catalog_id')
processor_config = cache_manager.processors_config_cache.get_config(new_processor.type)
new_processor.configuration = create_default_config_from_type_config(
processor_config["configuration"])
@@ -204,9 +203,6 @@ def edit_processor(processor_id):
form.populate_obj(processor)
processor.configuration = form.get_dynamic_data('configuration')
# Update catalog relationship
processor.catalog_id = form.catalog.data.id if form.catalog.data else None
# Update logging information
update_logging_information(processor, dt.now(tz.utc))
@@ -235,14 +231,19 @@ def processors():
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
query = Processor.query.order_by(Processor.id)
catalog_id = session.get('catalog_id', None)
if not catalog_id:
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
return redirect(prefixed_url_for('document_bp.catalogs'))
query = Processor.query.filter_by(catalog_id=catalog_id).order_by(Processor.id)
pagination = query.paginate(page=page, per_page=per_page)
the_processors = pagination.items
# prepare table data
rows = prepare_table_for_macro(the_processors,
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
[('id', ''), ('name', ''), ('type', ''), ('active', '')])
# Render the catalogs in a template
return render_template('document/processors.html', rows=rows, pagination=pagination)
@@ -272,7 +273,7 @@ def retriever():
tenant_id = session.get('tenant').get('id')
new_retriever = Retriever()
form.populate_obj(new_retriever)
new_retriever.catalog_id = form.catalog.data.id
new_retriever.catalog_id = session.get('catalog_id')
new_retriever.type_version = cache_manager.retrievers_version_tree_cache.get_latest_version(
new_retriever.type)
@@ -301,12 +302,6 @@ def edit_retriever(retriever_id):
# Get the retriever or return 404
retriever = Retriever.query.get_or_404(retriever_id)
if retriever.catalog_id:
# If catalog_id is just an ID, fetch the Catalog object
retriever.catalog = Catalog.query.get(retriever.catalog_id)
else:
retriever.catalog = None
# Create form instance with the retriever
form = EditRetrieverForm(request.form, obj=retriever)
@@ -319,9 +314,6 @@ def edit_retriever(retriever_id):
form.populate_obj(retriever)
retriever.configuration = form.get_dynamic_data('configuration')
# Update catalog relationship
retriever.catalog_id = form.catalog.data.id if form.catalog.data else None
# Update logging information
update_logging_information(retriever, dt.now(tz.utc))
@@ -350,14 +342,19 @@ def retrievers():
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
query = Retriever.query.order_by(Retriever.id)
catalog_id = session.get('catalog_id', None)
if not catalog_id:
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
return redirect(prefixed_url_for('document_bp.catalogs'))
query = Retriever.query.filter_by(catalog_id=catalog_id).order_by(Retriever.id)
pagination = query.paginate(page=page, per_page=per_page)
the_retrievers = pagination.items
# prepare table data
rows = prepare_table_for_macro(the_retrievers,
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
[('id', ''), ('name', ''), ('type', '')])
# Render the catalogs in a template
return render_template('document/retrievers.html', rows=rows, pagination=pagination)
@@ -400,6 +397,8 @@ def add_document():
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
is_file_type_supported_by_catalog(catalog_id, extension)
catalog_properties = form.get_dynamic_data("tagging_fields")
api_input = {
@@ -451,6 +450,8 @@ def add_url():
file_content, filename, extension = process_url(url, tenant_id)
is_file_type_supported_by_catalog(catalog_id, extension)
catalog_properties = {}
full_config = cache_manager.catalogs_config_cache.get_config(catalog.type)
document_version_configurations = full_config['document_version_configurations']
@@ -489,6 +490,11 @@ def add_url():
@document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def documents():
catalog_id = session.get('catalog_id', None)
if not catalog_id:
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
return redirect(prefixed_url_for('document_bp.catalogs'))
view = DocumentListView(Document, 'document/documents.html', per_page=10)
return view.get()
@@ -609,7 +615,7 @@ def edit_document_version_view(document_version_id):
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def document_versions(document_id):
doc = Document.query.get_or_404(document_id)
doc_desc = f'Document {doc.name}'
doc_desc = f'{doc.name}'
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
@@ -621,8 +627,7 @@ def document_versions(document_id):
pagination = query.paginate(page=page, per_page=per_page, error_out=False)
doc_langs = pagination.items
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('url', ''),
('object_name', ''), ('file_type', ''),
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('file_type', ''), ('file_size', ''),
('processing', ''), ('processing_started_at', ''),
('processing_finished_at', ''), ('processing_error', '')])

View File

@@ -328,6 +328,16 @@ class DynamicFormBase(FlaskForm):
initial_data: Optional initial data for the fields
"""
current_app.logger.debug(f"Adding dynamic fields for collection {collection_name} with config: {config}")
if isinstance(initial_data, str):
try:
initial_data = json.loads(initial_data)
except (json.JSONDecodeError, TypeError):
current_app.logger.error(f"Invalid JSON in initial_data: {initial_data}")
initial_data = {}
elif initial_data is None:
initial_data = {}
# Store the full configuration for later use in get_list_type_configs_js
if not hasattr(self, '_full_configs'):
self._full_configs = {}