- Introduction of the Automatic HTML Processor

- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API-keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
This commit is contained in:
Josako
2025-06-26 14:38:40 +02:00
parent f5c9542a49
commit fda267b479
35 changed files with 551 additions and 356 deletions

View File

@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager, minio_client
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document, clean_url
edit_document_version, refresh_document, clean_url, is_file_type_supported_by_catalog
from common.utils.dynamic_field_utils import create_default_config_from_type_config
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
EveAIDoubleURLException, EveAIException
@@ -110,7 +110,6 @@ def handle_catalog_selection():
current_app.logger.info(f'Setting session catalog to {catalog.name}')
session['catalog_id'] = catalog_id
session['catalog_name'] = catalog.name
current_app.logger.info(f'Finished setting session catalog to {catalog.name}')
elif action == 'edit_catalog':
return redirect(prefixed_url_for('document_bp.edit_catalog', catalog_id=catalog_id))
@@ -157,7 +156,7 @@ def processor():
tenant_id = session.get('tenant').get('id')
new_processor = Processor()
form.populate_obj(new_processor)
new_processor.catalog_id = form.catalog.data.id
new_processor.catalog_id = session.get('catalog_id')
processor_config = cache_manager.processors_config_cache.get_config(new_processor.type)
new_processor.configuration = create_default_config_from_type_config(
processor_config["configuration"])
@@ -204,9 +203,6 @@ def edit_processor(processor_id):
form.populate_obj(processor)
processor.configuration = form.get_dynamic_data('configuration')
# Update catalog relationship
processor.catalog_id = form.catalog.data.id if form.catalog.data else None
# Update logging information
update_logging_information(processor, dt.now(tz.utc))
@@ -235,14 +231,19 @@ def processors():
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
query = Processor.query.order_by(Processor.id)
catalog_id = session.get('catalog_id', None)
if not catalog_id:
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
return redirect(prefixed_url_for('document_bp.catalogs'))
query = Processor.query.filter_by(catalog_id=catalog_id).order_by(Processor.id)
pagination = query.paginate(page=page, per_page=per_page)
the_processors = pagination.items
# prepare table data
rows = prepare_table_for_macro(the_processors,
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
[('id', ''), ('name', ''), ('type', ''), ('active', '')])
# Render the catalogs in a template
return render_template('document/processors.html', rows=rows, pagination=pagination)
@@ -272,7 +273,7 @@ def retriever():
tenant_id = session.get('tenant').get('id')
new_retriever = Retriever()
form.populate_obj(new_retriever)
new_retriever.catalog_id = form.catalog.data.id
new_retriever.catalog_id = session.get('catalog_id')
new_retriever.type_version = cache_manager.retrievers_version_tree_cache.get_latest_version(
new_retriever.type)
@@ -301,12 +302,6 @@ def edit_retriever(retriever_id):
# Get the retriever or return 404
retriever = Retriever.query.get_or_404(retriever_id)
if retriever.catalog_id:
# If catalog_id is just an ID, fetch the Catalog object
retriever.catalog = Catalog.query.get(retriever.catalog_id)
else:
retriever.catalog = None
# Create form instance with the retriever
form = EditRetrieverForm(request.form, obj=retriever)
@@ -319,9 +314,6 @@ def edit_retriever(retriever_id):
form.populate_obj(retriever)
retriever.configuration = form.get_dynamic_data('configuration')
# Update catalog relationship
retriever.catalog_id = form.catalog.data.id if form.catalog.data else None
# Update logging information
update_logging_information(retriever, dt.now(tz.utc))
@@ -350,14 +342,19 @@ def retrievers():
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
query = Retriever.query.order_by(Retriever.id)
catalog_id = session.get('catalog_id', None)
if not catalog_id:
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
return redirect(prefixed_url_for('document_bp.catalogs'))
query = Retriever.query.filter_by(catalog_id=catalog_id).order_by(Retriever.id)
pagination = query.paginate(page=page, per_page=per_page)
the_retrievers = pagination.items
# prepare table data
rows = prepare_table_for_macro(the_retrievers,
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
[('id', ''), ('name', ''), ('type', '')])
# Render the catalogs in a template
return render_template('document/retrievers.html', rows=rows, pagination=pagination)
@@ -400,6 +397,8 @@ def add_document():
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
is_file_type_supported_by_catalog(catalog_id, extension)
catalog_properties = form.get_dynamic_data("tagging_fields")
api_input = {
@@ -451,6 +450,8 @@ def add_url():
file_content, filename, extension = process_url(url, tenant_id)
is_file_type_supported_by_catalog(catalog_id, extension)
catalog_properties = {}
full_config = cache_manager.catalogs_config_cache.get_config(catalog.type)
document_version_configurations = full_config['document_version_configurations']
@@ -489,6 +490,11 @@ def add_url():
@document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def documents():
    """Serve the document list, provided a session catalog is selected.

    When the session holds no (truthy) ``catalog_id``, a warning is flashed
    and the user is redirected to the catalogs view. Otherwise the result of
    ``DocumentListView(...).get()`` is returned.
    """
    # Guard clause: without a session catalog, send the user to pick one.
    if not session.get('catalog_id'):
        flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
        return redirect(prefixed_url_for('document_bp.catalogs'))

    return DocumentListView(Document, 'document/documents.html', per_page=10).get()
@@ -609,7 +615,7 @@ def edit_document_version_view(document_version_id):
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def document_versions(document_id):
doc = Document.query.get_or_404(document_id)
doc_desc = f'Document {doc.name}'
doc_desc = f'{doc.name}'
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
@@ -621,8 +627,7 @@ def document_versions(document_id):
pagination = query.paginate(page=page, per_page=per_page, error_out=False)
doc_langs = pagination.items
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('url', ''),
('object_name', ''), ('file_type', ''),
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('file_type', ''), ('file_size', ''),
('processing', ''), ('processing_started_at', ''),
('processing_finished_at', ''), ('processing_error', '')])