- Introduction of the Automatic HTML Processor
- Translation Service improvement
- Enable activation / deactivation of Processors
- Renew API keys for Mistral (leading to workspaces)
- Align all Document views to use a session catalog
- Allow for different processors for the same file type
This commit is contained in:
@@ -71,15 +71,6 @@ class ProcessorForm(FlaskForm):
|
||||
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
|
||||
description = TextAreaField('Description', validators=[Optional()])
|
||||
|
||||
# Catalog for the Retriever
|
||||
catalog = QuerySelectField(
|
||||
'Catalog ID',
|
||||
query_factory=lambda: Catalog.query.all(),
|
||||
allow_blank=True,
|
||||
get_label='name',
|
||||
validators=[DataRequired()],
|
||||
)
|
||||
|
||||
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
|
||||
type = SelectField('Processor Type', validators=[DataRequired()])
|
||||
|
||||
@@ -89,6 +80,7 @@ class ProcessorForm(FlaskForm):
|
||||
default=2000)
|
||||
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
|
||||
default=3000)
|
||||
active = BooleanField('Active', default=True)
|
||||
tuning = BooleanField('Enable Embedding Tuning', default=False)
|
||||
|
||||
# Metadata fields
|
||||
@@ -108,14 +100,6 @@ class EditProcessorForm(DynamicFormBase):
|
||||
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
|
||||
description = TextAreaField('Description', validators=[Optional()])
|
||||
|
||||
# Catalog for the Retriever
|
||||
catalog = QuerySelectField(
|
||||
'Catalog ID',
|
||||
query_factory=lambda: Catalog.query.all(),
|
||||
allow_blank=True,
|
||||
get_label='name',
|
||||
validators=[Optional()],
|
||||
)
|
||||
type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})
|
||||
|
||||
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
|
||||
@@ -124,6 +108,7 @@ class EditProcessorForm(DynamicFormBase):
|
||||
default=2000)
|
||||
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
|
||||
default=3000)
|
||||
active = BooleanField('Active', default=True)
|
||||
tuning = BooleanField('Enable Embedding Tuning', default=False)
|
||||
|
||||
# Metadata fields
|
||||
@@ -134,14 +119,7 @@ class EditProcessorForm(DynamicFormBase):
|
||||
class RetrieverForm(FlaskForm):
|
||||
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
|
||||
description = TextAreaField('Description', validators=[Optional()])
|
||||
# Catalog for the Retriever
|
||||
catalog = QuerySelectField(
|
||||
'Catalog ID',
|
||||
query_factory=lambda: Catalog.query.all(),
|
||||
allow_blank=True,
|
||||
get_label='name',
|
||||
validators=[Optional()],
|
||||
)
|
||||
|
||||
# Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
|
||||
type = SelectField('Retriever Type', validators=[DataRequired()])
|
||||
tuning = BooleanField('Enable Tuning', default=False)
|
||||
@@ -160,14 +138,7 @@ class RetrieverForm(FlaskForm):
|
||||
class EditRetrieverForm(DynamicFormBase):
|
||||
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
|
||||
description = TextAreaField('Description', validators=[Optional()])
|
||||
# Catalog for the Retriever
|
||||
catalog = QuerySelectField(
|
||||
'Catalog ID',
|
||||
query_factory=lambda: Catalog.query.all(),
|
||||
allow_blank=True,
|
||||
get_label='name',
|
||||
validators=[Optional()],
|
||||
)
|
||||
|
||||
# Select Field for Retriever Type (Uses the RETRIEVER_TYPES defined in config)
|
||||
type = StringField('Processor Type', validators=[DataRequired()], render_kw={'readonly': True})
|
||||
tuning = BooleanField('Enable Tuning', default=False)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from datetime import datetime
|
||||
from flask import request, render_template, session
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
from flask import request, render_template, session, current_app
|
||||
from sqlalchemy import desc, asc, or_, and_, cast, Integer
|
||||
from common.models.document import Document, Catalog
|
||||
from common.utils.filtered_list_view import FilteredListView
|
||||
@@ -7,31 +7,19 @@ from common.utils.view_assistants import prepare_table_for_macro
|
||||
|
||||
|
||||
class DocumentListView(FilteredListView):
|
||||
allowed_filters = ['catalog_id', 'validity']
|
||||
allowed_sorts = ['id', 'name', 'catalog_name', 'valid_from', 'valid_to']
|
||||
allowed_filters = ['validity']
|
||||
allowed_sorts = ['id', 'name', 'valid_from', 'valid_to']
|
||||
|
||||
def get_query(self):
|
||||
return Document.query.join(Catalog).add_columns(
|
||||
Document.id,
|
||||
Document.name,
|
||||
Catalog.name.label('catalog_name'),
|
||||
Document.valid_from,
|
||||
Document.valid_to
|
||||
)
|
||||
catalog_id = session.get('catalog_id')
|
||||
current_app.logger.debug(f"Catalog ID: {catalog_id}")
|
||||
return Document.query.filter_by(catalog_id=catalog_id)
|
||||
|
||||
def apply_filters(self, query):
|
||||
filters = request.args.to_dict(flat=False)
|
||||
|
||||
if 'catalog_id' in filters:
|
||||
catalog_ids = filters['catalog_id']
|
||||
if catalog_ids:
|
||||
# Convert catalog_ids to a list of integers
|
||||
catalog_ids = [int(cid) for cid in catalog_ids if cid.isdigit()]
|
||||
if catalog_ids:
|
||||
query = query.filter(Document.catalog_id.in_(catalog_ids))
|
||||
|
||||
if 'validity' in filters:
|
||||
now = datetime.utcnow().date()
|
||||
now = dt.now(tz.utc).date()
|
||||
if 'valid' in filters['validity']:
|
||||
query = query.filter(
|
||||
and_(
|
||||
@@ -47,10 +35,7 @@ class DocumentListView(FilteredListView):
|
||||
sort_order = request.args.get('sort_order', 'asc')
|
||||
|
||||
if sort_by in self.allowed_sorts:
|
||||
if sort_by == 'catalog_name':
|
||||
column = Catalog.name
|
||||
else:
|
||||
column = getattr(Document, sort_by)
|
||||
column = getattr(Document, sort_by)
|
||||
|
||||
if sort_order == 'asc':
|
||||
query = query.order_by(asc(column))
|
||||
@@ -61,42 +46,39 @@ class DocumentListView(FilteredListView):
|
||||
|
||||
def get(self):
|
||||
query = self.get_query()
|
||||
query = self.apply_filters(query)
|
||||
query = self.apply_sorting(query)
|
||||
# query = self.apply_filters(query)
|
||||
# query = self.apply_sorting(query)
|
||||
pagination = self.paginate(query)
|
||||
|
||||
def format_date(date):
|
||||
if isinstance(date, datetime):
|
||||
if isinstance(date, dt):
|
||||
return date.strftime('%Y-%m-%d')
|
||||
elif isinstance(date, str):
|
||||
return date
|
||||
else:
|
||||
return ''
|
||||
|
||||
current_app.logger.debug(f"Items retrieved: {pagination.items}")
|
||||
rows = [
|
||||
[
|
||||
{'value': item.id, 'class': '', 'type': 'text'},
|
||||
{'value': item.name, 'class': '', 'type': 'text'},
|
||||
{'value': item.catalog_name, 'class': '', 'type': 'text'},
|
||||
{'value': format_date(item.valid_from), 'class': '', 'type': 'text'},
|
||||
{'value': format_date(item.valid_to), 'class': '', 'type': 'text'}
|
||||
] for item in pagination.items
|
||||
]
|
||||
|
||||
catalogs = Catalog.query.all()
|
||||
|
||||
context = {
|
||||
'rows': rows,
|
||||
'pagination': pagination,
|
||||
'filters': request.args.to_dict(flat=False),
|
||||
'sort_by': request.args.get('sort_by', 'id'),
|
||||
'sort_order': request.args.get('sort_order', 'asc'),
|
||||
'filter_options': self.get_filter_options(catalogs)
|
||||
'filter_options': self.get_filter_options()
|
||||
}
|
||||
return render_template(self.template, **context)
|
||||
|
||||
def get_filter_options(self, catalogs):
|
||||
def get_filter_options(self):
|
||||
return {
|
||||
'catalog_id': [(str(cat.id), cat.name) for cat in catalogs],
|
||||
'validity': [('valid', 'Valid'), ('all', 'All')]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager, minio_client
|
||||
from common.models.interaction import Specialist, SpecialistRetriever
|
||||
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
|
||||
edit_document, \
|
||||
edit_document_version, refresh_document, clean_url
|
||||
edit_document_version, refresh_document, clean_url, is_file_type_supported_by_catalog
|
||||
from common.utils.dynamic_field_utils import create_default_config_from_type_config
|
||||
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
|
||||
EveAIDoubleURLException, EveAIException
|
||||
@@ -110,7 +110,6 @@ def handle_catalog_selection():
|
||||
current_app.logger.info(f'Setting session catalog to {catalog.name}')
|
||||
session['catalog_id'] = catalog_id
|
||||
session['catalog_name'] = catalog.name
|
||||
current_app.logger.info(f'Finished setting session catalog to {catalog.name}')
|
||||
elif action == 'edit_catalog':
|
||||
return redirect(prefixed_url_for('document_bp.edit_catalog', catalog_id=catalog_id))
|
||||
|
||||
@@ -157,7 +156,7 @@ def processor():
|
||||
tenant_id = session.get('tenant').get('id')
|
||||
new_processor = Processor()
|
||||
form.populate_obj(new_processor)
|
||||
new_processor.catalog_id = form.catalog.data.id
|
||||
new_processor.catalog_id = session.get('catalog_id')
|
||||
processor_config = cache_manager.processors_config_cache.get_config(new_processor.type)
|
||||
new_processor.configuration = create_default_config_from_type_config(
|
||||
processor_config["configuration"])
|
||||
@@ -204,9 +203,6 @@ def edit_processor(processor_id):
|
||||
form.populate_obj(processor)
|
||||
processor.configuration = form.get_dynamic_data('configuration')
|
||||
|
||||
# Update catalog relationship
|
||||
processor.catalog_id = form.catalog.data.id if form.catalog.data else None
|
||||
|
||||
# Update logging information
|
||||
update_logging_information(processor, dt.now(tz.utc))
|
||||
|
||||
@@ -235,14 +231,19 @@ def processors():
|
||||
page = request.args.get('page', 1, type=int)
|
||||
per_page = request.args.get('per_page', 10, type=int)
|
||||
|
||||
query = Processor.query.order_by(Processor.id)
|
||||
catalog_id = session.get('catalog_id', None)
|
||||
if not catalog_id:
|
||||
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
|
||||
return redirect(prefixed_url_for('document_bp.catalogs'))
|
||||
|
||||
query = Processor.query.filter_by(catalog_id=catalog_id).order_by(Processor.id)
|
||||
|
||||
pagination = query.paginate(page=page, per_page=per_page)
|
||||
the_processors = pagination.items
|
||||
|
||||
# prepare table data
|
||||
rows = prepare_table_for_macro(the_processors,
|
||||
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
|
||||
[('id', ''), ('name', ''), ('type', ''), ('active', '')])
|
||||
|
||||
# Render the catalogs in a template
|
||||
return render_template('document/processors.html', rows=rows, pagination=pagination)
|
||||
@@ -272,7 +273,7 @@ def retriever():
|
||||
tenant_id = session.get('tenant').get('id')
|
||||
new_retriever = Retriever()
|
||||
form.populate_obj(new_retriever)
|
||||
new_retriever.catalog_id = form.catalog.data.id
|
||||
new_retriever.catalog_id = session.get('catalog_id')
|
||||
new_retriever.type_version = cache_manager.retrievers_version_tree_cache.get_latest_version(
|
||||
new_retriever.type)
|
||||
|
||||
@@ -301,12 +302,6 @@ def edit_retriever(retriever_id):
|
||||
# Get the retriever or return 404
|
||||
retriever = Retriever.query.get_or_404(retriever_id)
|
||||
|
||||
if retriever.catalog_id:
|
||||
# If catalog_id is just an ID, fetch the Catalog object
|
||||
retriever.catalog = Catalog.query.get(retriever.catalog_id)
|
||||
else:
|
||||
retriever.catalog = None
|
||||
|
||||
# Create form instance with the retriever
|
||||
form = EditRetrieverForm(request.form, obj=retriever)
|
||||
|
||||
@@ -319,9 +314,6 @@ def edit_retriever(retriever_id):
|
||||
form.populate_obj(retriever)
|
||||
retriever.configuration = form.get_dynamic_data('configuration')
|
||||
|
||||
# Update catalog relationship
|
||||
retriever.catalog_id = form.catalog.data.id if form.catalog.data else None
|
||||
|
||||
# Update logging information
|
||||
update_logging_information(retriever, dt.now(tz.utc))
|
||||
|
||||
@@ -350,14 +342,19 @@ def retrievers():
|
||||
page = request.args.get('page', 1, type=int)
|
||||
per_page = request.args.get('per_page', 10, type=int)
|
||||
|
||||
query = Retriever.query.order_by(Retriever.id)
|
||||
catalog_id = session.get('catalog_id', None)
|
||||
if not catalog_id:
|
||||
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
|
||||
return redirect(prefixed_url_for('document_bp.catalogs'))
|
||||
|
||||
query = Retriever.query.filter_by(catalog_id=catalog_id).order_by(Retriever.id)
|
||||
|
||||
pagination = query.paginate(page=page, per_page=per_page)
|
||||
the_retrievers = pagination.items
|
||||
|
||||
# prepare table data
|
||||
rows = prepare_table_for_macro(the_retrievers,
|
||||
[('id', ''), ('name', ''), ('type', ''), ('catalog_id', '')])
|
||||
[('id', ''), ('name', ''), ('type', '')])
|
||||
|
||||
# Render the catalogs in a template
|
||||
return render_template('document/retrievers.html', rows=rows, pagination=pagination)
|
||||
@@ -400,6 +397,8 @@ def add_document():
|
||||
filename = secure_filename(file.filename)
|
||||
extension = filename.rsplit('.', 1)[1].lower()
|
||||
|
||||
is_file_type_supported_by_catalog(catalog_id, extension)
|
||||
|
||||
catalog_properties = form.get_dynamic_data("tagging_fields")
|
||||
|
||||
api_input = {
|
||||
@@ -451,6 +450,8 @@ def add_url():
|
||||
|
||||
file_content, filename, extension = process_url(url, tenant_id)
|
||||
|
||||
is_file_type_supported_by_catalog(catalog_id, extension)
|
||||
|
||||
catalog_properties = {}
|
||||
full_config = cache_manager.catalogs_config_cache.get_config(catalog.type)
|
||||
document_version_configurations = full_config['document_version_configurations']
|
||||
@@ -489,6 +490,11 @@ def add_url():
|
||||
@document_bp.route('/documents', methods=['GET', 'POST'])
|
||||
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
|
||||
def documents():
|
||||
catalog_id = session.get('catalog_id', None)
|
||||
if not catalog_id:
|
||||
flash('You need to set a Session Catalog before adding Documents or URLs', 'warning')
|
||||
return redirect(prefixed_url_for('document_bp.catalogs'))
|
||||
|
||||
view = DocumentListView(Document, 'document/documents.html', per_page=10)
|
||||
return view.get()
|
||||
|
||||
@@ -609,7 +615,7 @@ def edit_document_version_view(document_version_id):
|
||||
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
|
||||
def document_versions(document_id):
|
||||
doc = Document.query.get_or_404(document_id)
|
||||
doc_desc = f'Document {doc.name}'
|
||||
doc_desc = f'{doc.name}'
|
||||
|
||||
page = request.args.get('page', 1, type=int)
|
||||
per_page = request.args.get('per_page', 10, type=int)
|
||||
@@ -621,8 +627,7 @@ def document_versions(document_id):
|
||||
pagination = query.paginate(page=page, per_page=per_page, error_out=False)
|
||||
doc_langs = pagination.items
|
||||
|
||||
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('url', ''),
|
||||
('object_name', ''), ('file_type', ''),
|
||||
rows = prepare_table_for_macro(doc_langs, [('id', ''), ('file_type', ''), ('file_size', ''),
|
||||
('processing', ''), ('processing_started_at', ''),
|
||||
('processing_finished_at', ''), ('processing_error', '')])
|
||||
|
||||
|
||||
@@ -328,6 +328,16 @@ class DynamicFormBase(FlaskForm):
|
||||
initial_data: Optional initial data for the fields
|
||||
"""
|
||||
current_app.logger.debug(f"Adding dynamic fields for collection {collection_name} with config: {config}")
|
||||
|
||||
if isinstance(initial_data, str):
|
||||
try:
|
||||
initial_data = json.loads(initial_data)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
current_app.logger.error(f"Invalid JSON in initial_data: {initial_data}")
|
||||
initial_data = {}
|
||||
elif initial_data is None:
|
||||
initial_data = {}
|
||||
|
||||
# Store the full configuration for later use in get_list_type_configs_js
|
||||
if not hasattr(self, '_full_configs'):
|
||||
self._full_configs = {}
|
||||
|
||||
Reference in New Issue
Block a user