- Add Catalog Concept to Document Domain

- Create Catalog views
- Modify document stack creation
This commit is contained in:
Josako
2024-10-14 13:56:23 +02:00
parent 0f4558d775
commit 270479c77d
10 changed files with 339 additions and 102 deletions

View File

@@ -0,0 +1,23 @@
{% extends 'base.html' %}
{% from "macros.html" import render_field %}
{% block title %}Catalog Registration{% endblock %}
{% block content_title %}Register Catalog{% endblock %}
{% block content_description %}Define a new catalog of documents in Evie's Library{% endblock %}
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = [] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Register Catalog</button>
</form>
{% endblock %}
{% block content_footer %}
{% endblock %}

View File

@@ -0,0 +1,23 @@
{% extends 'base.html' %}
{% from 'macros.html' import render_selectable_table, render_pagination %}
{% block title %}Documents{% endblock %}
{% block content_title %}Catalogs{% endblock %}
{% block content_description %}View Catalogs for Tenant{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto"></div>{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_catalog_selection') }}">
{{ render_selectable_table(headers=["Catalog ID", "Name"], rows=rows, selectable=True, id="catalogsTable") }}
<div class="form-group mt-3">
<button type="submit" name="action" value="set_session_catalog" class="btn btn-primary">Set Session Catalog</button>
</div>
</form>
</div>
{% endblock %}
{% block content_footer %}
{{ render_pagination(pagination, 'document_bp.catalogs') }}
{% endblock %}

View File

@@ -0,0 +1,25 @@
{% extends 'base.html' %}
{% from "macros.html" import render_field %}
{% block title %}Edit Catalog{% endblock %}
{% block content_title %}Edit Catalog{% endblock %}
{% block content_description %}Edit a catalog of documents in Evie's Library.
When you change chunking of embedding information, you'll need to manually refresh the library if you want immediate impact.
{% endblock %}
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = [] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Register Catalog</button>
</form>
{% endblock %}
{% block content_footer %}
{% endblock %}

View File

@@ -1,8 +1,8 @@
from flask import session, current_app
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SubmitField, DateField,
from wtforms import (StringField, BooleanField, SubmitField, DateField, IntegerField, FloatField, SelectMultipleField,
SelectField, FieldList, FormField, TextAreaField, URLField)
from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError
from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError, NumberRange
from flask_wtf.file import FileField, FileAllowed, FileRequired
import json
@@ -23,6 +23,36 @@ def validate_json(form, field):
raise ValidationError('Invalid JSON format')
class CatalogForm(FlaskForm):
name = StringField('Name', validators=[DataRequired(), Length(max=50)])
description = TextAreaField('Description', validators=[Optional()])
# HTML Embedding Variables
html_tags = StringField('HTML Tags', validators=[DataRequired()],
default='p, h1, h2, h3, h4, h5, h6, li, , tbody, tr, td')
html_end_tags = StringField('HTML End Tags', validators=[DataRequired()],
default='p, li')
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()],
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
default=3000)
# Embedding Search variables
es_k = IntegerField('Limit for Searching Embeddings (5)',
default=5,
validators=[NumberRange(min=0)])
es_similarity_threshold = FloatField('Similarity Threshold for Searching Embeddings (0.5)',
default=0.5,
validators=[NumberRange(min=0, max=1)])
# Chat Variables
chat_RAG_temperature = FloatField('RAG Temperature', default=0.3, validators=[NumberRange(min=0, max=1)])
chat_no_RAG_temperature = FloatField('No RAG Temperature', default=0.5, validators=[NumberRange(min=0, max=1)])
# Tuning variables
embed_tuning = BooleanField('Enable Embedding Tuning', default=False)
rag_tuning = BooleanField('Enable RAG Tuning', default=False)
class AddDocumentForm(FlaskForm):
file = FileField('File', validators=[FileRequired(), allowed_file])
name = StringField('Name', validators=[Length(max=100)])

View File

@@ -1,6 +1,7 @@
import ast
from datetime import datetime as dt, timezone as tz
from babel.messages.setuptools_frontend import update_catalog
from flask import request, redirect, flash, render_template, Blueprint, session, current_app
from flask_security import roles_accepted, current_user
from sqlalchemy import desc
@@ -12,14 +13,15 @@ from urllib.parse import urlparse, unquote
import io
import json
from common.models.document import Document, DocumentVersion
from common.models.document import Document, DocumentVersion, Catalog
from common.extensions import db, minio_client
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
process_multiple_urls, get_documents_list, edit_document, \
edit_document_version, refresh_document
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
EveAIDoubleURLException
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddURLsForm
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddURLsForm, \
CatalogForm
from common.utils.middleware import mw_before_request
from common.utils.celery_utils import current_celery
from common.utils.nginx_utils import prefixed_url_for
@@ -52,6 +54,95 @@ def before_request():
raise
@document_bp.route('/catalog', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def catalog():
form = CatalogForm()
if form.validate_on_submit():
tenant_id = session.get('tenant').get('id')
new_catalog = Catalog()
form.populate_obj(new_catalog)
set_logging_information(new_catalog, dt.now(tz.utc))
try:
db.session.add(new_catalog)
db.session.commit()
flash('Catalog successfully added!', 'success')
current_app.logger.info(f'Catalog {new_catalog.name} successfully added for tenant {tenant_id}!')
except SQLAlchemyError as e:
db.session.rollback()
flash(f'Failed to add catalog. Error: {e}', 'danger')
current_app.logger.error(f'Failed to add catalog {new_catalog.name}'
f'for tenant {tenant_id}. Error: {str(e)}')
else:
flash('Please fill in all required fields.', 'information')
return render_template('document/catalog.html')
@document_bp.route('/catalogs', methods=['POST'])
@roles_accepted('Super User', 'Tenant Admin')
def catalogs():
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int)
query = Catalog.query.order_by(Catalog.id)
pagination = query.paginate(page=page, per_page=per_page)
the_catalogs = pagination.items
# prepare table data
rows = prepare_table_for_macro(the_catalogs, [('id', ''), ('name', '')])
# Render the catalogs in a template
return render_template('document/catalogs.html', rows=rows, pagination=pagination)
@document_bp.route('/handle_catalog_selection', methods=['POST'])
@roles_accepted('Super User', 'Tenant Admin')
def handle_catalog_selection():
catalog_identification = request.form.get('selected_row')
catalog_id = ast.literal_eval(catalog_identification).get('value')
action = request.form['action']
catalog = Catalog.query.get_or_404(catalog_id)
if action == 'set_session_catalog':
session['catalog_id'] = catalog_id
session['catalog_name'] = catalog.name
elif action == 'edit_catalog':
return redirect(prefixed_url_for('document_bp.edit_catalog', catalog_id=catalog_id))
return redirect(prefixed_url_for('document_bp.catalogs'))
@document_bp.route('/catalog/<int:catalog_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def edit_catalog(catalog_id):
catalog = Catalog.query.get_or_404(catalog_id)
form = CatalogForm(obj=catalog)
tenant_id = session.get('tenant').get('id')
if request.method == 'POST' and form.validate_on_submit():
form.populate_obj(catalog)
update_logging_information(catalog, dt.now(tz.utc))
try:
db.session.add(catalog)
db.session.commit()
flash('Catalog successfully updated successfully!', 'success')
current_app.logger.info(f'Catalog {catalog.name} successfully updated for tenant {tenant_id}')
except SQLAlchemyError as e:
db.session.rollback()
flash(f'Failed to update catalog. Error: {e}', 'danger')
current_app.logger.error(f'Failed to update catalog {catalog_id} for tenant {tenant_id}. Error: {str(e)}')
return redirect(prefixed_url_for('document_bp.catalogs'))
else:
form_validation_failed(request, form)
return render_template('document/edit_catalog.html', form=form, catalog_id=catalog_id)
@document_bp.route('/add_document', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_document():
@@ -60,6 +151,7 @@ def add_document():
if form.validate_on_submit():
try:
tenant_id = session['tenant']['id']
catalog_id = session['catalog_id']
file = form.file.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
@@ -68,6 +160,7 @@ def add_document():
current_app.logger.debug(f'Language on form: {form.language.data}')
api_input = {
'catalog_id': catalog_id,
'name': form.name.data,
'language': form.language.data,
'user_context': form.user_context.data,
@@ -100,11 +193,13 @@ def add_url():
if form.validate_on_submit():
try:
tenant_id = session['tenant']['id']
catalog_id = session['catalog_id']
url = form.url.data
file_content, filename, extension = process_url(url, tenant_id)
api_input = {
'catalog_id': catalog_id,
'name': form.name.data or filename,
'url': url,
'language': form.language.data,
@@ -397,47 +492,47 @@ def fetch_html(url):
return response.content
def prepare_document_data(docs):
rows = []
for doc in docs:
doc_row = [{'value': doc.name, 'class': '', 'type': 'text'},
{'value': doc.created_at.strftime("%Y-%m-%d %H:%M:%S"), 'class': '', 'type': 'text'}]
# Document basic details
if doc.valid_from:
doc_row.append({'value': doc.valid_from.strftime("%Y-%m-%d"), 'class': '', 'type': 'text'})
else:
doc_row.append({'value': '', 'class': '', 'type': 'text'})
# Nested languages and versions
languages_rows = []
for lang in doc.languages:
lang_row = [{'value': lang.language, 'class': '', 'type': 'text'}]
# Latest version details if available (should be available ;-) )
if lang.latest_version:
lang_row.append({'value': lang.latest_version.created_at.strftime("%Y-%m-%d %H:%M:%S"),
'class': '', 'type': 'text'})
if lang.latest_version.url:
lang_row.append({'value': lang.latest_version.url,
'class': '', 'type': 'link', 'href': lang.latest_version.url})
else:
lang_row.append({'value': '', 'class': '', 'type': 'text'})
if lang.latest_version.file_name:
lang_row.append({'value': lang.latest_version.file_name, 'class': '', 'type': 'text'})
else:
lang_row.append({'value': '', 'class': '', 'type': 'text'})
if lang.latest_version.file_type:
lang_row.append({'value': lang.latest_version.file_type, 'class': '', 'type': 'text'})
else:
lang_row.append({'value': '', 'class': '', 'type': 'text'})
# Include other details as necessary
languages_rows.append(lang_row)
doc_row.append({'is_group': True, 'colspan': '5',
'headers': ['Language', 'Latest Version', 'URL', 'File Name', 'Type'],
'sub_rows': languages_rows})
rows.append(doc_row)
return rows
# def prepare_document_data(docs):
# rows = []
# for doc in docs:
# doc_row = [{'value': doc.name, 'class': '', 'type': 'text'},
# {'value': doc.created_at.strftime("%Y-%m-%d %H:%M:%S"), 'class': '', 'type': 'text'}]
# # Document basic details
# if doc.valid_from:
# doc_row.append({'value': doc.valid_from.strftime("%Y-%m-%d"), 'class': '', 'type': 'text'})
# else:
# doc_row.append({'value': '', 'class': '', 'type': 'text'})
#
# # Nested languages and versions
# languages_rows = []
# for lang in doc.languages:
# lang_row = [{'value': lang.language, 'class': '', 'type': 'text'}]
#
# # Latest version details if available (should be available ;-) )
# if lang.latest_version:
# lang_row.append({'value': lang.latest_version.created_at.strftime("%Y-%m-%d %H:%M:%S"),
# 'class': '', 'type': 'text'})
# if lang.latest_version.url:
# lang_row.append({'value': lang.latest_version.url,
# 'class': '', 'type': 'link', 'href': lang.latest_version.url})
# else:
# lang_row.append({'value': '', 'class': '', 'type': 'text'})
#
# if lang.latest_version.object_name:
# lang_row.append({'value': lang.latest_version.object_name, 'class': '', 'type': 'text'})
# else:
# lang_row.append({'value': '', 'class': '', 'type': 'text'})
#
# if lang.latest_version.file_type:
# lang_row.append({'value': lang.latest_version.file_type, 'class': '', 'type': 'text'})
# else:
# lang_row.append({'value': '', 'class': '', 'type': 'text'})
# # Include other details as necessary
#
# languages_rows.append(lang_row)
#
# doc_row.append({'is_group': True, 'colspan': '5',
# 'headers': ['Language', 'Latest Version', 'URL', 'File Name', 'Type'],
# 'sub_rows': languages_rows})
# rows.append(doc_row)
# return rows