- Introduction of API functionality (to be continued). Deduplication of document and URL uploads between views and API. - Improvements to document processing - introduction of processor classes to streamline document inputs - Removed pure YouTube functionality, as YouTube retrieval of documents continuously changes, but added upload of srt, mp3, ogg and mp4
534 lines
21 KiB
Python
534 lines
21 KiB
Python
import ast
|
|
import os
|
|
from datetime import datetime as dt, timezone as tz
|
|
|
|
import chardet
|
|
from flask import request, redirect, flash, render_template, Blueprint, session, current_app
|
|
from flask_security import roles_accepted, current_user
|
|
from sqlalchemy import desc
|
|
from sqlalchemy.orm import joinedload
|
|
from werkzeug.datastructures import FileStorage
|
|
from werkzeug.utils import secure_filename
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
import requests
|
|
from requests.exceptions import SSLError
|
|
from urllib.parse import urlparse, unquote
|
|
import io
|
|
from minio.error import S3Error
|
|
|
|
from common.models.document import Document, DocumentVersion
|
|
from common.extensions import db, minio_client
|
|
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
|
|
process_multiple_urls, prepare_youtube_document, create_version_for_document, upload_file_for_version
|
|
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
|
|
EveAIDoubleURLException, EveAIYoutubeError
|
|
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm, \
|
|
AddURLsForm
|
|
from common.utils.middleware import mw_before_request
|
|
from common.utils.celery_utils import current_celery
|
|
from common.utils.nginx_utils import prefixed_url_for
|
|
from common.utils.view_assistants import form_validation_failed, prepare_table_for_macro, form_to_dict
|
|
from .document_version_list_view import DocumentVersionListView
|
|
|
|
# Blueprint that groups the document views of this module under the '/document' URL prefix.
document_bp = Blueprint('document_bp', __name__, url_prefix='/document')
|
|
|
|
|
|
@document_bp.before_request
def log_before_request():
    """Emit a debug log line with the HTTP method and URL of each request to this blueprint."""
    logger = current_app.logger
    logger.debug(f"Before request (document_bp): {request.method} {request.url}")
|
|
|
|
|
|
@document_bp.after_request
def log_after_request(response):
    """Log method, URL and response status at debug level, then hand the response back.

    Args:
        response: The response object passed in by Flask's after-request hook.

    Returns:
        The same ``response`` object, unmodified.
    """
    logger = current_app.logger
    logger.debug(
        f"After request (document_bp): {request.method} {request.url} - Status: {response.status}")
    return response
|
|
|
|
|
|
@document_bp.before_request
def before_request():
    """Run the shared ``mw_before_request`` middleware ahead of every view in this blueprint.

    If the middleware raises, the error is logged, the roles of the current
    user are written to the debug log, and the original exception is re-raised.
    """
    try:
        mw_before_request()
    except Exception as e:
        logger = current_app.logger
        logger.error(f'Error switching schema in Document Blueprint: {e}')
        # Role dump is purely diagnostic; the exception still propagates below.
        for role in current_user.roles:
            logger.debug(f'User {current_user.email} has role {role.name}')
        raise
|
|
|
|
|
|
@document_bp.route('/add_document', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_document():
    """Render the upload form and, on a valid submission, create a document from the uploaded file.

    On success a document and a document version are created through
    ``create_document_stack``, an embedding task is started through
    ``start_embedding_task``, and the user is redirected to the document list.

    Language and file-type errors are flashed with their own message; a file
    name without an extension is reported explicitly; any other failure is
    logged and reported with a generic message. In every non-success case the
    form is rendered again.
    """
    form = AddDocumentForm()
    current_app.logger.debug('Adding document')

    if form.validate_on_submit():
        try:
            current_app.logger.debug('Validating file type')
            tenant_id = session['tenant']['id']
            file = form.file.data
            filename = secure_filename(file.filename)

            # rpartition never raises: a name without a dot yields an empty
            # separator, where rsplit('.', 1)[1] would raise IndexError and end
            # up in the generic "an error occurred" handler.
            _, dot, suffix = filename.rpartition('.')
            if not dot:
                flash('The uploaded file has no file extension, so its type cannot be determined.', 'error')
                return render_template('document/add_document.html', form=form)
            extension = suffix.lower()

            validate_file_type(extension)

            api_input = {
                'name': form.name.data,
                'language': form.language.data,
                'user_context': form.user_context.data,
                'valid_from': form.valid_from.data
            }

            new_doc, new_doc_vers = create_document_stack(api_input, file, filename, extension, tenant_id)
            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

            flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
                  'success')
            return redirect(prefixed_url_for('document_bp.documents'))

        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
            flash(str(e), 'error')
        except Exception as e:
            # Boundary handler: keep the view alive and show a generic message.
            current_app.logger.error(f'Error adding document: {str(e)}')
            flash('An error occurred while adding the document.', 'error')

    return render_template('document/add_document.html', form=form)
|
|
|
|
|
|
@document_bp.route('/add_url', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_url():
    """Render the single-URL form and, on a valid submission, create a document from that URL.

    The URL is passed to ``process_url``; its result feeds
    ``create_document_stack`` and an embedding task is started. A duplicate URL
    is flashed as info, language / file-type problems as errors, and anything
    else is logged and flashed with a generic message. Unless the success
    redirect is taken, the form is rendered again.
    """
    form = AddURLForm()

    # Guard clause: nothing to process on GET or on a failed validation.
    if not form.validate_on_submit():
        return render_template('document/add_url.html', form=form)

    try:
        tenant_id = session['tenant']['id']
        url = form.url.data

        file_content, filename, extension = process_url(url, tenant_id)

        # The document name falls back to the file name when the form leaves it empty.
        api_input = {
            'name': form.name.data or filename,
            'url': url,
            'language': form.language.data,
            'user_context': form.user_context.data,
            'valid_from': form.valid_from.data
        }

        new_doc, new_doc_vers = create_document_stack(api_input, file_content, filename, extension, tenant_id)
        task_id = start_embedding_task(tenant_id, new_doc_vers.id)

        flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
              'success')
        return redirect(prefixed_url_for('document_bp.documents'))

    except EveAIDoubleURLException:
        flash(f'A document with url {url} already exists. No new document created.', 'info')
    except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
        flash(str(e), 'error')
    except Exception as e:
        current_app.logger.error(f'Error adding document: {str(e)}')
        flash('An error occurred while adding the document.', 'error')

    return render_template('document/add_url.html', form=form)
|
|
|
|
|
|
@document_bp.route('/add_urls', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_urls():
    """Render the multi-URL form and, on a valid submission, process every URL entered.

    The text field is split on newlines; blank lines are dropped. All URLs are
    handed to ``process_multiple_urls`` and one flash message is produced per
    result before redirecting to the document list. Any exception is logged,
    flashed with a generic message, and the form is rendered again.
    """
    form = AddURLsForm()

    if not form.validate_on_submit():
        return render_template('document/add_urls.html', form=form)

    try:
        tenant_id = session['tenant']['id']

        # One URL per line; strip whitespace and ignore empty lines.
        stripped_lines = (line.strip() for line in form.urls.data.split('\n'))
        urls = [candidate for candidate in stripped_lines if candidate]

        api_input = {
            'name': form.name.data,
            'language': form.language.data,
            'user_context': form.user_context.data,
            'valid_from': form.valid_from.data
        }

        results = process_multiple_urls(urls, tenant_id, api_input)

        for result in results:
            if result['status'] != 'success':
                flash(f"Error processing URL: {result['url']} - {result['message']}", 'error')
                continue
            flash(
                f"Processed URL: {result['url']} - Document ID: {result['document_id']}, Version ID: {result['document_version_id']}",
                'success')

        return redirect(prefixed_url_for('document_bp.documents'))

    except Exception as e:
        current_app.logger.error(f'Error adding multiple URLs: {str(e)}')
        flash('An error occurred while adding the URLs.', 'error')

    return render_template('document/add_urls.html', form=form)
|
|
|
|
|
|
@document_bp.route('/add_youtube', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_youtube():
    """Render the YouTube form and, on a valid submission, create a document from the given URL.

    The document and version come from ``prepare_youtube_document``; an
    embedding task is then started and the user is redirected to the document
    list. YouTube, language and file-type errors are flashed with their own
    message; other failures are logged and flashed generically.
    """
    form = AddYoutubeForm()

    if not form.validate_on_submit():
        return render_template('document/add_youtube.html', form=form)

    try:
        tenant_id = session['tenant']['id']
        url = form.url.data

        api_input = {
            'name': form.name.data,
            'language': form.language.data,
            'user_context': form.user_context.data,
            'valid_from': form.valid_from.data
        }

        new_doc, new_doc_vers = prepare_youtube_document(url, tenant_id, api_input)
        task_id = start_embedding_task(tenant_id, new_doc_vers.id)

        flash(
            f'Processing on YouTube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
            'success')
        return redirect(prefixed_url_for('document_bp.documents'))

    # All three domain errors are reported to the user in the same way.
    except (EveAIYoutubeError, EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
        flash(str(e), 'error')
    except Exception as e:
        current_app.logger.error(f'Error adding YouTube document: {str(e)}')
        flash('An error occurred while adding the YouTube document.', 'error')

    return render_template('document/add_youtube.html', form=form)
|
|
|
|
|
|
@document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def documents():
    """Show a paginated list of documents, newest first.

    ``page`` (default 1) and ``per_page`` (default 10) are read from the query
    string. Out-of-range pages do not raise because ``error_out`` is disabled.
    """
    args = request.args
    pagination = (
        Document.query
        .order_by(desc(Document.created_at))
        .paginate(page=args.get('page', 1, type=int),
                  per_page=args.get('per_page', 10, type=int),
                  error_out=False)
    )

    # Attribute names handed to the table macro, paired with an empty second element.
    columns = [('id', ''), ('name', ''), ('valid_from', ''), ('valid_to', '')]
    rows = prepare_table_for_macro(pagination.items, columns)

    return render_template('document/documents.html', rows=rows, pagination=pagination)
|
|
|
|
|
|
@document_bp.route('/handle_document_selection', methods=['POST'])
|
|
@roles_accepted('Super User', 'Tenant Admin')
|
|
def handle_document_selection():
|
|
document_identification = request.form['selected_row']
|
|
doc_id = ast.literal_eval(document_identification).get('value')
|
|
|
|
action = request.form['action']
|
|
|
|
match action:
|
|
case 'edit_document':
|
|
return redirect(prefixed_url_for('document_bp.edit_document', document_id=doc_id))
|
|
case 'document_versions':
|
|
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_id))
|
|
case 'refresh_document':
|
|
refresh_document(doc_id)
|
|
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_id))
|
|
case 're_embed_latest_versions':
|
|
re_embed_latest_versions()
|
|
|
|
# Add more conditions for other actions
|
|
return redirect(prefixed_url_for('document_bp.documents'))
|
|
|
|
|
|
@document_bp.route('/edit_document/<int:document_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def edit_document(document_id):
    """Show and process the edit form for one document.

    Args:
        document_id: Primary key of the document; a 404 is raised if it does not exist.

    On a valid submission the name and validity dates are copied from the form,
    the update-audit fields are set, and the change is committed. A database
    error rolls the session back and is flashed and logged. When the form does
    not validate, ``form_validation_failed`` is called. The form is rendered in
    every case.
    """
    doc = Document.query.get_or_404(document_id)
    form = EditDocumentForm(obj=doc)

    if not form.validate_on_submit():
        form_validation_failed(request, form)
    else:
        doc.name = form.name.data
        doc.valid_from = form.valid_from.data
        doc.valid_to = form.valid_to.data

        now_utc = dt.now(tz.utc)
        update_logging_information(doc, now_utc)

        try:
            db.session.add(doc)
            db.session.commit()
            flash(f'Document {doc.id} updated successfully', 'success')
        except SQLAlchemyError as e:
            db.session.rollback()
            flash(f'Error updating document: {e}', 'danger')
            current_app.logger.error(f'Error updating document: {e}')

    return render_template('document/edit_document.html', form=form, document_id=document_id)
|
|
|
|
|
|
@document_bp.route('/edit_document_version/<int:document_version_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def edit_document_version(document_version_id):
    """Show and process the edit form for one document version.

    Args:
        document_version_id: Primary key of the version; a 404 is raised if it
            does not exist.

    On a valid submission the ``user_context`` is copied from the form, the
    update-audit fields are set, and the change is committed. A database error
    rolls the session back and is flashed and logged. When the form does not
    validate, ``form_validation_failed`` is called. The form is rendered in
    every case, together with the name of the owning document.
    """
    doc_vers = DocumentVersion.query.get_or_404(document_version_id)
    form = EditDocumentVersionForm(obj=doc_vers)

    if form.validate_on_submit():
        doc_vers.user_context = form.user_context.data

        update_logging_information(doc_vers, dt.now(tz.utc))

        try:
            db.session.add(doc_vers)
            db.session.commit()
            flash(f'Document Version {doc_vers.id} updated successfully', 'success')
        except SQLAlchemyError as e:
            db.session.rollback()
            flash(f'Error updating document version: {e}', 'danger')
            # Double quotes inside the single-quoted f-string: reusing the
            # enclosing quote only parses on Python 3.12+ (PEP 701).
            current_app.logger.error(f'Error updating document version {doc_vers.id} '
                                     f'for tenant {session["tenant"]["id"]}: {e}')
    else:
        form_validation_failed(request, form)

    return render_template('document/edit_document_version.html', form=form, document_version_id=document_version_id,
                           doc_details=f'Document {doc_vers.document.name}')
|
|
|
|
|
|
@document_bp.route('/document_versions/<int:document_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def document_versions(document_id):
    """Show a paginated list of the versions belonging to one document.

    Args:
        document_id: Primary key of the document; a 404 is raised if it does
            not exist.

    The page header is built from the document itself. The previous code
    looked up a ``DocumentVersion`` whose *own* id equalled ``document_id``,
    which described an unrelated version or returned 404 for a valid document.
    The language shown is that of the most recent version; it is omitted when
    the document has no versions.

    ``page`` (default 1) and ``per_page`` (default 10) are read from the query
    string; versions are ordered by language and then by descending id.
    """
    doc = Document.query.get_or_404(document_id)

    versions_of_doc = DocumentVersion.query.filter_by(doc_id=document_id)
    latest_version = versions_of_doc.order_by(desc(DocumentVersion.id)).first()
    if latest_version is not None:
        doc_desc = f'Document {doc.name}, Language {latest_version.language}'
    else:
        doc_desc = f'Document {doc.name}'

    page = request.args.get('page', 1, type=int)
    per_page = request.args.get('per_page', 10, type=int)

    query = (DocumentVersion.query.filter_by(doc_id=document_id)
             .order_by(DocumentVersion.language)
             .order_by(desc(DocumentVersion.id)))

    pagination = query.paginate(page=page, per_page=per_page, error_out=False)
    doc_langs = pagination.items

    rows = prepare_table_for_macro(doc_langs, [('id', ''), ('url', ''), ('file_location', ''),
                                               ('file_name', ''), ('file_type', ''),
                                               ('processing', ''), ('processing_started_at', ''),
                                               ('processing_finished_at', ''), ('processing_error', '')])

    return render_template('document/document_versions.html', rows=rows, pagination=pagination, document=doc_desc)
|
|
|
|
|
|
@document_bp.route('/handle_document_version_selection', methods=['POST'])
|
|
@roles_accepted('Super User', 'Tenant Admin')
|
|
def handle_document_version_selection():
|
|
document_version_identification = request.form['selected_row']
|
|
doc_vers_id = ast.literal_eval(document_version_identification).get('value')
|
|
|
|
action = request.form['action']
|
|
|
|
current_app.logger.debug(f'Triggered Document Version Action: {action}')
|
|
|
|
match action:
|
|
case 'edit_document_version':
|
|
return redirect(prefixed_url_for('document_bp.edit_document_version', document_version_id=doc_vers_id))
|
|
case 'process_document_version':
|
|
process_version(doc_vers_id)
|
|
# Add more conditions for other actions
|
|
|
|
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
|
|
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
|
|
|
|
|
|
@document_bp.route('/library_operations', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def library_operations():
    """Render the library-operations page."""
    template_name = 'document/library_operations.html'
    return render_template(template_name)
|
|
|
|
|
|
@document_bp.route('/handle_library_selection', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def handle_library_selection():
    """Run the library-wide operation named by the ``action`` form field.

    Recognised actions are ``re_embed_latest_versions`` and
    ``refresh_all_documents``; any other value performs no operation. The user
    is always redirected back to the library-operations page.
    """
    action = request.form['action']

    if action == 're_embed_latest_versions':
        re_embed_latest_versions()
    elif action == 'refresh_all_documents':
        refresh_all_documents()

    return redirect(prefixed_url_for('document_bp.library_operations'))
|
|
|
|
|
|
@document_bp.route('/document_versions_list', methods=['GET'])
@roles_accepted('Super User', 'Tenant Admin')
def document_versions_list():
    """Return the response of a ``DocumentVersionListView`` over document versions, 20 per page."""
    logger = current_app.logger
    logger.debug('Getting document versions list')
    view = DocumentVersionListView(
        DocumentVersion,
        'document/document_versions_list_view.html',
        per_page=20,
    )
    logger.debug('Got document versions list')
    return view.get()
|
|
|
|
|
|
def refresh_all_documents():
    """Call ``refresh_document`` for every document returned by ``Document.query.all()``."""
    all_documents = Document.query.all()
    for document in all_documents:
        refresh_document(document.id)
|
|
|
|
|
|
def refresh_document(doc_id):
    """Create a new version of a URL-based document and queue it for embedding.

    The newest version of the document supplies the URL, language and user
    context. A new version is created and committed, the URL content is
    fetched with ``fetch_html`` and uploaded with ``upload_file_for_version``,
    and a ``create_embeddings`` task is sent to the ``embeddings`` queue.

    Args:
        doc_id: Primary key of the document; a 404 is raised if it does not exist.

    Returns:
        None. Outcomes are reported through flash messages and the log.

    Raises:
        SQLAlchemyError: If committing the new version fails (after rollback).
        Exception: Any other error raised while committing, fetching or
            uploading is propagated to the caller.
    """
    doc = Document.query.get_or_404(doc_id)
    doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()

    # first() returns None for a document without versions; such a document has
    # no URL to refresh from, so it takes the same skip path as a missing URL.
    if doc_vers is None or not doc_vers.url:
        current_app.logger.info(f'Document {doc_id} has no URL, skipping refresh')
        flash('This document has no URL. I can only refresh documents with a URL. skipping refresh', 'alert')
        return

    tenant_id = session["tenant"]["id"]
    new_doc_vers = create_version_for_document(doc, doc_vers.url, doc_vers.language, doc_vers.user_context)

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        current_app.logger.error(f'Error refreshing document {doc_id} for tenant {tenant_id}: {e}')
        flash('Error refreshing document.', 'alert')
        raise
    except Exception as e:
        # Include the document and the error so the log entry is actionable.
        current_app.logger.error(f'Unknown error refreshing document {doc_id} for tenant {tenant_id}: {e}')
        raise

    html = fetch_html(new_doc_vers.url)
    file = io.BytesIO(html)
    # The refreshed content is always stored as HTML.
    extension = 'html'

    current_app.logger.info(f'Document added successfully for tenant {tenant_id}, '
                            f'Document Version {new_doc_vers.id}')

    upload_file_for_version(new_doc_vers, file, extension, tenant_id)

    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        tenant_id,
        new_doc_vers.id,
    ])
    current_app.logger.info(f'Embedding creation started for tenant {tenant_id}, '
                            f'Document Version {new_doc_vers.id}. '
                            f'Embedding creation task: {task.id}')
    flash(f'Processing on document {doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
          'success')
|
|
|
|
|
|
def re_embed_latest_versions():
    """Call ``process_version`` on the highest-id version of every document.

    Documents without any version are skipped.
    """
    for doc in Document.query.all():
        versions_newest_first = (
            DocumentVersion.query
            .filter_by(doc_id=doc.id)
            .order_by(desc(DocumentVersion.id))
        )
        latest_doc_version = versions_newest_first.first()
        if not latest_doc_version:
            continue
        process_version(latest_doc_version.id)
|
|
|
|
|
|
def process_version(version_id):
    """Queue a ``create_embeddings`` task for one document version.

    The task is sent to the ``embeddings`` queue with the tenant id from the
    session and ``version_id`` as arguments. The trigger is logged with the
    current user and a success message is flashed.

    Args:
        version_id: Id of the document version to process.

    Returns:
        A redirect to the document list.
    """
    task_args = [
        session['tenant']['id'],
        version_id,
    ]
    task = current_celery.send_task('create_embeddings', queue='embeddings', args=task_args)

    logger = current_app.logger
    logger.info(f'Embedding creation retriggered by user {current_user.id}, {current_user.email} '
                f'for tenant {session["tenant"]["id"]}, '
                f'Document Version {version_id}. '
                f'Embedding creation task: {task.id}')

    flash(f'Processing for document version {version_id} retriggered successfully...', 'success')

    target = prefixed_url_for('document_bp.documents')
    return redirect(target)
|
|
|
|
|
|
def set_logging_information(obj, timestamp):
    """Stamp creation and update audit fields on ``obj``.

    Args:
        obj: Object receiving ``created_at``, ``updated_at``, ``created_by``
            and ``updated_by`` attributes.
        timestamp: Value stored in both ``*_at`` attributes.

    The ``*_by`` attributes are set to the id of the current user.
    """
    obj.created_at = obj.updated_at = timestamp
    obj.created_by = obj.updated_by = current_user.id
|
|
|
|
|
|
def update_logging_information(obj, timestamp):
    """Stamp the update audit fields on ``obj``.

    Args:
        obj: Object receiving ``updated_at`` and ``updated_by`` attributes.
        timestamp: Value stored in ``updated_at``.

    ``updated_by`` is set to the id of the current user.
    """
    editor_id = current_user.id
    obj.updated_at, obj.updated_by = timestamp, editor_id
|
|
|
|
|
|
def log_session_state(session, msg=""):
    """Write the ``dirty`` and ``new`` collections of ``session`` to the debug log.

    Args:
        session: Object exposing ``dirty`` and ``new`` attributes.
        msg: Prefix placed in front of both log lines.
    """
    logger = current_app.logger
    logger.debug(f"{msg} - Session dirty: {session.dirty}")
    logger.debug(f"{msg} - Session new: {session.new}")
|
|
|
|
|
|
def fetch_html(url, timeout=30):
    """Fetch the raw body of ``url`` over HTTP(S).

    Args:
        url: Address to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the request indefinitely.

    Returns:
        The response body as bytes.

    Raises:
        SSLError: If certificate verification fails outside a DEBUG
            environment. Previously this path set the response to ``None`` and
            then failed with an unrelated ``AttributeError``.
        requests.HTTPError: If the server answers with an error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except SSLError as e:
        current_app.logger.error(f"Error fetching HTML from {url} for tenant {session['tenant']['id']}. "
                                 f"Error Encountered: {e}")
        if not current_app.config.get('DEBUG'):
            # Outside development there is no fallback: surface the real error.
            raise
        current_app.logger.info(f"Skipping SSL verification for {url} for tenant {session['tenant']['id']}. "
                                f"Only while in development environment.")
        # Development only: retry without certificate verification.
        response = requests.get(url, verify=False, timeout=timeout)

    response.raise_for_status()  # Will raise an exception for bad requests
    return response.content
|
|
|
|
|
|
def _text_cell(value):
    """Build a plain-text table cell holding ``value``."""
    return {'value': value, 'class': '', 'type': 'text'}


def prepare_document_data(docs):
    """Convert documents into nested table rows.

    Each document becomes a row of cells: name, creation timestamp, valid-from
    date (empty when unset) and one group cell. The group cell holds one
    sub-row per entry in ``doc.languages`` under the headers Language, Latest
    Version, URL, File Name and Type.

    A language without a latest version is padded with empty cells so that
    every sub-row has as many cells as there are headers; previously such a
    sub-row contained only the language cell.

    Args:
        docs: Iterable of objects exposing ``name``, ``created_at``,
            ``valid_from`` and ``languages``.

    Returns:
        A list with one list of cell dicts per document.
    """
    headers = ['Language', 'Latest Version', 'URL', 'File Name', 'Type']
    rows = []
    for doc in docs:
        # Document basic details
        valid_from = doc.valid_from.strftime("%Y-%m-%d") if doc.valid_from else ''
        doc_row = [
            _text_cell(doc.name),
            _text_cell(doc.created_at.strftime("%Y-%m-%d %H:%M:%S")),
            _text_cell(valid_from),
        ]

        # Nested languages and versions
        languages_rows = []
        for lang in doc.languages:
            lang_row = [_text_cell(lang.language)]
            latest = lang.latest_version

            if latest:
                lang_row.append(_text_cell(latest.created_at.strftime("%Y-%m-%d %H:%M:%S")))
                if latest.url:
                    # A URL is rendered as a link cell pointing at itself.
                    lang_row.append({'value': latest.url,
                                     'class': '', 'type': 'link', 'href': latest.url})
                else:
                    lang_row.append(_text_cell(''))
                lang_row.append(_text_cell(latest.file_name or ''))
                lang_row.append(_text_cell(latest.file_type or ''))
            else:
                # Keep the sub-row aligned with the headers below.
                lang_row.extend(_text_cell('') for _ in headers[1:])

            languages_rows.append(lang_row)

        doc_row.append({'is_group': True, 'colspan': '5',
                        'headers': headers,
                        'sub_rows': languages_rows})
        rows.append(doc_row)
    return rows
|