- Improvements on document uploads (accept other files than html-files when entering a URL)

- Introduction of API-functionality (to be continued). Deduplication of document and url uploads between views and api.
- Improvements on document processing - introduction of processor classes to streamline document inputs
- Removed pure Youtube functionality, as Youtube retrieval of documents continuously changes. But added upload of srt, mp3, ogg and mp4
This commit is contained in:
Josako
2024-09-02 12:37:44 +02:00
parent a158655247
commit 914c265afe
21 changed files with 1425 additions and 852 deletions

View File

@@ -27,7 +27,6 @@ def create_app(config_file=None):
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_port=1)
environment = os.getenv('FLASK_ENV', 'development')
print(environment)
match environment:
case 'development':
@@ -49,8 +48,6 @@ def create_app(config_file=None):
logger = logging.getLogger(__name__)
logger.info("eveai_app starting up")
logger.debug("start config")
logger.debug(app.config)
# Register extensions
@@ -95,14 +92,11 @@ def create_app(config_file=None):
}
return jsonify(response), 500
@app.before_request
def before_request():
# app.logger.debug(f"Before request - Session ID: {session.sid}")
app.logger.debug(f"Before request - Session data: {session}")
app.logger.debug(f"Before request - Request headers: {request.headers}")
# Register API
register_api(app)
# @app.before_request
# def before_request():
# # app.logger.debug(f"Before request - Session ID: {session.sid}")
# app.logger.debug(f"Before request - Session data: {session}")
# app.logger.debug(f"Before request - Request headers: {request.headers}")
# Register template filters
register_filters(app)
@@ -138,9 +132,3 @@ def register_blueprints(app):
app.register_blueprint(security_bp)
from .views.interaction_views import interaction_bp
app.register_blueprint(interaction_bp)
def register_api(app):
pass
# from . import api
# app.register_blueprint(api.bp, url_prefix='/api')

View File

@@ -84,7 +84,6 @@
{'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add a list of URLs', 'url': '/document/add_urls', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'All Document Versions', 'url': '/document/document_versions_list', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']},

View File

@@ -1,14 +1,21 @@
from flask import session
from flask import session, current_app
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SubmitField, DateField,
SelectField, FieldList, FormField, TextAreaField, URLField)
from wtforms.validators import DataRequired, Length, Optional, URL
from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError
from flask_wtf.file import FileField, FileAllowed, FileRequired
def allowed_file(form, field):
if field.data:
filename = field.data.filename
allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
raise ValidationError('Unsupported file type.')
class AddDocumentForm(FlaskForm):
file = FileField('File', validators=[FileAllowed(['pdf', 'txt', 'html']),
FileRequired()])
file = FileField('File', validators=[FileRequired(), allowed_file])
name = StringField('Name', validators=[Length(max=100)])
language = SelectField('Language', choices=[], validators=[Optional()])
user_context = TextAreaField('User Context', validators=[Optional()])

View File

@@ -18,6 +18,10 @@ from minio.error import S3Error
from common.models.document import Document, DocumentVersion
from common.extensions import db, minio_client
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
process_multiple_urls, prepare_youtube_document, create_version_for_document, upload_file_for_version
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
EveAIDoubleURLException, EveAIYoutubeError
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm, \
AddURLsForm
from common.utils.middleware import mw_before_request
@@ -56,30 +60,37 @@ def before_request():
@roles_accepted('Super User', 'Tenant Admin')
def add_document():
form = AddDocumentForm()
current_app.logger.debug('Adding document')
# If the form is submitted
if form.validate_on_submit():
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
file = form.file.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
form_dict = form_to_dict(form)
try:
current_app.logger.debug('Validating file type')
tenant_id = session['tenant']['id']
file = form.file.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
validate_file_type(extension)
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
new_doc_vers.id,
])
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. '
f'Embedding creation task: {task.id}')
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
'success')
api_input = {
'name': form.name.data,
'language': form.language.data,
'user_context': form.user_context.data,
'valid_from': form.valid_from.data
}
return redirect(prefixed_url_for('document_bp.documents'))
else:
form_validation_failed(request, form)
new_doc, new_doc_vers = create_document_stack(api_input, file, filename, extension, tenant_id)
task_id = start_embedding_task(tenant_id, new_doc_vers.id)
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents'))
except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
flash(str(e), 'error')
except Exception as e:
current_app.logger.error(f'Error adding document: {str(e)}')
flash('An error occurred while adding the document.', 'error')
return render_template('document/add_document.html', form=form)
@@ -90,189 +101,107 @@ def add_url():
form = AddURLForm()
if form.validate_on_submit():
current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
url = form.url.data
try:
response = requests.head(url, allow_redirects=True)
content_type = response.headers.get('Content-Type', '').split(';')[0]
tenant_id = session['tenant']['id']
url = form.url.data
# Determine file extension based on Content-Type
extension = get_extension_from_content_type(content_type)
file_content, filename, extension = process_url(url, tenant_id)
# Generate filename
parsed_url = urlparse(url)
path = unquote(parsed_url.path)
filename = os.path.basename(path)
api_input = {
'name': form.name.data or filename,
'url': url,
'language': form.language.data,
'user_context': form.user_context.data,
'valid_from': form.valid_from.data
}
if not filename or '.' not in filename:
# Use the last part of the path or a default name
filename = path.strip('/').split('/')[-1] or 'document'
filename = secure_filename(f"{filename}.{extension}")
else:
filename = secure_filename(filename)
new_doc, new_doc_vers = create_document_stack(api_input, file_content, filename, extension, tenant_id)
task_id = start_embedding_task(tenant_id, new_doc_vers.id)
# Check if a document with this URL already exists
existing_doc = DocumentVersion.query.filter_by(url=url).first()
if existing_doc:
flash(f'A document with URL {url} already exists. No new document created.', 'info')
return redirect(prefixed_url_for('document_bp.documents'))
# Download the content
response = requests.get(url)
response.raise_for_status()
file_content = response.content
# Create document and document version
form_dict = form_to_dict(form)
new_doc, new_doc_vers = create_document_stack(form_dict, file_content, filename, extension)
# Upload file to storage
minio_client.upload_document_file(
session['tenant']['id'],
new_doc_vers.doc_id,
new_doc_vers.language,
new_doc_vers.id,
filename,
file_content
)
# Start embedding task
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
new_doc_vers.id,
])
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. '
f'Embedding creation task: {task.id}')
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents'))
except requests.RequestException as e:
current_app.logger.error(f'Error fetching URL {url}: {str(e)}')
flash(f'Error fetching URL: {str(e)}', 'danger')
except SQLAlchemyError as e:
current_app.logger.error(f'Database error: {str(e)}')
flash('An error occurred while saving the document.', 'danger')
except EveAIDoubleURLException:
flash(f'A document with url {url} already exists. No new document created.', 'info')
except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
flash(str(e), 'error')
except Exception as e:
current_app.logger.error(f'Unexpected error: {str(e)}')
flash('An unexpected error occurred.', 'danger')
current_app.logger.error(f'Error adding document: {str(e)}')
flash('An error occurred while adding the document.', 'error')
return render_template('document/add_url.html', form=form)
def get_extension_from_content_type(content_type):
content_type_map = {
'text/html': 'html',
'application/pdf': 'pdf',
'text/plain': 'txt',
'application/msword': 'doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
# Add more mappings as needed
}
return content_type_map.get(content_type, 'html') # Default to 'html' if unknown
@document_bp.route('/add_urls', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_urls():
form = AddURLsForm()
if form.validate_on_submit():
urls = form.urls.data.split('\n')
urls = [url.strip() for url in urls if url.strip()]
try:
tenant_id = session['tenant']['id']
urls = form.urls.data.split('\n')
urls = [url.strip() for url in urls if url.strip()]
for i, url in enumerate(urls):
try:
doc_vers = DocumentVersion.query.filter_by(url=url).all()
if doc_vers:
current_app.logger.info(f'A document with url {url} already exists. No new document created.')
flash(f'A document with url {url} already exists. No new document created.', 'info')
continue
api_input = {
'name': form.name.data,
'language': form.language.data,
'user_context': form.user_context.data,
'valid_from': form.valid_from.data
}
html = fetch_html(url)
file = io.BytesIO(html)
results = process_multiple_urls(urls, tenant_id, api_input)
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
filename = path_parts[-1] if path_parts[-1] else 'index'
if not filename.endswith('.html'):
filename += '.html'
for result in results:
if result['status'] == 'success':
flash(
f"Processed URL: {result['url']} - Document ID: {result['document_id']}, Version ID: {result['document_version_id']}",
'success')
else:
flash(f"Error processing URL: {result['url']} - {result['message']}", 'error')
# Use the name prefix if provided, otherwise use the filename
doc_name = f"{form.name.data}-{filename}" if form.name.data else filename
return redirect(prefixed_url_for('document_bp.documents'))
new_doc, new_doc_vers = create_document_stack({
'name': doc_name,
'url': url,
'language': form.language.data,
'user_context': form.user_context.data,
'valid_from': form.valid_from.data
}, file, filename, 'html')
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
new_doc_vers.id,
])
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. '
f'Embedding creation task: {task.id}')
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
'success')
except Exception as e:
current_app.logger.error(f"Error processing URL {url}: {str(e)}")
flash(f'Error processing URL {url}: {str(e)}', 'danger')
return redirect(prefixed_url_for('document_bp.documents'))
else:
form_validation_failed(request, form)
except Exception as e:
current_app.logger.error(f'Error adding multiple URLs: {str(e)}')
flash('An error occurred while adding the URLs.', 'error')
return render_template('document/add_urls.html', form=form)
@document_bp.route('/add_youtube', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_youtube():
form = AddYoutubeForm()
if form.validate_on_submit():
current_app.logger.info(f'Adding Youtube document for tenant {session["tenant"]["id"]}')
url = form.url.data
current_app.logger.debug(f'Value of language field: {form.language.data}')
try:
tenant_id = session['tenant']['id']
url = form.url.data
doc_vers = DocumentVersion.query.filter_by(url=url).all()
if doc_vers:
current_app.logger.info(f'A document with url {url} already exists. No new document created.')
flash(f'A document with url {url} already exists. No new document created.', 'info')
api_input = {
'name': form.name.data,
'language': form.language.data,
'user_context': form.user_context.data,
'valid_from': form.valid_from.data
}
new_doc, new_doc_vers = prepare_youtube_document(url, tenant_id, api_input)
task_id = start_embedding_task(tenant_id, new_doc_vers.id)
flash(
f'Processing on YouTube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents'))
# As downloading a Youtube document can take quite some time, we offload this downloading to the worker
# We just pass a simple file to get things conform
file = "Youtube placeholder file"
filename = 'placeholder.youtube'
extension = 'youtube'
form_dict = form_to_dict(form)
current_app.logger.debug(f'Form data: {form_dict}')
new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
new_doc_vers.id,
])
current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant '
f'{session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. '
f'Processing and Embedding Youtube task: {task.id}')
flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents'))
else:
form_validation_failed(request, form)
except EveAIYoutubeError as e:
flash(str(e), 'error')
except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
flash(str(e), 'error')
except Exception as e:
current_app.logger.error(f'Error adding YouTube document: {str(e)}')
flash('An error occurred while adding the YouTube document.', 'error')
return render_template('document/add_youtube.html', form=form)
@@ -487,7 +416,7 @@ def refresh_document(doc_id):
current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}')
upload_file_for_version(new_doc_vers, file, extension)
upload_file_for_version(new_doc_vers, file, extension, session["tenant"]["id"])
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
@@ -535,116 +464,11 @@ def update_logging_information(obj, timestamp):
obj.updated_by = current_user.id
def create_document_stack(form, file, filename, extension):
# Create the Document
new_doc = create_document(form, filename)
# Create the DocumentVersion
new_doc_vers = create_version_for_document(new_doc,
form.get('url', ''),
form.get('language', 'en'),
form.get('user_context', '')
)
try:
db.session.add(new_doc)
db.session.add(new_doc_vers)
db.session.commit()
except SQLAlchemyError as e:
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}')
flash('Error adding document.', 'alert')
db.session.rollback()
error = e.args
raise
except Exception as e:
current_app.logger.error('Unknown error')
raise
current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc.id}')
upload_file_for_version(new_doc_vers, file, extension)
return new_doc, new_doc_vers
def log_session_state(session, msg=""):
current_app.logger.debug(f"{msg} - Session dirty: {session.dirty}")
current_app.logger.debug(f"{msg} - Session new: {session.new}")
def create_document(form, filename):
new_doc = Document()
if form['name'] == '':
new_doc.name = filename.rsplit('.', 1)[0]
else:
new_doc.name = form['name']
if form['valid_from'] and form['valid_from'] != '':
new_doc.valid_from = form['valid_from']
else:
new_doc.valid_from = dt.now(tz.utc)
new_doc.tenant_id = session['tenant']['id']
set_logging_information(new_doc, dt.now(tz.utc))
return new_doc
def create_version_for_document(document, url, language, user_context):
new_doc_vers = DocumentVersion()
if url != '':
new_doc_vers.url = url
if language == '':
new_doc_vers.language = session['default_language']
else:
new_doc_vers.language = language
if user_context != '':
new_doc_vers.user_context = user_context
new_doc_vers.document = document
set_logging_information(new_doc_vers, dt.now(tz.utc))
return new_doc_vers
def upload_file_for_version(doc_vers, file, extension):
doc_vers.file_type = extension
doc_vers.file_name = doc_vers.calc_file_name()
doc_vers.file_location = doc_vers.calc_file_location()
# Normally, the tenant bucket should exist. But let's be on the safe side if a migration took place.
tenant_id = session['tenant']['id']
minio_client.create_tenant_bucket(tenant_id)
try:
minio_client.upload_document_file(
tenant_id,
doc_vers.doc_id,
doc_vers.language,
doc_vers.id,
doc_vers.file_name,
file
)
db.session.commit()
current_app.logger.info(f'Successfully saved document to MinIO for tenant {tenant_id} for '
f'document version {doc_vers.id} while uploading file.')
except S3Error as e:
db.session.rollback()
flash('Error saving document to MinIO.', 'error')
current_app.logger.error(
f'Error saving document to MinIO for tenant {tenant_id}: {e}')
raise
except SQLAlchemyError as e:
db.session.rollback()
flash('Error saving document metadata.', 'error')
current_app.logger.error(
f'Error saving document metadata for tenant {tenant_id}: {e}')
raise
def fetch_html(url):
# Fetches HTML content from a URL
try: