Add functionality to add a URL to the system.

This commit is contained in:
Josako
2024-05-10 22:44:53 +02:00
parent a4bf837d67
commit 699de951e8
6 changed files with 213 additions and 74 deletions

View File

@@ -0,0 +1,24 @@
{% extends 'base.html' %}
{% from "macros.html" import render_field %}
{# Page with a single POST form for adding a URL. It fills the title, content_title,
   content_description, content and content_footer blocks of base.html and expects a
   WTForms form object in the template variable `form`. #}
{% block title %}Add URL{% endblock %}
{% block content_title %}Add URL{% endblock %}
{% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %}
{% block content %}
<form method="post" enctype="multipart/form-data">
{# hidden_tag() renders the form's hidden fields (with Flask-WTF this presumably includes the CSRF token). #}
{{ form.hidden_tag() }}
{# Both lists are passed to render_field empty, so no field is disabled or excluded here. #}
{% set disabled_fields = [] %}
{% set exclude_fields = [] %}
{# Render every field of the form through the shared render_field macro. #}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Add URL</button>
</form>
{% endblock %}
{# The footer block is overridden with empty content. #}
{% block content_footer %}
{% endblock %}

View File

@@ -78,6 +78,7 @@
{% if current_user.is_authenticated %}
{{ dropdown('Document Mgmt', 'contacts', [
{'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
]) }}
{% endif %}

View File

@@ -1,13 +1,13 @@
from flask import session
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SubmitField, DateField,
SelectField, FieldList, FormField, TextAreaField)
from wtforms.validators import DataRequired, Length, Optional
SelectField, FieldList, FormField, TextAreaField, URLField)
from wtforms.validators import DataRequired, Length, Optional, URL
from flask_wtf.file import FileField, FileAllowed, FileRequired
class AddDocumentForm(FlaskForm):
file = FileField('File', validators=[FileAllowed(['pdf', 'txt']),
file = FileField('File', validators=[FileAllowed(['pdf', 'txt', 'html']),
FileRequired()])
name = StringField('Name', validators=[Length(max=100)])
language = SelectField('Language', choices=[], validators=[Optional()])
@@ -27,4 +27,21 @@ class AddDocumentForm(FlaskForm):
self.doc_embedding_model.data = session.get('default_embedding_model')
class AddURLForm(FlaskForm):
    """Form for adding a document to the system by URL.

    The choices of the ``language`` and ``doc_embedding_model`` select fields
    are read from the tenant stored in the Flask session, so the form must be
    instantiated inside a request in which ``session['tenant']`` is set.
    """

    url = URLField('URL', validators=[DataRequired(), URL()])
    name = StringField('Name', validators=[Length(max=100)])
    language = SelectField('Language', choices=[], validators=[Optional()])
    user_context = TextAreaField('User Context', validators=[Optional()])
    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
    doc_embedding_model = SelectField('Default Embedding Model', choices=[], validators=[DataRequired()])

    submit = SubmitField('Submit')

    def __init__(self, *args, **kwargs):
        """Populate the select choices and, on first display, their defaults.

        Positional and keyword arguments are forwarded to ``FlaskForm`` so the
        form can be constructed like any other form; calling it without
        arguments keeps working.
        """
        super().__init__(*args, **kwargs)

        tenant = session.get('tenant')
        # Choices must always be set: they are needed both to render the
        # select boxes and to validate a submitted value.
        self.language.choices = [(language, language) for language in
                                 tenant.get('allowed_languages')]
        self.doc_embedding_model.choices = [(model, model) for model in
                                            tenant.get('allowed_embedding_models')]

        # super().__init__() has already bound the submitted values to the
        # fields. Assigning .data unconditionally would overwrite the user's
        # selection with the session default on every POST, so the defaults
        # are applied only when the form is merely being displayed.
        if not self.is_submitted():
            self.language.data = session.get('default_language')
            self.doc_embedding_model.data = session.get('default_embedding_model')

View File

@@ -4,15 +4,20 @@ from flask import request, redirect, url_for, flash, render_template, Blueprint,
from flask_security import roles_accepted, current_user
from sqlalchemy import desc
from sqlalchemy.orm import joinedload
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename
from sqlalchemy.exc import SQLAlchemyError
import requests
from requests.exceptions import SSLError
from urllib.parse import urlparse
import io
from common.models.document import Document, DocumentLanguage, DocumentVersion
from common.extensions import db
from .document_forms import AddDocumentForm
from .document_forms import AddDocumentForm, AddURLForm
from common.utils.middleware import mw_before_request
from common.utils.celery_utils import current_celery
document_bp = Blueprint('document_bp', __name__, url_prefix='/document')
@@ -25,69 +30,49 @@ def before_request():
@roles_accepted('Super User', 'Tenant Admin')
def add_document():
form = AddDocumentForm()
error = None
# If the form is submitted
if request.method == 'POST' and form.validate_on_submit():
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
file = form.file.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
# Create the Document
new_doc = Document()
if form.name.data == '':
new_doc.name = filename.rsplit('.', 1)[0]
else:
new_doc.name = form.name.data
create_document_stack(form, file, filename, extension)
if form.valid_from.data or form.valid_from.data != '':
new_doc.valid_from = form.valid_from.data
else:
new_doc.valid_from = dt.now(tz.utc)
new_doc.tenant_id = session['tenant']['id']
set_logging_information(new_doc, dt.now(tz.utc))
# Create the DocumentLanguage
new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data)
# Create the DocumentVersion
new_doc_vers = DocumentVersion()
new_doc_vers.document_language = new_doc_lang
set_logging_information(new_doc_vers, dt.now(tz.utc))
try:
db.session.add(new_doc)
db.session.add(new_doc_lang)
db.session.add(new_doc_vers)
db.session.commit()
new_doc_lang.latest_version = new_doc_vers
db.session.commit()
except Exception as e:
db.session.rollback()
error = e.args
# Save the file and process the document
if error is None:
flash('Document added successfully.', 'success')
current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc.id}')
upload_file_for_version(new_doc_vers, file, extension)
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
session['tenant']['id'],
new_doc_vers.id,
session['default_embedding_model'],
])
current_app.logger.info(f'Document processing started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}, '
f'Task ID {task.id}')
print('Processing should start soon')
else:
flash('Error adding document.', 'error')
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {error}')
return redirect(url_for('document_bp/documents'))
return render_template('document/add_document.html', form=form)
@document_bp.route('/add_url', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_url():
    """Show the "Add URL" form and, on a valid POST, store the page as a document.

    The HTML behind the submitted URL is fetched, wrapped in an in-memory
    text stream and handed to ``create_document_stack`` together with a file
    name derived from the last segment of the URL path.

    Returns:
        A redirect to the documents view after the document stack was
        created; otherwise the rendered ``document/add_url.html`` template
        (initial display, failed validation, or the URL could not be fetched).
    """
    form = AddURLForm()

    # If the form is submitted
    if request.method == 'POST' and form.validate_on_submit():
        current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
        url = form.url.data

        # NOTE(review): the server fetches a user-supplied URL. Access is
        # limited to the roles above, but consider restricting reachable
        # hosts (internal addresses) if that is a concern.
        try:
            html = fetch_html(url)
        except requests.RequestException as e:
            # Connection problems, timeouts, SSL failures and HTTP error
            # statuses are expected for some URLs; report them to the user
            # instead of answering with an unhandled server error.
            current_app.logger.error(f'Error fetching URL {url} for tenant {session["tenant"]["id"]}: {e}')
            flash('Error fetching URL. Download the html and add it as a document instead.', 'error')
            return render_template('document/add_url.html', form=form)
        file = io.StringIO(html)

        # Use the last path segment as file name; fall back to 'index' for
        # URLs that end in '/' or have no path, and always end in '.html'.
        filename = urlparse(url).path.split('/')[-1]
        if filename == '':
            filename = 'index'
        if not filename.endswith('.html'):
            filename += '.html'
        extension = 'html'

        create_document_stack(form, file, filename, extension)

        return redirect(url_for('document_bp.documents'))

    return render_template('document/add_url.html', form=form)
@document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def documents():
@@ -122,6 +107,92 @@ def set_logging_information(obj, timestamp):
obj.updated_by = current_user.id
def create_document_stack(form, file, filename, extension):
    """Create and persist a Document, its DocumentLanguage and a first DocumentVersion.

    After both commits succeed, the file is stored through
    ``upload_file_for_version`` and a ``create_embeddings`` task is sent to
    the ``embeddings`` queue.

    Args:
        form: Submitted form; ``language`` and ``user_context`` are read here,
            the remaining fields are read by ``create_document``.
        file: Object holding the document content, passed on unchanged to
            ``upload_file_for_version``.
        filename: File name, used by ``create_document`` to derive a name.
        extension: File type, passed on to ``upload_file_for_version``.

    Raises:
        Exception: Any error raised while committing is logged, flashed to
            the user, followed by a session rollback, and then re-raised.
    """
    # Create the Document
    new_doc = create_document(form, filename)

    # Create the DocumentLanguage
    new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data)

    # Create the DocumentVersion
    new_doc_vers = DocumentVersion()
    new_doc_vers.document_language = new_doc_lang
    set_logging_information(new_doc_vers, dt.now(tz.utc))

    tenant_id = session['tenant']['id']

    try:
        db.session.add(new_doc)
        db.session.add(new_doc_lang)
        db.session.add(new_doc_vers)
        log_session_state(db.session, "Before first commit")
        db.session.commit()
        log_session_state(db.session, "After first commit")
    except Exception as e:
        # Every failure is handled the same way (SQLAlchemyError included):
        # the session must not be left in a failed state for later use.
        current_app.logger.error(f'Error adding document for tenant {tenant_id}: {e}')
        flash('Error adding document.', 'error')
        db.session.rollback()
        raise

    # The version only gets its id with the first commit, so the link from
    # the language to its latest version is written in a second commit.
    try:
        new_doc_lang = db.session.merge(new_doc_lang)
        new_doc_vers = db.session.merge(new_doc_vers)
        new_doc_lang.latest_version_id = new_doc_vers.id
        log_session_state(db.session, "Before second commit")
        db.session.commit()
        log_session_state(db.session, "After second commit")
    except Exception as e:
        current_app.logger.error(f'Error adding document for tenant {tenant_id}: {e}')
        flash('Error adding document.', 'error')
        db.session.rollback()
        raise

    current_app.logger.info(f'Document added successfully for tenant {tenant_id}, '
                            f'Document {new_doc.id}, Document Version {new_doc_vers.id}')

    upload_file_for_version(new_doc_vers, file, extension)

    # NOTE(review): the task receives the session's default embedding model,
    # not form.doc_embedding_model.data - confirm this is intended.
    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        tenant_id,
        new_doc_vers.id,
        session['default_embedding_model'],
    ])
    current_app.logger.info(f'Embedding creation started for tenant {tenant_id}, '
                            f'Document Version {new_doc_vers.id}. '
                            f'Embedding creation task: {task.id}')
def log_session_state(session, msg=""):
    """Write the ``dirty`` and ``new`` collections of *session* to the debug log.

    Args:
        session: Object exposing ``dirty`` and ``new`` attributes (called with
            ``db.session`` in this file).
        msg: Prefix placed in front of both log lines.
    """
    debug = current_app.logger.debug
    debug(f"{msg} - Session dirty: {session.dirty}")
    debug(f"{msg} - Session new: {session.new}")
def create_document(form, filename):
    """Build a new, not yet persisted, Document from the submitted form.

    Args:
        form: Form providing the ``name`` and ``valid_from`` fields.
        filename: File name; its part before the last '.' is used as the
            document name when the form does not supply one.

    Returns:
        The new Document, with tenant and logging information set.
    """
    new_doc = Document()

    # An empty string and None both mean "no name given".
    if not form.name.data:
        new_doc.name = filename.rsplit('.', 1)[0]
    else:
        new_doc.name = form.name.data

    # A date field left empty carries None. The previous test
    # `data or data != ''` was true for None, so None was stored and the
    # fallback to the current time was never reached.
    if form.valid_from.data:
        new_doc.valid_from = form.valid_from.data
    else:
        new_doc.valid_from = dt.now(tz.utc)

    new_doc.tenant_id = session['tenant']['id']
    set_logging_information(new_doc, dt.now(tz.utc))

    return new_doc
def create_language_for_document(document, language, user_context):
new_doc_lang = DocumentLanguage()
if language == '':
@@ -140,7 +211,6 @@ def create_language_for_document(document, language, user_context):
def upload_file_for_version(doc_vers, file, extension):
error = None
doc_vers.file_type = extension
doc_vers.file_name = doc_vers.calc_file_name()
doc_vers.file_location = doc_vers.calc_file_location()
@@ -148,21 +218,45 @@ def upload_file_for_version(doc_vers, file, extension):
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], doc_vers.file_location)
if not os.path.exists(upload_path):
os.makedirs(upload_path, exist_ok=True)
file.save(os.path.join(upload_path, doc_vers.file_name))
if isinstance(file, FileStorage):
file.save(os.path.join(upload_path, doc_vers.file_name))
elif isinstance(file, io.StringIO):
# It's a StringIO object, handle accordingly
# Example: write content to a file manually
content = file.getvalue()
with open(os.path.join(upload_path, doc_vers.file_name), 'w', encoding='utf-8') as file:
file.write(content)
else:
raise TypeError('Unsupported file type.')
try:
db.session.commit()
except Exception as e:
except SQLAlchemyError as e:
db.session.rollback()
error = e.args
if error is None:
flash('Document saved successfully.', 'success')
current_app.logger.info(f'Starting Doucment processing for tenant {session['tenant']['id']} for document '
f'version {doc_vers.id}')
# TODO: processing of document to embeddings (async)
flash('Document processing started.', 'info')
else:
flash('Error saving document.', 'error')
current_app.logger.error(f'Error saving document for tenant {session["tenant"]["id"]}: {error}')
current_app.logger.error(
f'Error saving document for tenant {session["tenant"]["id"]} while uploading file: {error}')
current_app.logger.info(f'Succesfully saved document for tenant {session['tenant']['id']} for '
f'document version {doc_vers.id} while uploading file.')
def fetch_html(url, timeout=30):
    """Fetch the HTML content behind *url*.

    Args:
        url: Address to retrieve with an HTTP GET.
        timeout: Seconds to wait for the server before giving up, so a
            stalled server cannot block the request indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.SSLError: Certificate verification failed and the
            application is not running with ``DEBUG`` enabled.
        requests.exceptions.RequestException: Any other connection problem,
            a timeout, or an HTTP error status (via ``raise_for_status``).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except SSLError as e:
        current_app.logger.error(f"Error fetching HTML from {url} for tenant {session['tenant']['id']}. "
                                 f"Error Encountered: {e}")
        if not current_app.config.get('DEBUG'):
            # Outside development the SSL failure is final. Re-raise it
            # rather than continuing with no response object, which would
            # only surface later as an unrelated AttributeError.
            raise
        # only allow when in a development environment
        current_app.logger.info(f"Skipping SSL verification for {url} for tenant {session['tenant']['id']}. "
                                f"Only while in development environment.")
        response = requests.get(url, verify=False, timeout=timeout)  # Disable SSL verification

    response.raise_for_status()  # Will raise an exception for bad requests
    return response.text
# Sample code for adding or updating versions and ensuring latest_version is set in DocumentLanguage
@@ -203,10 +297,10 @@ def prepare_document_data(docs):
# Latest version details if available (should be available ;-) )
if lang.latest_version:
lang_row.append({'value': lang.latest_version.created_at.strftime("%Y-%m-%d %H:%M:%S"),
'class': '', 'type': 'text'})
'class': '', 'type': 'text'})
if lang.latest_version.url:
lang_row.append({'value': lang.latest_version.url,
'class': '', 'type': 'link', 'href': lang.latest_version.url})
'class': '', 'type': 'link', 'href': lang.latest_version.url})
else:
lang_row.append({'value': '', 'class': '', 'type': 'text'})