diff --git a/config/config.py b/config/config.py index 9193b17..bfceda5 100644 --- a/config/config.py +++ b/config/config.py @@ -52,15 +52,16 @@ class Config(object): CELERY_ENABLE_UTC = True # LLM TEMPLATES - GPT4_SUMMARY_TEMPLATE = """Summarise the text in the same language as the provided text between triple backquotes. - ```{context}```""" + GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text. + Text is delimited between triple backquotes. + ```{text}```""" class DevConfig(Config): DEVELOPMENT = True DEBUG = True FLASK_DEBUG = True - PYCHARM_DEBUG = True + PYCHARM_DEBUG = False SQLALCHEMY_DATABASE_URI = 'postgresql+pg8000://josako@localhost:5432/eveAI' SQLALCHEMY_BINDS = {'public': 'postgresql+pg8000://josako@localhost:5432/eveAI'} EXPLAIN_TEMPLATE_LOADING = False diff --git a/eveai_app/templates/document/add_url.html b/eveai_app/templates/document/add_url.html new file mode 100644 index 0000000..b90cca3 --- /dev/null +++ b/eveai_app/templates/document/add_url.html @@ -0,0 +1,24 @@ +{% extends 'base.html' %} +{% from "macros.html" import render_field %} + +{% block title %}Add URL{% endblock %} + +{% block content_title %}Add URL{% endblock %} +{% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %} + +{% block content %} +
+ {{ form.hidden_tag() }} + {% set disabled_fields = [] %} + {% set exclude_fields = [] %} + {% for field in form %} + {{ render_field(field, disabled_fields, exclude_fields) }} + {% endfor %} + +
+{% endblock %} + + +{% block content_footer %} + +{% endblock %} \ No newline at end of file diff --git a/eveai_app/templates/navbar.html b/eveai_app/templates/navbar.html index 83d3ce3..b0118fe 100644 --- a/eveai_app/templates/navbar.html +++ b/eveai_app/templates/navbar.html @@ -78,6 +78,7 @@ {% if current_user.is_authenticated %} {{ dropdown('Document Mgmt', 'contacts', [ {'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']}, + {'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']}, ]) }} {% endif %} diff --git a/eveai_app/views/document_forms.py b/eveai_app/views/document_forms.py index 3b0a670..6346dde 100644 --- a/eveai_app/views/document_forms.py +++ b/eveai_app/views/document_forms.py @@ -1,13 +1,13 @@ from flask import session from flask_wtf import FlaskForm from wtforms import (StringField, BooleanField, SubmitField, DateField, - SelectField, FieldList, FormField, TextAreaField) -from wtforms.validators import DataRequired, Length, Optional + SelectField, FieldList, FormField, TextAreaField, URLField) +from wtforms.validators import DataRequired, Length, Optional, URL from flask_wtf.file import FileField, FileAllowed, FileRequired class AddDocumentForm(FlaskForm): - file = FileField('File', validators=[FileAllowed(['pdf', 'txt']), + file = FileField('File', validators=[FileAllowed(['pdf', 'txt', 'html']), FileRequired()]) name = StringField('Name', validators=[Length(max=100)]) language = SelectField('Language', choices=[], validators=[Optional()]) @@ -27,4 +27,21 @@ class AddDocumentForm(FlaskForm): self.doc_embedding_model.data = session.get('default_embedding_model') +class AddURLForm(FlaskForm): + url = URLField('URL', validators=[DataRequired(), URL()]) + name = StringField('Name', validators=[Length(max=100)]) + language = SelectField('Language', choices=[], 
validators=[Optional()]) + user_context = TextAreaField('User Context', validators=[Optional()]) + valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()]) + doc_embedding_model = SelectField('Default Embedding Model', choices=[], validators=[DataRequired()]) + submit = SubmitField('Submit') + + def __init__(self): + super().__init__() + self.language.choices = [(language, language) for language in + session.get('tenant').get('allowed_languages')] + self.language.data = self.language.data or session.get('default_language') + self.doc_embedding_model.choices = [(model, model) for model in + session.get('tenant').get('allowed_embedding_models')] + self.doc_embedding_model.data = self.doc_embedding_model.data or session.get('default_embedding_model') diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py index 7c68a4a..b68daa3 100644 --- a/eveai_app/views/document_views.py +++ b/eveai_app/views/document_views.py @@ -4,15 +4,20 @@ from flask import request, redirect, url_for, flash, render_template, Blueprint, from flask_security import roles_accepted, current_user from sqlalchemy import desc from sqlalchemy.orm import joinedload +from werkzeug.datastructures import FileStorage from werkzeug.utils import secure_filename +from sqlalchemy.exc import SQLAlchemyError +import requests +from requests.exceptions import SSLError +from urllib.parse import urlparse +import io from common.models.document import Document, DocumentLanguage, DocumentVersion from common.extensions import db -from .document_forms import AddDocumentForm +from .document_forms import AddDocumentForm, AddURLForm from common.utils.middleware import mw_before_request from common.utils.celery_utils import current_celery - document_bp = Blueprint('document_bp', __name__, url_prefix='/document') @@ -25,69 +30,49 @@ def before_request(): @roles_accepted('Super User', 'Tenant Admin') def add_document(): form = AddDocumentForm() - error = None # If the form is submitted if request.method == 'POST' and 
form.validate_on_submit(): + current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') file = form.file.data filename = secure_filename(file.filename) extension = filename.rsplit('.', 1)[1].lower() - # Create the Document - new_doc = Document() - if form.name.data == '': - new_doc.name = filename.rsplit('.', 1)[0] - else: - new_doc.name = form.name.data + create_document_stack(form, file, filename, extension) - if form.valid_from.data or form.valid_from.data != '': - new_doc.valid_from = form.valid_from.data - else: - new_doc.valid_from = dt.now(tz.utc) - new_doc.tenant_id = session['tenant']['id'] - set_logging_information(new_doc, dt.now(tz.utc)) - - # Create the DocumentLanguage - new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data) - - # Create the DocumentVersion - new_doc_vers = DocumentVersion() - new_doc_vers.document_language = new_doc_lang - set_logging_information(new_doc_vers, dt.now(tz.utc)) - - try: - db.session.add(new_doc) - db.session.add(new_doc_lang) - db.session.add(new_doc_vers) - db.session.commit() - new_doc_lang.latest_version = new_doc_vers - db.session.commit() - except Exception as e: - db.session.rollback() - error = e.args - - # Save the file and process the document - if error is None: - flash('Document added successfully.', 'success') - current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, ' - f'Document Version {new_doc.id}') - upload_file_for_version(new_doc_vers, file, extension) - task = current_celery.send_task('create_embeddings', queue='embeddings', args=[ - session['tenant']['id'], - new_doc_vers.id, - session['default_embedding_model'], - ]) - current_app.logger.info(f'Document processing started for tenant {session["tenant"]["id"]}, ' - f'Document Version {new_doc_vers.id}, ' - f'Task ID {task.id}') - print('Processing should start soon') - else: - flash('Error adding document.', 'error') - 
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {error}') + return redirect(url_for('document_bp.documents')) return render_template('document/add_document.html', form=form) +@document_bp.route('/add_url', methods=['GET', 'POST']) +@roles_accepted('Super User', 'Tenant Admin') +def add_url(): + form = AddURLForm() + + # If the form is submitted + if request.method == 'POST' and form.validate_on_submit(): + current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') + url = form.url.data + + html = fetch_html(url) + file = io.StringIO(html) + + parsed_url = urlparse(url) + path_parts = parsed_url.path.split('/') + filename = path_parts[-1] + if filename == '': + filename = 'index' + if not filename.endswith('.html'): + filename += '.html' + extension = 'html' + + create_document_stack(form, file, filename, extension) + return redirect(url_for('document_bp.documents')) + + return render_template('document/add_url.html', form=form) + + @document_bp.route('/documents', methods=['GET', 'POST']) @roles_accepted('Super User', 'Tenant Admin') def documents(): @@ -122,6 +107,92 @@ def set_logging_information(obj, timestamp): obj.updated_by = current_user.id +def create_document_stack(form, file, filename, extension): + # Create the Document + new_doc = create_document(form, filename) + + # Create the DocumentLanguage + new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data) + + # Create the DocumentVersion + new_doc_vers = DocumentVersion() + new_doc_vers.document_language = new_doc_lang + set_logging_information(new_doc_vers, dt.now(tz.utc)) + + try: + db.session.add(new_doc) + db.session.add(new_doc_lang) + db.session.add(new_doc_vers) + log_session_state(db.session, "Before first commit") + db.session.commit() + log_session_state(db.session, "After first commit") + except SQLAlchemyError as e: + current_app.logger.error(f'Error adding document for tenant 
{session["tenant"]["id"]}: {e}') + flash('Error adding document.', 'error') + db.session.rollback() + error = e.args + raise + except Exception as e: + current_app.logger.error('Unknown error') + raise + + try: + new_doc_lang = db.session.merge(new_doc_lang) + new_doc_vers = db.session.merge(new_doc_vers) + new_doc_lang.latest_version_id = new_doc_vers.id + log_session_state(db.session, "Before second commit") + db.session.commit() + log_session_state(db.session, "After second commit") + except SQLAlchemyError as e: + current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}') + flash('Error adding document.', 'error') + db.session.rollback() + error = e.args + raise + except Exception as e: + current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}') + flash('Error adding document.', 'error') + db.session.rollback() + error = e.args + raise + + current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, ' + f'Document Version {new_doc.id}') + + upload_file_for_version(new_doc_vers, file, extension) + task = current_celery.send_task('create_embeddings', queue='embeddings', args=[ + session['tenant']['id'], + new_doc_vers.id, + session['default_embedding_model'], + ]) + + current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, ' + f'Document Version {new_doc_vers.id}. 
' + f'Embedding creation task: {task.id}') + + +def log_session_state(session, msg=""): + current_app.logger.debug(f"{msg} - Session dirty: {session.dirty}") + current_app.logger.debug(f"{msg} - Session new: {session.new}") + + +def create_document(form, filename): + new_doc = Document() + if not form.name.data: + new_doc.name = filename.rsplit('.', 1)[0] + else: + new_doc.name = form.name.data + + if form.valid_from.data: + new_doc.valid_from = form.valid_from.data + else: + new_doc.valid_from = dt.now(tz.utc) + new_doc.tenant_id = session['tenant']['id'] + set_logging_information(new_doc, dt.now(tz.utc)) + + return new_doc + + def create_language_for_document(document, language, user_context): new_doc_lang = DocumentLanguage() if language == '': @@ -140,7 +211,6 @@ def create_language_for_document(document, language, user_context): def upload_file_for_version(doc_vers, file, extension): - error = None doc_vers.file_type = extension doc_vers.file_name = doc_vers.calc_file_name() doc_vers.file_location = doc_vers.calc_file_location() @@ -148,21 +218,45 @@ def upload_file_for_version(doc_vers, file, extension): upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], doc_vers.file_location) if not os.path.exists(upload_path): os.makedirs(upload_path, exist_ok=True) - file.save(os.path.join(upload_path, doc_vers.file_name)) + if isinstance(file, FileStorage): + file.save(os.path.join(upload_path, doc_vers.file_name)) + elif isinstance(file, io.StringIO): + # It's a StringIO object, handle accordingly + # Example: write content to a file manually + content = file.getvalue() + with open(os.path.join(upload_path, doc_vers.file_name), 'w', encoding='utf-8') as out_file: + out_file.write(content) + else: + raise TypeError('Unsupported file type.') + try: db.session.commit() - except Exception as e: + except SQLAlchemyError as e: db.session.rollback() - error = e.args - if error is None: - flash('Document saved successfully.', 'success') - 
current_app.logger.info(f'Starting Doucment processing for tenant {session['tenant']['id']} for document ' - f'version {doc_vers.id}') - # TODO: processing of document to embeddings (async) - flash('Document processing started.', 'info') - else: flash('Error saving document.', 'error') - current_app.logger.error(f'Error saving document for tenant {session["tenant"]["id"]}: {error}') + current_app.logger.error( + f'Error saving document for tenant {session["tenant"]["id"]} while uploading file: {e}') + else: + current_app.logger.info(f'Succesfully saved document for tenant {session["tenant"]["id"]} for ' + f'document version {doc_vers.id} while uploading file.') + + +def fetch_html(url): + # Fetches HTML content from a URL + try: + response = requests.get(url, timeout=30) + except SSLError as e: + current_app.logger.error(f"Error fetching HTML from {url} for tenant {session['tenant']['id']}. " + f"Error Encountered: {e}") + if current_app.config.get('DEBUG'): # only allow when in a development environment + current_app.logger.info(f"Skipping SSL verification for {url} for tenant {session['tenant']['id']}. 
" + f"Only while in development environment.") + response = requests.get(url, verify=False, timeout=30) # Disable SSL verification + else: + raise # Re-raise the SSLError; never skip verification outside development + + response.raise_for_status() # Will raise an exception for bad requests + return response.text # Sample code for adding or updating versions and ensuring latest_version is set in DocumentLanguage @@ -203,10 +297,10 @@ def prepare_document_data(docs): # Latest version details if available (should be available ;-) ) if lang.latest_version: lang_row.append({'value': lang.latest_version.created_at.strftime("%Y-%m-%d %H:%M:%S"), - 'class': '', 'type': 'text'}) + 'class': '', 'type': 'text'}) if lang.latest_version.url: lang_row.append({'value': lang.latest_version.url, - 'class': '', 'type': 'link', 'href': lang.latest_version.url}) + 'class': '', 'type': 'link', 'href': lang.latest_version.url}) else: lang_row.append({'value': '', 'class': '', 'type': 'text'}) diff --git a/requirements.txt b/requirements.txt index 07c9bda..6ec3822 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,6 @@ Werkzeug~=3.0.2 pgvector~=0.2.5 gevent~=24.2.1 celery~=5.4.0 -kombu~=5.3.7 \ No newline at end of file +kombu~=5.3.7 +langchain~=0.1.17 +requests~=2.31.0 \ No newline at end of file