Add functionality to add a URL to the system.
This commit is contained in:
@@ -52,15 +52,16 @@ class Config(object):
|
||||
CELERY_ENABLE_UTC = True
|
||||
|
||||
# LLM TEMPLATES
|
||||
GPT4_SUMMARY_TEMPLATE = """Summarise the text in the same language as the provided text between triple backquotes.
|
||||
```{context}```"""
|
||||
GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text.
|
||||
Text is delimited between triple backquotes.
|
||||
```{text}```"""
|
||||
|
||||
|
||||
class DevConfig(Config):
|
||||
DEVELOPMENT = True
|
||||
DEBUG = True
|
||||
FLASK_DEBUG = True
|
||||
PYCHARM_DEBUG = True
|
||||
PYCHARM_DEBUG = False
|
||||
SQLALCHEMY_DATABASE_URI = 'postgresql+pg8000://josako@localhost:5432/eveAI'
|
||||
SQLALCHEMY_BINDS = {'public': 'postgresql+pg8000://josako@localhost:5432/eveAI'}
|
||||
EXPLAIN_TEMPLATE_LOADING = False
|
||||
|
||||
24
eveai_app/templates/document/add_url.html
Normal file
24
eveai_app/templates/document/add_url.html
Normal file
@@ -0,0 +1,24 @@
|
||||
{% extends 'base.html' %}
|
||||
{% from "macros.html" import render_field %}
|
||||
|
||||
{% block title %}Add URL{% endblock %}
|
||||
|
||||
{% block content_title %}Add URL{% endblock %}
|
||||
{% block content_description %}Add a URL and the corresponding document to EveAI. In some cases, URLs cannot be loaded directly. In that case, download the HTML and add it as a document.{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<form method="post" enctype="multipart/form-data">
|
||||
{{ form.hidden_tag() }}
|
||||
{% set disabled_fields = [] %}
|
||||
{% set exclude_fields = [] %}
|
||||
{% for field in form %}
|
||||
{{ render_field(field, disabled_fields, exclude_fields) }}
|
||||
{% endfor %}
|
||||
<button type="submit" class="btn btn-primary">Add URL</button>
|
||||
</form>
|
||||
{% endblock %}
|
||||
|
||||
|
||||
{% block content_footer %}
|
||||
|
||||
{% endblock %}
|
||||
@@ -78,6 +78,7 @@
|
||||
{% if current_user.is_authenticated %}
|
||||
{{ dropdown('Document Mgmt', 'contacts', [
|
||||
{'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
|
||||
]) }}
|
||||
{% endif %}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
from flask import session
|
||||
from flask_wtf import FlaskForm
|
||||
from wtforms import (StringField, BooleanField, SubmitField, DateField,
|
||||
SelectField, FieldList, FormField, TextAreaField)
|
||||
from wtforms.validators import DataRequired, Length, Optional
|
||||
SelectField, FieldList, FormField, TextAreaField, URLField)
|
||||
from wtforms.validators import DataRequired, Length, Optional, URL
|
||||
from flask_wtf.file import FileField, FileAllowed, FileRequired
|
||||
|
||||
|
||||
class AddDocumentForm(FlaskForm):
|
||||
file = FileField('File', validators=[FileAllowed(['pdf', 'txt']),
|
||||
file = FileField('File', validators=[FileAllowed(['pdf', 'txt', 'html']),
|
||||
FileRequired()])
|
||||
name = StringField('Name', validators=[Length(max=100)])
|
||||
language = SelectField('Language', choices=[], validators=[Optional()])
|
||||
@@ -27,4 +27,21 @@ class AddDocumentForm(FlaskForm):
|
||||
self.doc_embedding_model.data = session.get('default_embedding_model')
|
||||
|
||||
|
||||
class AddURLForm(FlaskForm):
    """Form used to register a document that is fetched from a URL.

    The choices of the two select fields are filled per request from the
    ``tenant`` entry of the Flask session, so they cannot be declared
    statically on the class.
    """

    url = URLField('URL', validators=[DataRequired(), URL()])
    name = StringField('Name', validators=[Length(max=100)])
    language = SelectField('Language', choices=[], validators=[Optional()])
    user_context = TextAreaField('User Context', validators=[Optional()])
    # NOTE(review): the id below contains a space and looks like a CSS class
    # list; left untouched because front-end code may depend on it - confirm.
    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
    doc_embedding_model = SelectField('Default Embedding Model', choices=[], validators=[DataRequired()])

    submit = SubmitField('Submit')

    def __init__(self, *args, **kwargs):
        """Build the form and populate the tenant-specific select fields.

        Positional and keyword arguments are forwarded to ``FlaskForm`` so
        the form can still be constructed without arguments, as before.
        """
        super().__init__(*args, **kwargs)
        tenant = session.get('tenant')
        self.language.choices = [(language, language) for language in
                                 tenant.get('allowed_languages')]
        self.doc_embedding_model.choices = [(model, model) for model in
                                            tenant.get('allowed_embedding_models')]

        # Pre-select the session defaults only when the form is first shown.
        # Assigning ``.data`` on a submitted form would overwrite the values
        # the user actually posted.
        if not self.is_submitted():
            self.language.data = session.get('default_language')
            self.doc_embedding_model.data = session.get('default_embedding_model')
|
||||
|
||||
@@ -4,15 +4,20 @@ from flask import request, redirect, url_for, flash, render_template, Blueprint,
|
||||
from flask_security import roles_accepted, current_user
|
||||
from sqlalchemy import desc
|
||||
from sqlalchemy.orm import joinedload
|
||||
from werkzeug.datastructures import FileStorage
|
||||
from werkzeug.utils import secure_filename
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
import requests
|
||||
from requests.exceptions import SSLError
|
||||
from urllib.parse import urlparse
|
||||
import io
|
||||
|
||||
from common.models.document import Document, DocumentLanguage, DocumentVersion
|
||||
from common.extensions import db
|
||||
from .document_forms import AddDocumentForm
|
||||
from .document_forms import AddDocumentForm, AddURLForm
|
||||
from common.utils.middleware import mw_before_request
|
||||
from common.utils.celery_utils import current_celery
|
||||
|
||||
|
||||
document_bp = Blueprint('document_bp', __name__, url_prefix='/document')
|
||||
|
||||
|
||||
@@ -25,69 +30,49 @@ def before_request():
|
||||
@roles_accepted('Super User', 'Tenant Admin')
|
||||
def add_document():
|
||||
form = AddDocumentForm()
|
||||
error = None
|
||||
|
||||
# If the form is submitted
|
||||
if request.method == 'POST' and form.validate_on_submit():
|
||||
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
|
||||
file = form.file.data
|
||||
filename = secure_filename(file.filename)
|
||||
extension = filename.rsplit('.', 1)[1].lower()
|
||||
|
||||
# Create the Document
|
||||
new_doc = Document()
|
||||
if form.name.data == '':
|
||||
new_doc.name = filename.rsplit('.', 1)[0]
|
||||
else:
|
||||
new_doc.name = form.name.data
|
||||
create_document_stack(form, file, filename, extension)
|
||||
|
||||
if form.valid_from.data or form.valid_from.data != '':
|
||||
new_doc.valid_from = form.valid_from.data
|
||||
else:
|
||||
new_doc.valid_from = dt.now(tz.utc)
|
||||
new_doc.tenant_id = session['tenant']['id']
|
||||
set_logging_information(new_doc, dt.now(tz.utc))
|
||||
|
||||
# Create the DocumentLanguage
|
||||
new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data)
|
||||
|
||||
# Create the DocumentVersion
|
||||
new_doc_vers = DocumentVersion()
|
||||
new_doc_vers.document_language = new_doc_lang
|
||||
set_logging_information(new_doc_vers, dt.now(tz.utc))
|
||||
|
||||
try:
|
||||
db.session.add(new_doc)
|
||||
db.session.add(new_doc_lang)
|
||||
db.session.add(new_doc_vers)
|
||||
db.session.commit()
|
||||
new_doc_lang.latest_version = new_doc_vers
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
db.session.rollback()
|
||||
error = e.args
|
||||
|
||||
# Save the file and process the document
|
||||
if error is None:
|
||||
flash('Document added successfully.', 'success')
|
||||
current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
|
||||
f'Document Version {new_doc.id}')
|
||||
upload_file_for_version(new_doc_vers, file, extension)
|
||||
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
|
||||
session['tenant']['id'],
|
||||
new_doc_vers.id,
|
||||
session['default_embedding_model'],
|
||||
])
|
||||
current_app.logger.info(f'Document processing started for tenant {session["tenant"]["id"]}, '
|
||||
f'Document Version {new_doc_vers.id}, '
|
||||
f'Task ID {task.id}')
|
||||
print('Processing should start soon')
|
||||
else:
|
||||
flash('Error adding document.', 'error')
|
||||
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {error}')
|
||||
return redirect(url_for('document_bp/documents'))
|
||||
|
||||
return render_template('document/add_document.html', form=form)
|
||||
|
||||
|
||||
def _filename_from_url(url):
    """Derive an ``.html`` file name from the last path segment of ``url``."""
    filename = urlparse(url).path.split('/')[-1] or 'index'
    if not filename.endswith('.html'):
        filename += '.html'
    return filename


@document_bp.route('/add_url', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_url():
    """Show the "Add URL" form and, on a valid POST, ingest the page behind the URL.

    The page is fetched with ``fetch_html``, wrapped in an in-memory text
    buffer and handed to ``create_document_stack`` with extension ``html``.

    Returns:
        A redirect to the document list after a successful submission;
        otherwise the rendered ``document/add_url.html`` template.
    """
    form = AddURLForm()

    # If the form is submitted
    if request.method == 'POST' and form.validate_on_submit():
        tenant_id = session["tenant"]["id"]
        current_app.logger.info(f'Adding document for tenant {tenant_id}')
        url = form.url.data

        try:
            html = fetch_html(url)
        except requests.RequestException as e:
            # Unreachable hosts and HTTP error statuses are expected for some
            # sites: report the problem and show the form again instead of
            # letting the request end in a server error.
            current_app.logger.error(f'Error fetching {url} for tenant {tenant_id}: {e}')
            flash('The URL could not be loaded. Download the HTML and add it as a document instead.',
                  'error')
            return render_template('document/add_url.html', form=form)

        # NOTE(review): the source URL itself is not stored on the created
        # version here - confirm whether it should be.
        create_document_stack(form, io.StringIO(html), _filename_from_url(url), 'html')
        return redirect(url_for('document_bp.documents'))

    return render_template('document/add_url.html', form=form)
|
||||
|
||||
|
||||
@document_bp.route('/documents', methods=['GET', 'POST'])
|
||||
@roles_accepted('Super User', 'Tenant Admin')
|
||||
def documents():
|
||||
@@ -122,6 +107,92 @@ def set_logging_information(obj, timestamp):
|
||||
obj.updated_by = current_user.id
|
||||
|
||||
|
||||
def _abort_document_creation(exc):
    """Log and flash a failed persistence step and roll the DB session back."""
    current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {exc}')
    flash('Error adding document.', 'error')
    db.session.rollback()


def create_document_stack(form, file, filename, extension):
    """Persist a Document / DocumentLanguage / DocumentVersion and start embedding.

    Args:
        form: Submitted form; ``language`` and ``user_context`` are read here,
            the remaining fields are read by ``create_document``.
        file: File-like payload passed on to ``upload_file_for_version``.
        filename: Name passed to ``create_document``.
        extension: File type passed to ``upload_file_for_version``.

    Raises:
        Exception: Any error raised while persisting is logged, flashed,
            rolled back and then re-raised unchanged.
    """
    # Create the Document
    new_doc = create_document(form, filename)

    # Create the DocumentLanguage
    new_doc_lang = create_language_for_document(new_doc, form.language.data, form.user_context.data)

    # Create the DocumentVersion
    new_doc_vers = DocumentVersion()
    new_doc_vers.document_language = new_doc_lang
    set_logging_information(new_doc_vers, dt.now(tz.utc))

    try:
        db.session.add(new_doc)
        db.session.add(new_doc_lang)
        db.session.add(new_doc_vers)
        log_session_state(db.session, "Before first commit")
        db.session.commit()
        log_session_state(db.session, "After first commit")
    except Exception as e:
        # One handler for database and unexpected errors alike, so the
        # session is always rolled back before the error propagates.
        _abort_document_creation(e)
        raise

    try:
        # Second pass: link the language to its version. Presumably this is
        # done separately because the version id is only known after the
        # first commit - TODO confirm.
        new_doc_lang = db.session.merge(new_doc_lang)
        new_doc_vers = db.session.merge(new_doc_vers)
        new_doc_lang.latest_version_id = new_doc_vers.id
        log_session_state(db.session, "Before second commit")
        db.session.commit()
        log_session_state(db.session, "After second commit")
    except Exception as e:
        _abort_document_creation(e)
        raise

    current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
                            f'Document {new_doc.id}')

    upload_file_for_version(new_doc_vers, file, extension)
    # NOTE(review): the embedding model comes from the session, not from
    # form.doc_embedding_model - confirm this is intended.
    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        session['tenant']['id'],
        new_doc_vers.id,
        session['default_embedding_model'],
    ])

    current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
                            f'Document Version {new_doc_vers.id}. '
                            f'Embedding creation task: {task.id}')
|
||||
|
||||
|
||||
def log_session_state(session, msg=""):
    """Write the ``dirty`` and ``new`` collections of ``session`` to the debug log.

    Args:
        session: Object exposing ``dirty`` and ``new`` attributes (callers in
            this file pass ``db.session``). Note that this parameter hides
            the Flask ``session`` inside the function.
        msg: Prefix placed in front of both log lines.
    """
    debug = current_app.logger.debug
    debug(f"{msg} - Session dirty: {session.dirty}")
    debug(f"{msg} - Session new: {session.new}")
|
||||
|
||||
|
||||
def create_document(form, filename):
    """Build a new, not yet persisted ``Document`` from the submitted form.

    Args:
        form: Form exposing ``name`` and ``valid_from`` fields.
        filename: Fallback source for the document name; everything before
            the last ``.`` is used when the form carries no name.

    Returns:
        The populated ``Document`` (it is not added to the DB session here).
    """
    new_doc = Document()

    # An empty name may arrive as '' or None; both fall back to the file name.
    if not form.name.data:
        new_doc.name = filename.rsplit('.', 1)[0]
    else:
        new_doc.name = form.name.data

    # An empty optional date field yields None. The previous
    # ``data or data != ''`` test was true for None, so None was stored and
    # the "now" default below could never be reached.
    if form.valid_from.data:
        new_doc.valid_from = form.valid_from.data
    else:
        new_doc.valid_from = dt.now(tz.utc)

    new_doc.tenant_id = session['tenant']['id']
    set_logging_information(new_doc, dt.now(tz.utc))

    return new_doc
|
||||
|
||||
|
||||
def create_language_for_document(document, language, user_context):
|
||||
new_doc_lang = DocumentLanguage()
|
||||
if language == '':
|
||||
@@ -140,7 +211,6 @@ def create_language_for_document(document, language, user_context):
|
||||
|
||||
|
||||
def upload_file_for_version(doc_vers, file, extension):
|
||||
error = None
|
||||
doc_vers.file_type = extension
|
||||
doc_vers.file_name = doc_vers.calc_file_name()
|
||||
doc_vers.file_location = doc_vers.calc_file_location()
|
||||
@@ -148,21 +218,45 @@ def upload_file_for_version(doc_vers, file, extension):
|
||||
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], doc_vers.file_location)
|
||||
if not os.path.exists(upload_path):
|
||||
os.makedirs(upload_path, exist_ok=True)
|
||||
file.save(os.path.join(upload_path, doc_vers.file_name))
|
||||
if isinstance(file, FileStorage):
|
||||
file.save(os.path.join(upload_path, doc_vers.file_name))
|
||||
elif isinstance(file, io.StringIO):
|
||||
# It's a StringIO object, handle accordingly
|
||||
# Example: write content to a file manually
|
||||
content = file.getvalue()
|
||||
with open(os.path.join(upload_path, doc_vers.file_name), 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
else:
|
||||
raise TypeError('Unsupported file type.')
|
||||
|
||||
try:
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
except SQLAlchemyError as e:
|
||||
db.session.rollback()
|
||||
error = e.args
|
||||
if error is None:
|
||||
flash('Document saved successfully.', 'success')
|
||||
current_app.logger.info(f'Starting Doucment processing for tenant {session['tenant']['id']} for document '
|
||||
f'version {doc_vers.id}')
|
||||
# TODO: processing of document to embeddings (async)
|
||||
flash('Document processing started.', 'info')
|
||||
else:
|
||||
flash('Error saving document.', 'error')
|
||||
current_app.logger.error(f'Error saving document for tenant {session["tenant"]["id"]}: {error}')
|
||||
current_app.logger.error(
|
||||
f'Error saving document for tenant {session["tenant"]["id"]} while uploading file: {error}')
|
||||
|
||||
current_app.logger.info(f'Succesfully saved document for tenant {session['tenant']['id']} for '
|
||||
f'document version {doc_vers.id} while uploading file.')
|
||||
|
||||
|
||||
def fetch_html(url, timeout=30):
    """Fetch ``url`` over HTTP(S) and return the response body as text.

    Args:
        url: Address to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive
            server cannot block the request indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.SSLError: Certificate verification failed and
            the application is not running with ``DEBUG`` enabled.
        requests.exceptions.RequestException: Any other transport failure,
            or an HTTP error status (via ``raise_for_status``).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except SSLError as e:
        current_app.logger.error(f"Error fetching HTML from {url} for tenant {session['tenant']['id']}. "
                                 f"Error Encountered: {e}")
        if not current_app.config.get('DEBUG'):
            # Previously the response was set to None here, which surfaced
            # as an AttributeError on the next line; propagate the real error.
            raise
        # Only allowed in a development environment.
        current_app.logger.info(f"Skipping SSL verification for {url} for tenant {session['tenant']['id']}. "
                                "Only while in development environment.")
        response = requests.get(url, timeout=timeout, verify=False)  # Disable SSL verification

    response.raise_for_status()  # Will raise an exception for bad requests
    return response.text
|
||||
|
||||
|
||||
# Sample code for adding or updating versions and ensuring latest_version is set in DocumentLanguage
|
||||
@@ -203,10 +297,10 @@ def prepare_document_data(docs):
|
||||
# Latest version details if available (should be available ;-) )
|
||||
if lang.latest_version:
|
||||
lang_row.append({'value': lang.latest_version.created_at.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
'class': '', 'type': 'text'})
|
||||
'class': '', 'type': 'text'})
|
||||
if lang.latest_version.url:
|
||||
lang_row.append({'value': lang.latest_version.url,
|
||||
'class': '', 'type': 'link', 'href': lang.latest_version.url})
|
||||
'class': '', 'type': 'link', 'href': lang.latest_version.url})
|
||||
else:
|
||||
lang_row.append({'value': '', 'class': '', 'type': 'text'})
|
||||
|
||||
|
||||
@@ -7,3 +7,5 @@ pgvector~=0.2.5
|
||||
gevent~=24.2.1
|
||||
celery~=5.4.0
|
||||
kombu~=5.3.7
|
||||
langchain~=0.1.17
|
||||
requests~=2.31.0
|
||||
Reference in New Issue
Block a user