Improvements to Document Interface and correcting embedding workers

This commit is contained in:
Josako
2024-06-04 14:59:38 +02:00
parent c660c35de4
commit 61e1372dc8
15 changed files with 486 additions and 246 deletions

View File

@@ -4,7 +4,7 @@ for handling tenant requests
""" """
from flask_security import current_user from flask_security import current_user
from flask import session from flask import session, current_app
from .database import Database from .database import Database
@@ -17,12 +17,15 @@ def mw_before_request():
tenant_id = session['tenant']['id'] tenant_id = session['tenant']['id']
if not tenant_id: if not tenant_id:
return {"message": "You are not logged into any tenant"}, 403 raise Exception('Cannot switch schema for tenant: no tenant defined in session')
for role in current_user.roles:
current_app.logger.debug(f'In middleware: User {current_user.email} has role {role.name}')
# user = User.query.get(current_user.id) # user = User.query.get(current_user.id)
if current_user.has_roles(['Super User']) or current_user.tenant_id == tenant_id: if current_user.has_role('Super User') or current_user.tenant_id == tenant_id:
Database(tenant_id).switch_schema() Database(tenant_id).switch_schema()
else: else:
return {"message": "You are not a member of this tenant"}, 403 raise Exception(f'Cannot switch schema for tenant {tenant_id}: user {current_user.email} does not have access')

View File

@@ -0,0 +1,79 @@
from flask import current_app
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from common.models.document import EmbeddingSmallOpenAI
def select_model_variables(tenant):
embedding_provider = tenant.embedding_model.rsplit('.', 1)[0]
embedding_model = tenant.embedding_model.rsplit('.', 1)[1]
llm_provider = tenant.llm_model.rsplit('.', 1)[0]
llm_model = tenant.llm_model.rsplit('.', 1)[1]
# Set model variables
model_variables = {}
if tenant.es_k:
model_variables['k'] = tenant.es_k
else:
model_variables['k'] = 5
if tenant.es_similarity_threshold:
model_variables['similarity_threshold'] = tenant.es_similarity_threshold
else:
model_variables['similarity_threshold'] = 0.7
if tenant.chat_RAG_temperature:
model_variables['RAG_temperature'] = tenant.chat_RAG_temperature
else:
model_variables['RAG_temperature'] = 0.3
if tenant.chat_no_RAG_temperature:
model_variables['no_RAG_temperature'] = tenant.chat_no_RAG_temperature
else:
model_variables['no_RAG_temperature'] = 0.5
# Set Embedding variables
match embedding_provider:
case 'openai':
match embedding_model:
case 'text-embedding-3-small':
api_key = current_app.config.get('OPENAI_API_KEY')
model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
model='text-embedding-3-small')
model_variables['embedding_db_model'] = EmbeddingSmallOpenAI
model_variables['min_chunk_size'] = current_app.config.get('OAI_TE3S_MIN_CHUNK_SIZE')
model_variables['max_chunk_size'] = current_app.config.get('OAI_TE3S_MAX_CHUNK_SIZE')
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid embedding model')
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid embedding provider')
# Set Chat model variables
match llm_provider:
case 'openai':
api_key = current_app.config.get('OPENAI_API_KEY')
model_variables['llm'] = ChatOpenAI(api_key=api_key,
model=llm_model,
temperature=model_variables['RAG_temperature'])
match llm_model:
case 'gpt-4-turbo' | 'gpt-4o':
summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
case 'gpt-3-5-turbo':
summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat model')
model_variables['summary_prompt'] = ChatPromptTemplate.from_template(summary_template)
model_variables['rag_prompt'] = ChatPromptTemplate.from_template(rag_template)
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat provider')
return model_variables

View File

@@ -69,8 +69,8 @@ class Config(object):
CELERY_ENABLE_UTC = True CELERY_ENABLE_UTC = True
# Chunk Definition, Embedding dependent # Chunk Definition, Embedding dependent
O_TE3SMALL_MIN_CHUNK_SIZE = 2000 OAI_TE3S_MIN_CHUNK_SIZE = 2000
O_TE3SMALL_MAX_CHUNK_SIZE = 3000 OAI_TE3S_MAX_CHUNK_SIZE = 3000
# LLM TEMPLATES # LLM TEMPLATES
GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text. GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text.

View File

@@ -70,7 +70,7 @@ def create_app(config_file=None):
security_logger.setLevel(logging.DEBUG) security_logger.setLevel(logging.DEBUG)
sqlalchemy_logger = logging.getLogger('sqlalchemy.engine') sqlalchemy_logger = logging.getLogger('sqlalchemy.engine')
sqlalchemy_logger.setLevel(logging.DEBUG) sqlalchemy_logger.setLevel(logging.DEBUG)
log_request_middleware(app) # Add this when debugging nginx or another proxy # log_request_middleware(app) # Add this when debugging nginx or another proxy
# Some generic Error Handling Routines # Some generic Error Handling Routines
@app.errorhandler(Exception) @app.errorhandler(Exception)

View File

@@ -0,0 +1,24 @@
{% extends 'base.html' %}
{% from 'macros.html' import render_selectable_table, render_pagination %}
{% block title %}Document Languages{% endblock %}
{% block content_title %}Document Languages{% endblock %}
{% block content_description %}View Languages for {{ document }}{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto">{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_document_language_selection') }}">
{{ render_selectable_table(headers=["Document Language ID", "Language", "User Context", "System Context"], rows=rows, selectable=True, id="documentsTable") }}
<div class="form-group mt-3">
<button type="submit" name="action" value="edit_document_language" class="btn btn-primary">Edit Document Language</button>
<button type="submit" name="action" value="document_versions" class="btn btn-secondary">Show Document Versions</button>
</div>
</form>
</div>
{% endblock %}
{% block content_footer %}
{{ render_pagination(pagination, 'document_bp.documents') }}
{% endblock %}

View File

@@ -0,0 +1,23 @@
{% extends 'base.html' %}
{% from 'macros.html' import render_selectable_table, render_pagination %}
{% block title %}Document Versions{% endblock %}
{% block content_title %}Document Versions{% endblock %}
{% block content_description %}View Versions for {{ document }}{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto">{% endblock %}
{% block content %}
<div class="container">
<form method="POST" action="{{ url_for('document_bp.handle_document_version_selection') }}">
{{ render_selectable_table(headers=["Document Version ID", "URL", "File Location", "File Name", "File Type", "Processing", "Processing Start", "Processing Finish"], rows=rows, selectable=True, id="versionsTable") }}
<div class="form-group mt-3">
<button type="submit" name="action" value="process_document_version" class="btn btn-primary">Process Document Version</button>
</div>
</form>
</div>
{% endblock %}
{% block content_footer %}
{{ render_pagination(pagination, 'document_bp.documents') }}
{% endblock %}

View File

@@ -1,5 +1,5 @@
{% extends 'base.html' %} {% extends 'base.html' %}
{% from 'macros.html' import render_nested_table, render_pagination %} {% from 'macros.html' import render_selectable_table, render_pagination %}
{% block title %}Documents{% endblock %} {% block title %}Documents{% endblock %}
@@ -9,8 +9,13 @@
{% block content %} {% block content %}
<div class="container"> <div class="container">
<!-- Documents Table --> <form method="POST" action="{{ url_for('document_bp.handle_document_selection') }}">
{{ render_nested_table(headers=["Name", "Created At", "Valid From", "Languages & Versions"], rows=rows) }} {{ render_selectable_table(headers=["Document ID", "Name", "Valid From", "Valid To"], rows=rows, selectable=True, id="documentsTable") }}
<div class="form-group mt-3">
<button type="submit" name="action" value="edit_document" class="btn btn-primary">Edit Document</button>
<button type="submit" name="action" value="document_languages" class="btn btn-secondary">Show Document Languages</button>
</div>
</form>
</div> </div>
{% endblock %} {% endblock %}

View File

@@ -0,0 +1,18 @@
{% extends "base.html" %}
{% from "macros.html" import render_field %}
{% block title %}Update Document{% endblock %}
{% block content_title %}Update Document{% endblock %}
{% block content_description %}Update document details.{% endblock %}
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = [] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Update Document</button>
</form>
{% endblock %}

View File

@@ -0,0 +1,18 @@
{% extends "base.html" %}
{% from "macros.html" import render_field %}
{% block title %}Update Document Language{% endblock %}
{% block content_title %}Update Document Language{% endblock %}
{% block content_description %}Update document language for {{ doc_details }}.{% endblock %}
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = ['language', 'system_context'] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Update Document</button>
</form>
{% endblock %}

View File

@@ -217,57 +217,6 @@
</div> </div>
{% endmacro %} {% endmacro %}
{% macro render_seamless_table(headers, rows) %}
{% macro render_integrated_table(headers, data) %}
<div class="table-responsive">
<table class="table align-items-center mb-0">
<thead>
<tr>
{% for header in headers %}
<th class="text-uppercase text-secondary text-xxs font-weight-bolder opacity-7">{{ header }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for entry in data %}
{% if entry.is_group and entry.sub_rows %}
{% for sub_row in entry.sub_rows %}
<tr>
{% for cell in sub_row %}
{% if cell %}
<td class="{{ cell.class }}">
{% if cell.type == 'text' %}
<p class="text-xs {{ cell.text_class }}">{{ cell.value }}</p>
{% else %}
{{ cell.value }}
{% endif %}
</td>
{% else %}
<td></td>
{% endif %}
{% endfor %}
</tr>
{% endfor %}
{% else %}
<tr>
{% for cell in entry %}
<td class="{{ cell.class }}">
{% if cell.type == 'text' %}
<p class="text-xs {{ cell.text_class }}">{{ cell.value }}</p>
{% else %}
{{ cell.value }}
{% endif %}
</td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
</tbody>
</table>
</div>
{% endmacro %}
{% endmacro %}
{% macro render_pagination(pagination, endpoint) %} {% macro render_pagination(pagination, endpoint) %}
<nav aria-label="Page navigation"> <nav aria-label="Page navigation">
<ul class="pagination"> <ul class="pagination">

View File

@@ -20,32 +20,3 @@
{{ render_pagination(pagination, 'user_bp.select_tenant') }} {{ render_pagination(pagination, 'user_bp.select_tenant') }}
{% endblock %} {% endblock %}
{#{% block scripts %}#}
{#<script>#}
{#$(document).ready(function() {#}
{# $('#tenantsTable').DataTable({#}
{# 'columnDefs': [#}
{# {#}
{# 'targets': 0,#}
{# 'searchable': false,#}
{# 'orderable': false,#}
{# 'className': 'dt-body-center',#}
{# },#}
{# {#}
{# 'targets': 1,#}
{# 'orderable': true#}
{# },#}
{# {#}
{# 'targets': 2,#}
{# 'orderable': true#}
{# },#}
{# {#}
{# 'targets': 2,#}
{# 'orderable': true#}
{# },#}
{# ],#}
{# 'order': [[1, 'asc']]#}
{# });#}
{#});#}
{#</script>#}
{#{% endblock %}#}

View File

@@ -37,3 +37,22 @@ class AddURLForm(FlaskForm):
self.language.choices = [(language, language) for language in self.language.choices = [(language, language) for language in
session.get('tenant').get('allowed_languages')] session.get('tenant').get('allowed_languages')]
self.language.data = session.get('default_language') self.language.data = session.get('default_language')
class EditDocumentForm(FlaskForm):
    """Form for editing a document's name and validity period."""
    # Name is limited to 100 characters.
    name = StringField('Name', validators=[Length(max=100)])
    # Both dates are optional.
    # NOTE(review): both date fields share the same `id`, and that id contains a space; it looks
    # like CSS class names were intended here - confirm against the datepicker setup.
    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
    valid_to = DateField('Valid to', id='form-control datepicker', validators=[Optional()])

    submit = SubmitField('Submit')
class EditDocumentLanguageForm(FlaskForm):
    """Form for editing a document language: language plus optional user and system context."""
    language = StringField('Language')
    # Both context fields may be left empty.
    user_context = TextAreaField('User Context', validators=[Optional()])
    system_context = TextAreaField('System Context', validators=[Optional()])

    submit = SubmitField('Submit')

View File

@@ -1,3 +1,4 @@
import ast
import os import os
from datetime import datetime as dt, timezone as tz from datetime import datetime as dt, timezone as tz
from flask import request, redirect, flash, render_template, Blueprint, session, current_app from flask import request, redirect, flash, render_template, Blueprint, session, current_app
@@ -14,10 +15,11 @@ import io
from common.models.document import Document, DocumentLanguage, DocumentVersion from common.models.document import Document, DocumentLanguage, DocumentVersion
from common.extensions import db from common.extensions import db
from .document_forms import AddDocumentForm, AddURLForm from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentLanguageForm
from common.utils.middleware import mw_before_request from common.utils.middleware import mw_before_request
from common.utils.celery_utils import current_celery from common.utils.celery_utils import current_celery
from common.utils.nginx_utils import prefixed_url_for from common.utils.nginx_utils import prefixed_url_for
from common.utils.view_assistants import form_validation_failed, prepare_table_for_macro
document_bp = Blueprint('document_bp', __name__, url_prefix='/document') document_bp = Blueprint('document_bp', __name__, url_prefix='/document')
@@ -29,13 +31,20 @@ def log_before_request():
@document_bp.after_request @document_bp.after_request
def log_after_request(response): def log_after_request(response):
current_app.logger.debug(f"After request (document_bp): {request.method} {request.url} - Status: {response.status}") current_app.logger.debug(
f"After request (document_bp): {request.method} {request.url} - Status: {response.status}")
return response return response
@document_bp.before_request @document_bp.before_request
def before_request(): def before_request():
mw_before_request() try:
mw_before_request()
except Exception as e:
current_app.logger.error(f'Error switching schema in Document Blueprint: {e}')
for role in current_user.roles:
current_app.logger.debug(f'User {current_user.email} has role {role.name}')
raise
@document_bp.route('/add_document', methods=['GET', 'POST']) @document_bp.route('/add_document', methods=['GET', 'POST'])
@@ -44,7 +53,7 @@ def add_document():
form = AddDocumentForm() form = AddDocumentForm()
# If the form is submitted # If the form is submitted
if request.method == 'POST' and form.validate_on_submit(): if form.validate_on_submit():
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
file = form.file.data file = form.file.data
filename = secure_filename(file.filename) filename = secure_filename(file.filename)
@@ -59,8 +68,12 @@ def add_document():
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, ' current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. ' f'Document Version {new_doc_vers.id}. '
f'Embedding creation task: {task.id}') f'Embedding creation task: {task.id}')
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents')) return redirect(prefixed_url_for('document_bp.documents'))
else:
form_validation_failed(request, form)
return render_template('document/add_document.html', form=form) return render_template('document/add_document.html', form=form)
@@ -71,7 +84,7 @@ def add_url():
form = AddURLForm() form = AddURLForm()
# If the form is submitted # If the form is submitted
if request.method == 'POST' and form.validate_on_submit(): if form.validate_on_submit():
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
url = form.url.data url = form.url.data
@@ -96,8 +109,12 @@ def add_url():
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, ' current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}. ' f'Document Version {new_doc_vers.id}. '
f'Embedding creation task: {task.id}') f'Embedding creation task: {task.id}')
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
'success')
return redirect(prefixed_url_for('document_bp.documents')) return redirect(prefixed_url_for('document_bp.documents'))
else:
form_validation_failed(request, form)
return render_template('document/add_url.html', form=form) return render_template('document/add_url.html', form=form)
@@ -108,25 +125,178 @@ def documents():
page = request.args.get('page', 1, type=int) page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 10, type=int) per_page = request.args.get('per_page', 10, type=int)
query = Document.query.order_by(desc(Document.created_at)).options( query = Document.query.order_by(desc(Document.created_at))
joinedload(Document.languages).joinedload(DocumentLanguage.versions))
pagination = query.paginate(page=page, per_page=per_page, error_out=False) pagination = query.paginate(page=page, per_page=per_page, error_out=False)
docs = pagination.items docs = pagination.items
rows = prepare_document_data(docs) rows = prepare_table_for_macro(docs, [('id', ''), ('name', ''), ('valid_from', ''), ('valid_to', '')])
return render_template('document/documents.html', rows=rows, pagination=pagination) return render_template('document/documents.html', rows=rows, pagination=pagination)
@document_bp.route('/process_version/<int:version_id>', methods=['POST']) @document_bp.route('/handle_document_selection', methods=['POST'])
@roles_accepted('Super User', 'Tenant Admin') @roles_accepted('Super User', 'Tenant Admin')
def process_version(version_id): def handle_document_selection():
version = DocumentVersion.query.get_or_404(version_id) document_identification = request.form['selected_row']
if not version.processing: doc_id = ast.literal_eval(document_identification).get('value')
print(f'Placeholder for processing version: {version_id}')
return redirect(prefixed_url_for('documents')) action = request.form['action']
match action:
case 'edit_document':
return redirect(prefixed_url_for('document_bp.edit_document', document_id=doc_id))
case 'document_languages':
return redirect(prefixed_url_for('document_bp.document_languages', document_id=doc_id))
# Add more conditions for other actions
return redirect(prefixed_url_for('document_bp.documents'))
@document_bp.route('/edit_document/<int:document_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def edit_document(document_id):
    """Show and process the edit form for one document.

    Responds with 404 when the document does not exist. On a valid
    submission the name and validity dates are stored together with the
    update logging fields; otherwise ``form_validation_failed`` is called.
    The edit page is rendered again in every case.
    """
    document = Document.query.get_or_404(document_id)
    form = EditDocumentForm(obj=document)

    if not form.validate_on_submit():
        form_validation_failed(request, form)
        return render_template('document/edit_document.html', form=form, document_id=document_id)

    # Copy the submitted values onto the model.
    document.name = form.name.data
    document.valid_from = form.valid_from.data
    document.valid_to = form.valid_to.data
    update_logging_information(document, dt.now(tz.utc))

    try:
        db.session.add(document)
        db.session.commit()
        # Kept inside the try: reading the id after the commit may hit the database again.
        flash(f'Document {document.id} updated successfully', 'success')
    except SQLAlchemyError as err:
        db.session.rollback()
        flash(f'Error updating document: {err}', 'danger')
        current_app.logger.error(f'Error updating document: {err}')

    return render_template('document/edit_document.html', form=form, document_id=document_id)
@document_bp.route('/document_languages/<int:document_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def document_languages(document_id):
    """List the languages of one document, paginated and ordered by language.

    Responds with 404 when the document does not exist. Paging is driven by
    the ``page`` and ``per_page`` query parameters (defaults 1 and 10).
    """
    parent_doc = Document.query.get_or_404(document_id)
    description = f'Document {parent_doc.id}: {parent_doc.name}'

    page_number = request.args.get('page', 1, type=int)
    page_size = request.args.get('per_page', 10, type=int)

    languages_query = DocumentLanguage.query.filter_by(document_id=document_id)
    languages_query = languages_query.order_by(DocumentLanguage.language)
    pagination = languages_query.paginate(page=page_number, per_page=page_size, error_out=False)

    columns = [('id', ''), ('language', ''), ('user_context', ''), ('system_context', '')]
    rows = prepare_table_for_macro(pagination.items, columns)

    return render_template('document/document_languages.html', rows=rows, pagination=pagination,
                           document=description)
@document_bp.route('/handle_document_language_selection', methods=['POST'])
@roles_accepted('Super User', 'Tenant Admin')
def handle_document_language_selection():
document_language_identification = request.form['selected_row']
doc_lang_id = ast.literal_eval(document_language_identification).get('value')
action = request.form['action']
match action:
case 'edit_document_language':
return redirect(prefixed_url_for('document_bp.edit_document_language', document_language_id=doc_lang_id))
case 'document_versions':
return redirect(prefixed_url_for('document_bp.document_versions', document_language_id=doc_lang_id))
# Add more conditions for other actions
return redirect(prefixed_url_for('document_bp.document_languages'))
@document_bp.route('/edit_document_language/<int:document_language_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def edit_document_language(document_language_id):
    """Show and process the edit form for one document language.

    Responds with 404 when the document language does not exist. Only
    ``user_context`` is written back from the form, together with the update
    logging fields. The edit page is rendered again in every case.
    """
    doc_lang = DocumentLanguage.query.get_or_404(document_language_id)
    form = EditDocumentLanguageForm(obj=doc_lang)

    if form.validate_on_submit():
        doc_lang.user_context = form.user_context.data
        update_logging_information(doc_lang, dt.now(tz.utc))

        try:
            db.session.add(doc_lang)
            db.session.commit()
            flash(f'Document Language {doc_lang.id} updated successfully', 'success')
        except SQLAlchemyError as e:
            db.session.rollback()
            flash(f'Error updating document language: {e}', 'danger')
            # Double quotes inside the f-string: reusing the outer quote character is a
            # syntax error on Python versions before 3.12.
            current_app.logger.error(f'Error updating document language {doc_lang.id} '
                                     f'for tenant {session["tenant"]["id"]}: {e}')
    else:
        form_validation_failed(request, form)

    return render_template('document/edit_document_language.html', form=form,
                           document_language_id=document_language_id,
                           doc_details=f'Document {doc_lang.document.name}')
@document_bp.route('/document_versions/<int:document_language_id>', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def document_versions(document_language_id):
    """List the versions of one document language, newest id first, paginated.

    A warning about retriggering processing is flashed on every request,
    before the document language is looked up (404 when it does not exist).
    Paging is driven by the ``page`` and ``per_page`` query parameters
    (defaults 1 and 10).
    """
    flash('Processing documents is a long running process. Please be careful retriggering processing!', 'danger')

    language = DocumentLanguage.query.get_or_404(document_language_id)
    description = f'Document {language.document.name}, Language {language.language}'

    page_number = request.args.get('page', 1, type=int)
    page_size = request.args.get('per_page', 10, type=int)

    versions_query = DocumentVersion.query.filter_by(doc_lang_id=document_language_id)
    versions_query = versions_query.order_by(desc(DocumentVersion.id))
    pagination = versions_query.paginate(page=page_number, per_page=page_size, error_out=False)

    columns = [
        ('id', ''), ('url', ''), ('file_location', ''),
        ('file_name', ''), ('file_type', ''),
        ('processing', ''), ('processing_started_at', ''),
        ('processing_finished_at', ''),
    ]
    rows = prepare_table_for_macro(pagination.items, columns)

    return render_template('document/document_versions.html', rows=rows, pagination=pagination,
                           document=description)
@document_bp.route('/handle_document_version_selection', methods=['POST'])
@roles_accepted('Super User', 'Tenant Admin')
def handle_document_version_selection():
document_version_identification = request.form['selected_row']
doc_vers_id = ast.literal_eval(document_version_identification).get('value')
action = request.form['action']
match action:
case 'process_document_version':
process_version(doc_vers_id)
# Add more conditions for other actions
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
return redirect(prefixed_url_for('document_bp.document_versions', document_language_id=doc_vers.doc_lang_id))
def process_version(version_id):
    """Queue embedding creation for a document version and report it to the user.

    Sends the ``create_embeddings`` task to the ``embeddings`` queue with the
    session tenant id and the version id, logs who retriggered it, flashes a
    success message and returns a redirect to the documents overview.
    """
    tenant_id = session['tenant']['id']
    task = current_celery.send_task(
        'create_embeddings',
        queue='embeddings',
        args=[tenant_id, version_id],
    )

    log_message = (f'Embedding creation retriggered by user {current_user.id}, {current_user.email} '
                   f'for tenant {tenant_id}, '
                   f'Document Version {version_id}. '
                   f'Embedding creation task: {task.id}')
    current_app.logger.info(log_message)

    flash(f'Processing for document version {version_id} retriggered successfully...', 'success')
    return redirect(prefixed_url_for('document_bp.documents'))
def set_logging_information(obj, timestamp): def set_logging_information(obj, timestamp):
@@ -136,6 +306,11 @@ def set_logging_information(obj, timestamp):
obj.updated_by = current_user.id obj.updated_by = current_user.id
def update_logging_information(obj, timestamp):
    """Stamp ``obj`` with the update time and the id of the current user.

    :param obj: object receiving the ``updated_at`` and ``updated_by`` attributes.
    :param timestamp: value stored in ``updated_at``.
    """
    obj.updated_at = timestamp
    obj.updated_by = current_user.id
def create_document_stack(form, file, filename, extension): def create_document_stack(form, file, filename, extension):
# Create the Document # Create the Document
new_doc = create_document(form, filename) new_doc = create_document(form, filename)
@@ -155,7 +330,7 @@ def create_document_stack(form, file, filename, extension):
db.session.commit() db.session.commit()
except SQLAlchemyError as e: except SQLAlchemyError as e:
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}') current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}')
flash('Error adding document.', 'error') flash('Error adding document.', 'alert')
db.session.rollback() db.session.rollback()
error = e.args error = e.args
raise raise
@@ -277,25 +452,6 @@ def fetch_html(url):
return response.text return response.text
# Sample code for adding or updating versions and ensuring latest_version is set in DocumentLanguage
# def add_or_update_version(language_id, version_data):
# new_version = Version(language_id=language_id, **version_data)
# db.session.add(new_version)
# db.session.flush() # Ensures new_version gets an ID assigned if it's new
#
# # Assuming we always call this when we know it's the latest
# language = Language.query.get(language_id)
# language.latest_version_id = new_version.id
# db.session.commit()
# sample code for using latest_version in the application
# @app.route('/language/<int:language_id>')
# def show_language(language_id):
# language = Language.query.get_or_404(language_id)
# latest_version = language.latest_version # This is now a direct, efficient database access
# return render_template('language_details.html', language=language, latest_version=latest_version)
def prepare_document_data(docs): def prepare_document_data(docs):
rows = [] rows = []
for doc in docs: for doc in docs:
@@ -340,4 +496,3 @@ def prepare_document_data(docs):
'sub_rows': languages_rows}) 'sub_rows': languages_rows})
rows.append(doc_row) rows.append(doc_row)
return rows return rows

View File

@@ -23,6 +23,9 @@ def create_app(config_file=None):
from . import tasks from . import tasks
app.logger.info("EveAI Worker Server Started Successfully")
app.logger.info("-------------------------------------------------------------------------------------------------")
return app, celery return app, celery

View File

@@ -11,17 +11,17 @@ from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError from unstructured_client.models.errors import SDKError
# OpenAI imports # OpenAI imports
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.summarize import load_summarize_chain from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter
from langchain_core.exceptions import LangChainException from langchain_core.exceptions import LangChainException
from common.utils.database import Database from common.utils.database import Database
from common.models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI from common.models.document import DocumentVersion
from common.models.user import Tenant from common.models.user import Tenant
from common.extensions import db from common.extensions import db
from common.utils.celery_utils import current_celery from common.utils.celery_utils import current_celery
from common.utils.model_utils import select_model_variables
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -35,59 +35,68 @@ def create_embeddings(tenant_id, document_version_id):
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.') current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
# Retrieve Tenant for which we are processing
tenant = Tenant.query.get(tenant_id)
if tenant is None:
current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
f'Tenant not found')
create_embeddings.update_state(state=states.FAILURE)
raise Ignore()
# Ensure we are working in the correct database schema
Database(tenant_id).switch_schema()
# Retrieve document version to process
document_version = DocumentVersion.query.get(document_version_id)
if document_version is None:
current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
f'Document version {document_version_id} not found')
create_embeddings.update_state(state=states.FAILURE)
raise Ignore()
db.session.add(document_version)
# start processing
document_version.processing = True
document_version.processing_started_at = dt.now(tz.utc)
try: try:
# Retrieve Tenant for which we are processing
tenant = Tenant.query.get(tenant_id)
if tenant is None:
raise Exception(f'Tenant {tenant_id} not found')
# Ensure we are working in the correct database schema
Database(tenant_id).switch_schema()
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant)
# Retrieve document version to process
document_version = DocumentVersion.query.get(document_version_id)
if document_version is None:
raise Exception(f'Document version {document_version_id} not found')
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
f'for non existing document version {document_version_id} '
f'for tenant {tenant_id}, '
f'error: {e}')
raise
try:
db.session.add(document_version)
# start processing
document_version.processing = True
document_version.processing_started_at = dt.now(tz.utc)
db.session.commit() db.session.commit()
except SQLAlchemyError as e: except SQLAlchemyError as e:
current_app.logger.error(f'Error saving document version {document_version_id} to database ' current_app.logger.error(f'Unable to save Embedding status information '
f'for tenant {tenant_id} when starting creating of embeddings. ' f'in document version {document_version_id} '
f'for tenant {tenant_id}')
raise
try:
match document_version.file_type:
case 'pdf':
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
except Exception as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version_id} '
f'error: {e}') f'error: {e}')
document_version.processing = False
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing_error = str(e)[:255]
db.session.commit()
create_embeddings.update_state(state=states.FAILURE) create_embeddings.update_state(state=states.FAILURE)
raise Ignore() raise
match document_version.file_type:
case 'pdf':
process_pdf(tenant, document_version)
case 'html':
process_html(tenant, document_version)
case _:
current_app.logger.info(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
create_embeddings.update_state(state=states.FAILURE)
raise Ignore()
@current_celery.task(name='ask_eve_ai', queue='llm_interactions') def process_pdf(tenant, model_variables, document_version):
def ask_eve_ai(query):
# Interaction logic with LLMs like GPT (Langchain API calls, etc.)
pass
def process_pdf(tenant, document_version):
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location, document_version.file_location,
document_version.file_name) document_version.file_name)
@@ -101,15 +110,15 @@ def process_pdf(tenant, document_version):
coordinates=True, coordinates=True,
extract_image_block_types=['Image', 'Table'], extract_image_block_types=['Image', 'Table'],
chunking_strategy='by_title', chunking_strategy='by_title',
combine_under_n_chars=current_app.config.get('MIN_CHUNK_SIZE'), combine_under_n_chars=model_variables['min_chunk_size'],
max_characters=current_app.config.get('MAX_CHUNK_SIZE'), max_characters=model_variables['max_chunk_size'],
) )
else: else:
current_app.logger.error(f'The physical file for document version {document_version.id} ' current_app.logger.error(f'The physical file for document version {document_version.id} '
f'for tenant {tenant.id} ' f'for tenant {tenant.id} '
f'at {file_path} does not exist') f'at {file_path} does not exist')
create_embeddings.update_state(state=states.FAILURE) create_embeddings.update_state(state=states.FAILURE)
raise Ignore() raise
try: try:
chunks = partition_doc_unstructured(tenant, document_version, req) chunks = partition_doc_unstructured(tenant, document_version, req)
@@ -118,13 +127,13 @@ def process_pdf(tenant, document_version):
f'while processing PDF on document version {document_version.id} ' f'while processing PDF on document version {document_version.id} '
f'error: {e}') f'error: {e}')
create_embeddings.update_state(state=states.FAILURE) create_embeddings.update_state(state=states.FAILURE)
raise Ignore() raise
summary = summarize_chunk(tenant, document_version, chunks[0]) summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
doc_lang = document_version.document_language doc_lang = document_version.document_language
doc_lang.system_context = f'Summary: {summary}\n' doc_lang.system_context = f'Summary: {summary}\n'
enriched_chunks = enrich_chunks(tenant, document_version, chunks) enriched_chunks = enrich_chunks(tenant, document_version, chunks)
embeddings = embed_chunks(tenant, document_version, enriched_chunks) embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try: try:
db.session.add(doc_lang) db.session.add(doc_lang)
@@ -138,13 +147,14 @@ def process_pdf(tenant, document_version):
f'on PDF, document version {document_version.id}' f'on PDF, document version {document_version.id}'
f'error: {e}') f'error: {e}')
db.session.rollback() db.session.rollback()
create_embeddings.update_state(state=states.FAILURE)
raise raise
current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} ' current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
f'on document version {document_version.id} :-)') f'on document version {document_version.id} :-)')
def process_html(tenant, document_version): def process_html(tenant, model_variables, document_version):
# The tags to be considered can be dependent on the tenant # The tags to be considered can be dependent on the tenant
html_tags = tenant.html_tags html_tags = tenant.html_tags
end_tags = tenant.html_end_tags end_tags = tenant.html_end_tags
@@ -163,22 +173,22 @@ def process_html(tenant, document_version):
f'for tenant {tenant.id} ' f'for tenant {tenant.id} '
f'at {file_path} does not exist') f'at {file_path} does not exist')
create_embeddings.update_state(state=states.FAILURE) create_embeddings.update_state(state=states.FAILURE)
raise Ignore() raise
extracted_data, title = parse_html(html_content, html_tags, included_elements=included_elements, extracted_data, title = parse_html(html_content, html_tags, included_elements=included_elements,
excluded_elements=excluded_elements) excluded_elements=excluded_elements)
potential_chunks = create_potential_chunks(extracted_data, end_tags) potential_chunks = create_potential_chunks(extracted_data, end_tags)
chunks = combine_chunks(potential_chunks, chunks = combine_chunks(potential_chunks,
current_app.config.get('MIN_CHUNK_SIZE'), model_variables['min_chunk_size'],
current_app.config.get('MAX_CHUNK_SIZE') model_variables['max_chunk_size']
) )
summary = summarize_chunk(tenant, document_version, chunks[0]) summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
doc_lang = document_version.document_language doc_lang = document_version.document_language
doc_lang.system_context = (f'Title: {title}\n' doc_lang.system_context = (f'Title: {title}\n'
f'Summary: {summary}\n') f'Summary: {summary}\n')
enriched_chunks = enrich_chunks(tenant, document_version, chunks) enriched_chunks = enrich_chunks(tenant, document_version, chunks)
embeddings = embed_chunks(tenant, document_version, enriched_chunks) embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try: try:
db.session.add(doc_lang) db.session.add(doc_lang)
@@ -198,6 +208,8 @@ def process_html(tenant, document_version):
def enrich_chunks(tenant, document_version, chunks): def enrich_chunks(tenant, document_version, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
doc_lang = document_version.document_language doc_lang = document_version.document_language
chunk_total_context = (f'Filename: {document_version.file_name}\n' chunk_total_context = (f'Filename: {document_version.file_name}\n'
f'{doc_lang.system_context}\n' f'{doc_lang.system_context}\n'
@@ -209,54 +221,36 @@ def enrich_chunks(tenant, document_version, chunks):
enriched_chunk = f'{chunk_total_context}\n{chunk}' enriched_chunk = f'{chunk_total_context}\n{chunk}'
enriched_chunks.append(enriched_chunk) enriched_chunks.append(enriched_chunk)
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
return enriched_chunks return enriched_chunks
def summarize_chunk(tenant, model_variables, document_version, chunk):
    """Summarize a single chunk of a document with the tenant's configured LLM.

    The summary is used by the callers as shared context when enriching
    every chunk of the document.

    Args:
        tenant: Tenant the document belongs to (only ``tenant.id`` is read,
            for logging).
        model_variables: Mapping that provides ``'llm'`` (the chat model),
            ``'summary_prompt'`` (the prompt handed to the summarize chain)
            and ``'max_chunk_size'`` (an integer).
        document_version: Document version being processed (only
            ``document_version.id`` is read, for logging).
        chunk: Text to summarize. Normally a single string; a list of strings
            is accepted as well.

    Returns:
        The summary produced by the summarize chain.

    Raises:
        LangChainException: re-raised after logging when the chain fails.
    """
    current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    llm = model_variables['llm']
    prompt = model_variables['summary_prompt']
    chain = load_summarize_chain(llm, chain_type='stuff', prompt=prompt)

    # The splitter is only used to wrap the text in Document objects; the
    # chunk size is twice the maximum chunk size so that a regular chunk is
    # not split again.
    doc_creator = CharacterTextSplitter(chunk_size=model_variables['max_chunk_size'] * 2, chunk_overlap=0)

    # create_documents() expects a list of texts. Passing a bare string makes
    # it iterate over the individual characters, producing one document per
    # character, so a single string is wrapped in a list first.
    texts = [chunk] if isinstance(chunk, str) else list(chunk)
    text_to_summarize = doc_creator.create_documents(texts)

    try:
        summary = chain.run(text_to_summarize)
        current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
                                 f'on document version {document_version.id}.')
        return summary
    except LangChainException as e:
        current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant.id} '
                                 f'on document version {document_version.id} '
                                 f'error: {e}')
        raise
def partition_doc_unstructured(tenant, document_version, unstructured_request): def partition_doc_unstructured(tenant, document_version, unstructured_request):
current_app.logger.debug(f'Partitioning document version {document_version.id} for tenant {tenant.id}')
# Initiate the connection to unstructured.io # Initiate the connection to unstructured.io
url = current_app.config.get('UNSTRUCTURED_FULL_URL') url = current_app.config.get('UNSTRUCTURED_FULL_URL')
api_key = current_app.config.get('UNSTRUCTURED_API_KEY') api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
@@ -273,6 +267,7 @@ def partition_doc_unstructured(tenant, document_version, unstructured_request):
pass pass
case 'Table': case 'Table':
chunks.append(el['metadata']['text_as_html']) chunks.append(el['metadata']['text_as_html'])
current_app.logger.debug(f'Finished partioning document version {document_version.id} for tenant {tenant.id}')
return chunks return chunks
except SDKError as e: except SDKError as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} ' current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
@@ -281,33 +276,15 @@ def partition_doc_unstructured(tenant, document_version, unstructured_request):
raise raise
def embed_chunks(tenant, document_version, chunks): def embed_chunks(tenant, model_variables, document_version, chunks):
embedding_provider = tenant.embedding_model.rsplit('.', 1)[0] current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
embedding_model = tenant.embedding_model.rsplit('.', 1)[1] f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']
match embedding_provider:
case 'openai':
match embedding_model:
case 'text-embedding-3-small':
return embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks)
case _:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} '
f'error: Invalid embedding model')
create_embeddings.update_state(state=states.FAILURE)
raise Ignore()
case _:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} '
f'error: Invalid embedding provider')
def embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks):
# Create embedding vectors using OpenAI
api_key = current_app.config.get('OPENAI_API_KEY')
embeddings_model = OpenAIEmbeddings(api_key=api_key, model='text-embedding-3-small')
try: try:
embeddings = embeddings_model.embed_documents(chunks) embeddings = embedding_model.embed_documents(chunks)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
except LangChainException as e: except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} ' current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} while calling OpenAI API' f'on document version {document_version.id} while calling OpenAI API'
@@ -317,7 +294,7 @@ def embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks):
# Add embeddings to the database # Add embeddings to the database
new_embeddings = [] new_embeddings = []
for chunk, embedding in zip(chunks, embeddings): for chunk, embedding in zip(chunks, embeddings):
new_embedding = EmbeddingSmallOpenAI() new_embedding = model_variables['embedding_db_model']()
new_embedding.document_version = document_version new_embedding.document_version = document_version
new_embedding.active = True new_embedding.active = True
new_embedding.chunk = chunk new_embedding.chunk = chunk
@@ -327,10 +304,6 @@ def embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks):
return new_embeddings return new_embeddings
def embed_chunks_for_mistral_embed(tenant_id, document_version, chunks):
pass
def parse_html(html_content, tags, included_elements=None, excluded_elements=None): def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, 'html.parser')
extracted_content = [] extracted_content = []