From 011bdce38d1452b3605fb3c4873f05849016bd11 Mon Sep 17 00:00:00 2001 From: Josako Date: Sun, 12 May 2024 21:58:42 +0200 Subject: [PATCH] Prepare for html document validation (added wanted tags to tenant) --- common/models/user.py | 3 + common/utils/database.py | 8 +- eveai_app/views/document_views.py | 2 +- eveai_app/views/user_forms.py | 4 + eveai_workers/tasks.py | 171 +++++++++++++++++++++++------- requirements.txt | 3 +- 6 files changed, 146 insertions(+), 45 deletions(-) diff --git a/common/models/user.py b/common/models/user.py index 86513ab..70a3d2d 100644 --- a/common/models/user.py +++ b/common/models/user.py @@ -29,6 +29,9 @@ class Tenant(db.Model): default_llm_model = db.Column(db.String(50), nullable=True) allowed_llm_models = db.Column(ARRAY(sa.String(50)), nullable=True) + # Embedding variables + html_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']) + # Licensing Information license_start_date = db.Column(db.Date, nullable=True) license_end_date = db.Column(db.Date, nullable=True) diff --git a/common/utils/database.py b/common/utils/database.py index 1cf95bc..c7e40fc 100644 --- a/common/utils/database.py +++ b/common/utils/database.py @@ -4,6 +4,7 @@ from sqlalchemy import text from sqlalchemy.schema import CreateSchema from sqlalchemy.exc import InternalError from sqlalchemy.orm import sessionmaker, scoped_session +from sqlalchemy.exc import SQLAlchemyError from flask import current_app from common.extensions import db, migrate @@ -35,9 +36,10 @@ class Database: """create new database schema, mostly used on tenant creation""" try: db.session.execute(CreateSchema(self.schema)) - db.session.execute(text(f"CREATE EXTENSION IF NOT EXISTS pgvector SCHEMA {self.schema}")) + # db.session.commit() + db.session.execute(text(f"SET search_path TO {self.schema}, public")) db.session.commit() - except InternalError as e: + except SQLAlchemyError as e: db.session.rollback() db.session.close() current_app.logger.error(f"Error creating schema {self.schema}: {e.args}") @@ -48,7 +50,7 @@ class Database: def switch_schema(self): """switch between tenant/public database schema""" - db.session.execute(text(f'set search_path to "{self.schema}"')) + db.session.execute(text(f'set search_path to "{self.schema}", public')) db.session.commit() def migrate_tenant_schema(self): diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py index b68daa3..b105957 100644 --- a/eveai_app/views/document_views.py +++ b/eveai_app/views/document_views.py @@ -40,7 +40,7 @@ def add_document(): create_document_stack(form, file, filename, extension) - return redirect(url_for('document_bp/documents')) + return redirect(url_for('document_bp.documents')) return render_template('document/add_document.html', form=form) diff --git a/eveai_app/views/user_forms.py b/eveai_app/views/user_forms.py index 302c557..4a7918c 100644 --- a/eveai_app/views/user_forms.py +++ b/eveai_app/views/user_forms.py @@ -21,6 +21,10 @@ class TenantForm(FlaskForm): license_start_date = DateField('License Start Date', id='form-control datepicker', validators=[Optional()]) license_end_date = DateField('License End Date', id='form-control datepicker', validators=[Optional()]) allowed_monthly_interactions = IntegerField('Allowed Monthly Interactions', validators=[NumberRange(min=0)]) + # Embedding variables + html_tags = StringField('HTML Tags', validators=[DataRequired(), Length(max=255)], + default='p, h1, h2, h3, h4, h5, h6, li') + submit = SubmitField('Submit') def __init__(self, *args, **kwargs): diff --git a/eveai_workers/tasks.py b/eveai_workers/tasks.py index a655d2b..9f04823 100644 --- a/eveai_workers/tasks.py +++ b/eveai_workers/tasks.py @@ -1,5 +1,7 @@ from datetime import datetime as dt, timezone as tz from flask import current_app +from sqlalchemy.exc import SQLAlchemyError + import os # Unstructured commercial client imports @@ -19,6 +21,8 @@ from common.models.document import DocumentVersion, EmbeddingMistral, EmbeddingS from common.extensions import db from common.utils.celery_utils import current_celery +from bs4 import BeautifulSoup + @current_celery.task(name='create_embeddings', queue='embeddings') def create_embeddings(tenant_id, document_version_id, default_embedding_model): @@ -44,11 +48,18 @@ def create_embeddings(tenant_id, document_version_id, default_embedding_model): # start processing document_version.processing = True document_version.processing_started_at = dt.now(tz.utc) - db.session.commit() + try: + db.session.commit() + except SQLAlchemyError as e: + current_app.logger.error(f'Error saving document version {document_version_id} to database ' + f'for tenant {tenant_id} when creating embeddings. ' + f'error: {e}') + return embed_provider = default_embedding_model.rsplit('.', 1)[0] embed_model = default_embedding_model.rsplit('.', 1)[1] # define embedding variables + embedding_function = None match (embed_provider, embed_model): case ('openai', 'text-embedding-3-small'): embedding_function = embed_chunks_for_text_embedding_3_small @@ -57,37 +68,13 @@ def create_embeddings(tenant_id, document_version_id, default_embedding_model): match document_version.file_type: case 'pdf': - file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], - document_version.file_location, - document_version.file_name) - if os.path.exists(file_path): - with open(file_path, 'rb') as f: - files = shared.Files(content=f.read(), file_name=document_version.file_name) - req = shared.PartitionParameters( - files=files, - strategy='hi_res', - hi_res_model_name='yolox', - coordinates=True, - extract_image_block_types=['Image', 'Table'], - chunking_strategy='by_title', - combine_under_n_chars=2000, - max_characters=3000, - ) - try: - chunks = partition_doc_unstructured(tenant_id, document_version, req) - enriched_chunk_docs = enrich_chunks(tenant_id, document_version, chunks) - embedding_function(tenant_id, document_version, enriched_chunk_docs) - except Exception as e: - current_app.logger.error(f'Unable to create Embeddings for tenant {tenant_id} ' - f'on document version {document_version.id} ' - f'with model {default_embedding_model} ' - f'error: {e}') - return - - else: # file exists - current_app.logger.error(f'The physical file for document version {document_version_id} ' - f'at {file_path} does not exist') - return + process_pdf(tenant_id, document_version, embedding_function, default_embedding_model) + case 'html': + process_html(tenant_id, document_version, embedding_function, default_embedding_model) + case _: + current_app.logger.info(f'No functionality defined for file type {document_version.file_type} ' + f'for tenant {tenant_id} ' + f'while creating embeddings for document version {document_version_id}') @current_celery.task(name='ask_eve_ai', queue='llm_interactions') @@ -96,6 +83,67 @@ def ask_eve_ai(query): pass +def process_pdf(tenant_id, document_version, embedding_function, embedding_model): + file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], + document_version.file_location, + document_version.file_name) + if os.path.exists(file_path): + with open(file_path, 'rb') as f: + files = shared.Files(content=f.read(), file_name=document_version.file_name) + req = shared.PartitionParameters( + files=files, + strategy='hi_res', + hi_res_model_name='yolox', + coordinates=True, + extract_image_block_types=['Image', 'Table'], + chunking_strategy='by_title', + combine_under_n_chars=2000, + max_characters=3000, + ) + try: + chunks = partition_doc_unstructured(tenant_id, document_version, req) + enriched_chunk_docs = enrich_chunks(tenant_id, document_version, chunks) + embeddings = embedding_function(tenant_id, document_version, enriched_chunk_docs) + except Exception as e: + current_app.logger.error(f'Unable to create Embeddings for tenant {tenant_id} ' + f'on document version {document_version.id} ' + f'with model {embedding_model} ' + f'error: {e}') + raise + + # Save embeddings & processing information to the database + db.session.add_all(embeddings) + db.session.add(document_version) + document_version.processing_finished_at = dt.now(tz.utc) + document_version.processing = False + + try: + db.session.commit() + except SQLAlchemyError as e: + current_app.logger.error(f'Error saving embedding information for tenant {tenant_id} ' + f'on document version {document_version.id}' + f'error: {e}') + db.session.rollback() + raise + + current_app.logger.info(f'Embeddings created successfully for tenant {tenant_id} ' + f'on document version {document_version.id} :-)') + else: # file exists + current_app.logger.error(f'The physical file for document version {document_version.id} ' + f'at {file_path} does not exist') + raise + + +def process_html(tenant_id, document_version, embedding_function, default_embedding_model): + file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], + document_version.file_location, + document_version.file_name) + if os.path.exists(file_path): + with open(file_path, 'rb') as f: + html_content = f.read() + + + def enrich_chunks(tenant_id, document_version, chunks): # We're adding filename and a summary of the first chunk to all the chunks to create global context # using openAI to summarise @@ -109,15 +157,31 @@ def enrich_chunks(tenant_id, document_version, chunks): chain = load_summarize_chain(llm, chain_type='stuff', prompt=prompt) doc_creator = CharacterTextSplitter(chunk_size=9000, chunk_overlap=0) - text_to_summarize = doc_creator.create_documents(chunks[0]['text']) + text_to_summarize = doc_creator.create_documents(chunks[0]) try: summary = chain.run(text_to_summarize) - chunk_global_context = f'Filename: {document_version.file_name}\nSummary:\n {summary}' + doc_lang = document_version.document_language + db.session.add(doc_lang) + doc_lang.system_context = f'Summary:\n {summary}' + try: + db.session.commit() + except SQLAlchemyError as e: + current_app.logger. error(f'Error saving summary to DocumentLanguage {doc_lang.id} ' + f'while enriching chunks for tenant {tenant_id} ' + f'on document version {document_version.id} ' + f'error: {e}') + db.session.rollback() + raise + + chunk_global_context = (f'Filename: {doc_lang.document.name}\n' + f'User Context:\n{doc_lang.user_context}' + f'System Context:\n{summary}') enriched_chunks = [] + initial_chunk = f'Filename: {document_version.file_name}\n User Context:\n{doc_lang.user_context}\n{chunks[0]}' + enriched_chunks.append(initial_chunk) for chunk in chunks[1:]: - enriched_chunk_raw = f'{chunk_global_context}\n{chunk}' - enriched_chunk_doc = doc_creator.create_documents([enriched_chunk_raw]) - enriched_chunks.append(enriched_chunk_doc) + enriched_chunk = f'{chunk_global_context}\n{chunk}' + enriched_chunks.append(enriched_chunk) return enriched_chunks @@ -139,7 +203,7 @@ def partition_doc_unstructured(tenant_id, document_version, unstructured_request chunks = [] for el in res.elements: match el['type']: - case 'Composite_element': + case 'CompositeElement': chunks.append(el['text']) case 'Image': pass @@ -165,11 +229,38 @@ def embed_chunks_for_text_embedding_3_small(tenant_id, document_version, chunks) f'error: {e}') raise + # Add embeddings to the database + new_embeddings = [] for chunk, embedding in zip(chunks, embeddings): new_embedding = EmbeddingSmallOpenAI() - # TODO: continue here - return embeddings + new_embedding.document_version = document_version + new_embedding.active = True + new_embedding.chunk = chunk + new_embedding.embedding = embedding + new_embeddings.append(new_embedding) + + return new_embeddings def embed_chunks_for_mistral_embed(tenant_id, document_version, chunks): pass + + +def parse_html(html_content, included_elements=None, excluded_elements=None): + soup = BeautifulSoup(html_content, 'html.parser') + extracted_content = [] + + if included_elements: + elements_to_parse = soup.find_all(included_elements) + else: + elements_to_parse = [soup] # parse the entire document if no included_elements specified + + # Iterate through the found included elements + for element in elements_to_parse: + # Find all specified tags within each included element + for sub_element in element.find_all(tags): + if excluded_elements and sub_element.find_parent(excluded_elements): + continue # Skip this sub_element if it's within any of the excluded_elements + extracted_content.append((sub_element.name, sub_element.get_text(strip=True))) + + return extracted_content diff --git a/requirements.txt b/requirements.txt index 6ec3822..05feb11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ gevent~=24.2.1 celery~=5.4.0 kombu~=5.3.7 langchain~=0.1.17 -requests~=2.31.0 \ No newline at end of file +requests~=2.31.0 +beautifulsoup4~=4.12.3 \ No newline at end of file