Start working on chunking en embedding task. Continu with embeddings.

This commit is contained in:
Josako
2024-05-08 22:40:55 +02:00
parent 667d99daa8
commit a4bf837d67
2 changed files with 135 additions and 24 deletions

View File

@@ -51,11 +51,16 @@ class Config(object):
CELERY_TIMEZONE = 'UTC' CELERY_TIMEZONE = 'UTC'
CELERY_ENABLE_UTC = True CELERY_ENABLE_UTC = True
# LLM TEMPLATES
GPT4_SUMMARY_TEMPLATE = """Summarise the text in the same language as the provided text between triple backquotes.
```{context}```"""
class DevConfig(Config): class DevConfig(Config):
DEVELOPMENT = True DEVELOPMENT = True
DEBUG = True DEBUG = True
FLASK_DEBUG = True FLASK_DEBUG = True
PYCHARM_DEBUG = True
SQLALCHEMY_DATABASE_URI = 'postgresql+pg8000://josako@localhost:5432/eveAI' SQLALCHEMY_DATABASE_URI = 'postgresql+pg8000://josako@localhost:5432/eveAI'
SQLALCHEMY_BINDS = {'public': 'postgresql+pg8000://josako@localhost:5432/eveAI'} SQLALCHEMY_BINDS = {'public': 'postgresql+pg8000://josako@localhost:5432/eveAI'}
EXPLAIN_TEMPLATE_LOADING = False EXPLAIN_TEMPLATE_LOADING = False

View File

@@ -1,8 +1,19 @@
from datetime import datetime as dt, timezone as tz from datetime import datetime as dt, timezone as tz
from langchain_community.document_loaders.unstructured import UnstructuredAPIFileLoader
from flask import current_app from flask import current_app
import os import os
# Unstructured commercial client imports
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
# OpenAI imports
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.exceptions import LangChainException
from common.utils.database import Database from common.utils.database import Database
from common.models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI from common.models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI
from common.extensions import db from common.extensions import db
@@ -11,8 +22,11 @@ from common.utils.celery_utils import current_celery
@current_celery.task(name='create_embeddings', queue='embeddings') @current_celery.task(name='create_embeddings', queue='embeddings')
def create_embeddings(tenant_id, document_version_id, default_embedding_model): def create_embeddings(tenant_id, document_version_id, default_embedding_model):
import pydevd_pycharm # Setup Remote Debugging only if PYCHARM_DEBUG=True
pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True) if current_app.config['PYCHARM_DEBUG']:
import pydevd_pycharm
pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} ' current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} '
f'with model {default_embedding_model}') f'with model {default_embedding_model}')
@@ -32,38 +46,130 @@ def create_embeddings(tenant_id, document_version_id, default_embedding_model):
document_version.processing_started_at = dt.now(tz.utc) document_version.processing_started_at = dt.now(tz.utc)
db.session.commit() db.session.commit()
embedding_provider = default_embedding_model.rsplit('.', 1)[0] embed_provider = default_embedding_model.rsplit('.', 1)[0]
embedding_model = default_embedding_model.rsplit('.', 1)[1] embed_model = default_embedding_model.rsplit('.', 1)[1]
# define embedding variables # define embedding variables
match (embedding_provider, embedding_model): match (embed_provider, embed_model):
case ('openai', 'text-embedding-3-small'): case ('openai', 'text-embedding-3-small'):
embedding_model = EmbeddingSmallOpenAI() embedding_function = embed_chunks_for_text_embedding_3_small
case ('mistral', 'text-embedding-3-small'): case ('mistral', 'mistral.mistral-embed'):
embedding_model = EmbeddingMistral() embedding_function = embed_chunks_for_mistral_embed
match document_version.file_type: match document_version.file_type:
case 'pdf': case 'pdf':
url = current_app.config.get('UNSTRUCTURED_FULL_URL')
api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location, document_version.file_location,
document_version.file_name) document_version.file_name)
with open(file_path, 'rb') as f: if os.path.exists(file_path):
loader = UnstructuredAPIFileLoader(f, with open(file_path, 'rb') as f:
url=url, files = shared.Files(content=f.read(), file_name=document_version.file_name)
api_key=api_key, req = shared.PartitionParameters(
mode='elements', files=files,
strategy='hi-res', strategy='hi_res',
include_page_breaks=True, hi_res_model_name='yolox',
unique_element_ids=True, coordinates=True,
chunking_strategy='by_title', extract_image_block_types=['Image', 'Table'],
max_characters=3000, chunking_strategy='by_title',
) combine_under_n_chars=2000,
documents = loader.load() max_characters=3000,
print(documents) )
try:
chunks = partition_doc_unstructured(tenant_id, document_version, req)
enriched_chunk_docs = enrich_chunks(tenant_id, document_version, chunks)
embedding_function(tenant_id, document_version, enriched_chunk_docs)
except Exception as e:
current_app.logger.error(f'Unable to create Embeddings for tenant {tenant_id} '
f'on document version {document_version.id} '
f'with model {default_embedding_model} '
f'error: {e}')
return
else: # file exists
current_app.logger.error(f'The physical file for document version {document_version_id} '
f'at {file_path} does not exist')
return
@current_celery.task(name='ask_eve_ai', queue='llm_interactions') @current_celery.task(name='ask_eve_ai', queue='llm_interactions')
def ask_eve_ai(query): def ask_eve_ai(query):
# Interaction logic with LLMs like GPT (Langchain API calls, etc.) # Interaction logic with LLMs like GPT (Langchain API calls, etc.)
pass pass
def enrich_chunks(tenant_id, document_version, chunks):
# We're adding filename and a summary of the first chunk to all the chunks to create global context
# using openAI to summarise
api_key = current_app.config.get('OPENAI_API_KEY')
# TODO: model selection to be adapted to model approach
llm = ChatOpenAI(api_key=api_key, temperature=0, model='gpt-4-turbo')
summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
prompt = ChatPromptTemplate.from_template(summary_template)
chain = load_summarize_chain(llm, chain_type='stuff', prompt=prompt)
doc_creator = CharacterTextSplitter(chunk_size=9000, chunk_overlap=0)
text_to_summarize = doc_creator.create_documents(chunks[0]['text'])
try:
summary = chain.run(text_to_summarize)
chunk_global_context = f'Filename: {document_version.file_name}\nSummary:\n {summary}'
enriched_chunks = []
for chunk in chunks[1:]:
enriched_chunk_raw = f'{chunk_global_context}\n{chunk}'
enriched_chunk_doc = doc_creator.create_documents([enriched_chunk_raw])
enriched_chunks.append(enriched_chunk_doc)
return enriched_chunks
except LangChainException as e:
current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant_id} '
f'on document version {document_version.id} '
f'error: {e}')
raise
def partition_doc_unstructured(tenant_id, document_version, unstructured_request):
# Initiate the connection to unstructured.io
url = current_app.config.get('UNSTRUCTURED_FULL_URL')
api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
unstructured_client = UnstructuredClient(server_url=url, api_key_auth=api_key)
try:
res = unstructured_client.general.partition(unstructured_request)
chunks = []
for el in res.elements:
match el['type']:
case 'Composite_element':
chunks.append(el['text'])
case 'Image':
pass
case 'Table':
chunks.append(el['metadata']['text_as_html'])
return chunks
except SDKError as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version.id} while chuncking'
f'error: {e}')
raise
def embed_chunks_for_text_embedding_3_small(tenant_id, document_version, chunks):
# Create embedding vectors using OpenAI
api_key = current_app.config.get('OPENAI_API_KEY')
embeddings_model = OpenAIEmbeddings(api_key=api_key, model='text-embedding-3-small')
try:
embeddings = embeddings_model.embed_documents(chunks)
except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
f'on document version {document_version.id} while calling OpenAI API'
f'error: {e}')
raise
for chunk, embedding in zip(chunks, embeddings):
new_embedding = EmbeddingSmallOpenAI()
# TODO: continue here
return embeddings
def embed_chunks_for_mistral_embed(tenant_id, document_version, chunks):
pass