- Introduction of dynamic Retrievers & Specialists
- Introduction of dynamic Processors
- Introduction of a caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start of the adaptation of the chat client
This commit is contained in:
@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
from common.extensions import db, minio_client
|
||||
from common.models.document import DocumentVersion, Embedding, Document
|
||||
from common.extensions import db, minio_client, template_manager
|
||||
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
|
||||
from common.models.user import Tenant
|
||||
from common.utils.celery_utils import current_celery
|
||||
from common.utils.database import Database
|
||||
from common.utils.model_utils import select_model_variables, create_language_template
|
||||
from common.utils.os_utils import safe_remove, sync_folder
|
||||
from eveai_workers.Processors.audio_processor import AudioProcessor
|
||||
from eveai_workers.Processors.html_processor import HTMLProcessor
|
||||
from eveai_workers.Processors.pdf_processor import PDFProcessor
|
||||
from eveai_workers.Processors.srt_processor import SRTProcessor
|
||||
from common.utils.model_utils import create_language_template, get_model_variables
|
||||
|
||||
from common.utils.business_event import BusinessEvent
|
||||
from common.utils.business_event_context import current_event
|
||||
from config.processor_types import PROCESSOR_TYPES
|
||||
from eveai_workers.processors.processor_registry import ProcessorRegistry
|
||||
|
||||
|
||||
# Healthcheck task
|
||||
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
# Retrieve the Catalog ID
|
||||
doc = Document.query.get_or_404(document_version.doc_id)
|
||||
catalog_id = doc.catalog_id
|
||||
catalog = Catalog.query.get_or_404(catalog_id)
|
||||
|
||||
# Select variables to work with depending on tenant and model
|
||||
model_variables = select_model_variables(tenant, catalog_id=catalog_id)
|
||||
current_app.logger.debug(f'Model variables: {model_variables}')
|
||||
model_variables = get_model_variables(tenant_id)
|
||||
|
||||
# Define processor related information
|
||||
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
|
||||
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
|
||||
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Create Embeddings request received '
|
||||
f'for non existing document version {document_version_id} '
|
||||
f'for badly configured document version {document_version_id} '
|
||||
f'for tenant {tenant_id}, '
|
||||
f'error: {e}')
|
||||
raise
|
||||
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
delete_embeddings_for_document_version(document_version)
|
||||
|
||||
try:
|
||||
match document_version.file_type:
|
||||
case 'pdf':
|
||||
process_pdf(tenant, model_variables, document_version)
|
||||
case 'html':
|
||||
process_html(tenant, model_variables, document_version)
|
||||
case 'srt':
|
||||
process_srt(tenant, model_variables, document_version)
|
||||
case 'mp4' | 'mp3' | 'ogg':
|
||||
process_audio(tenant, model_variables, document_version)
|
||||
case _:
|
||||
raise Exception(f'No functionality defined for file type {document_version.file_type} '
|
||||
f'for tenant {tenant_id} '
|
||||
f'while creating embeddings for document version {document_version_id}')
|
||||
with current_event.create_span(f"{processor_type} Processing"):
|
||||
document_processor = processor_class(
|
||||
tenant=tenant,
|
||||
model_variables=model_variables,
|
||||
document_version=document_version,
|
||||
catalog=catalog,
|
||||
processor=processor
|
||||
)
|
||||
markdown, title = document_processor.process()
|
||||
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
|
||||
|
||||
current_event.log("Finished Embedding Creation Task")
|
||||
|
||||
except Exception as e:
|
||||
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
|
||||
raise
|
||||
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
    """Convert a PDF document version to markdown, then embed it.

    The PDF processor runs inside its own tracing span; the resulting
    markdown and extracted title are handed to ``embed_markdown`` in a
    separate "Embedding" span.
    """
    with current_event.create_span("PDF Processing"):
        pdf_processor = PDFProcessor(tenant, model_variables, document_version)
        markdown, title = pdf_processor.process()

    # Chunk, enrich and persist embeddings for the generated markdown.
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
    """Convert an HTML document version to markdown, then embed it.

    Mirrors the other ``process_*`` helpers: one span for the HTML
    processor, one span for the embedding step.
    """
    with current_event.create_span("HTML Processing"):
        html_processor = HTMLProcessor(tenant, model_variables, document_version)
        markdown, title = html_processor.process()

    # Chunk, enrich and persist embeddings for the generated markdown.
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def process_audio(tenant, model_variables, document_version):
    """Transcribe an audio document version to markdown, then embed it.

    Mirrors the other ``process_*`` helpers: one span for the audio
    processor, one span for the embedding step.
    """
    with current_event.create_span("Audio Processing"):
        audio_processor = AudioProcessor(tenant, model_variables, document_version)
        markdown, title = audio_processor.process()

    # Chunk, enrich and persist embeddings for the generated markdown.
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def process_srt(tenant, model_variables, document_version):
    """Convert an SRT subtitle document version to markdown, then embed it.

    Mirrors the other ``process_*`` helpers: one span for the SRT
    processor, one span for the embedding step.
    """
    with current_event.create_span("SRT Processing"):
        srt_processor = SRTProcessor(tenant, model_variables, document_version)
        markdown, title = srt_processor.process()

    # Chunk, enrich and persist embeddings for the generated markdown.
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def embed_markdown(tenant, model_variables, document_version, markdown, title):
|
||||
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
|
||||
# Create potential chunks
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
|
||||
|
||||
# Combine chunks for embedding
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
|
||||
|
||||
# Enrich chunks
|
||||
with current_event.create_span("Enrich Chunks"):
|
||||
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
|
||||
|
||||
|
||||
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
|
||||
summary = ''
|
||||
if len(chunks) > 1:
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
enriched_chunk = f'{chunk_total_context}\n{chunk}'
|
||||
enriched_chunks.append(enriched_chunk)
|
||||
|
||||
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
|
||||
return enriched_chunks
|
||||
|
||||
|
||||
def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
current_event.log("Starting Summarizing Chunk")
|
||||
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
llm = model_variables['llm']
|
||||
template = model_variables['summary_template']
|
||||
llm = model_variables.get_llm()
|
||||
template = model_variables.get_template("summary")
|
||||
language_template = create_language_template(template, document_version.language)
|
||||
summary_prompt = ChatPromptTemplate.from_template(language_template)
|
||||
setup = RunnablePassthrough()
|
||||
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
chain = setup | summary_prompt | llm | output_parser
|
||||
|
||||
try:
|
||||
current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
summary = chain.invoke({"text": chunk})
|
||||
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}.')
|
||||
current_event.log("Finished Summarizing Chunk")
|
||||
return summary
|
||||
except LangChainException as e:
|
||||
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
|
||||
|
||||
def embed_chunks(tenant, model_variables, document_version, chunks):
|
||||
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
embedding_model = model_variables['embedding_model']
|
||||
embedding_model = model_variables.embedding_model
|
||||
|
||||
try:
|
||||
embeddings = embedding_model.embed_documents(chunks)
|
||||
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
except LangChainException as e:
|
||||
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
|
||||
f'on document version {document_version.id} while calling OpenAI API'
|
||||
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
|
||||
# Add embeddings to the database
|
||||
new_embeddings = []
|
||||
for chunk, embedding in zip(chunks, embeddings):
|
||||
new_embedding = model_variables['embedding_db_model']()
|
||||
new_embedding = model_variables.embedding_model_class()
|
||||
new_embedding.document_version = document_version
|
||||
new_embedding.active = True
|
||||
new_embedding.chunk = chunk
|
||||
new_embedding.embedding = embedding
|
||||
new_embeddings.append(new_embedding)
|
||||
|
||||
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
|
||||
|
||||
return new_embeddings
|
||||
|
||||
|
||||
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    """Emit embed-tuning diagnostics about the HTML parsing configuration.

    A no-op unless the tenant has ``embed_tuning`` enabled; otherwise each
    parsing parameter and the first parsed element are written to the
    application's dedicated embed-tuning logger.
    """
    if not tenant.embed_tuning:
        return

    tuning_log = current_app.embed_tuning_logger
    tuning_log.debug(f'Tags to parse: {tags}')
    tuning_log.debug(f'Included Elements: {included_elements}')
    tuning_log.debug(f'Excluded Elements: {excluded_elements}')
    tuning_log.debug(f'Excluded Classes: {excluded_classes}')
    tuning_log.debug(f'Found {len(elements_to_parse)} elements to parse')
    tuning_log.debug(f'First element to parse: {elements_to_parse[0]}')
|
||||
|
||||
|
||||
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
|
||||
try:
|
||||
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
|
||||
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
|
||||
md_header_splits = markdown_splitter.split_text(markdown)
|
||||
potential_chunks = [doc.page_content for doc in md_header_splits]
|
||||
|
||||
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
|
||||
return potential_chunks
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
|
||||
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
|
||||
actual_chunks.append(current_chunk)
|
||||
|
||||
return actual_chunks
|
||||
|
||||
|
||||
def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
    """
    Look up the Processor configured for a document in a given catalog.

    The processor type is resolved from PROCESSOR_TYPES by matching the
    document's file type; the catalog's processors are then narrowed to
    that type and, when supplied, to the sub file type.

    Args:
        catalog_id: ID of the catalog the document belongs to.
        file_type: Type of file (e.g., 'pdf', 'html').
        sub_file_type: Optional sub-type for specialized processing.

    Returns:
        Processor instance

    Raises:
        ValueError: If no matching processor is found
    """
    try:
        # Resolve which processor type declares support for this file type.
        matching_processor_type = None
        for proc_type, config in PROCESSOR_TYPES.items():
            supported_types = config['file_types']
            # file_types may be declared as a comma-separated string.
            if isinstance(supported_types, str):
                supported_types = [t.strip() for t in supported_types.split(',')]
            if file_type in supported_types:
                matching_processor_type = proc_type
                break

        if not matching_processor_type:
            raise ValueError(f"No processor type found for file type: {file_type}")

        # Narrow the catalog's processors down to the resolved type.
        query = Processor.query.filter_by(catalog_id=catalog_id)
        query = query.filter_by(type=matching_processor_type)

        if sub_file_type:
            query = query.filter_by(sub_file_type=sub_file_type)
        else:
            # No sub type requested: prefer processors that leave it unset.
            query = query.filter(or_(Processor.sub_file_type.is_(None),
                                     Processor.sub_file_type == ''))

        processor = query.first()

        if not processor:
            if sub_file_type:
                raise ValueError(
                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
                    f"file type {file_type}, sub-type {sub_file_type}"
                )
            else:
                raise ValueError(
                    f"No processor found for catalog {catalog_id}, "
                    f"file type {file_type}"
                )

        return processor

    except Exception as e:
        # Log-and-reraise boundary: callers decide how to handle the failure.
        current_app.logger.error(f"Error finding processor: {str(e)}")
        raise
|
||||
|
||||
Reference in New Issue
Block a user