- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Initial adaptation of the chat client
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -4,10 +4,12 @@ from flask import Flask
import os
from common.utils.celery_utils import make_celery, init_celery
from common.extensions import db, minio_client
from config.logging_config import LOGGING
from common.extensions import db, minio_client, template_manager, cache_manager
import config.logging_config as logging_config
from config.config import get_config
from . import processors
def create_app(config_file=None):
app = Flask(__name__)
@@ -22,8 +24,7 @@ def create_app(config_file=None):
case _:
app.config.from_object(get_config('dev'))
logging.config.dictConfig(LOGGING)
app.embed_tuning_logger = logging.getLogger('embed_tuning')
logging.config.dictConfig(logging_config.LOGGING)
register_extensions(app)
@@ -41,6 +42,8 @@ def create_app(config_file=None):
def register_extensions(app):
db.init_app(app)
minio_client.init_app(app)
cache_manager.init_app(app)
template_manager.init_app(app)
app, celery = create_app()
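Note: cache_manager and template_manager appear to follow the standard Flask extension pattern, i.e. module-level singletons in common.extensions whose init_app binds them to the application. Their implementation is not part of this diff; a sketch of the assumed pattern:

# sketch of the init_app extension pattern assumed for cache_manager / template_manager
class TemplateManager:
    def __init__(self, app=None):
        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        # pull settings from app.config and register this instance on the app
        app.extensions['template_manager'] = self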

View File

@@ -0,0 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
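Registration here is an import side effect: each module calls ProcessorRegistry.register(...) at the bottom of the file, so importing this package is what populates the registry. Adding a processor under that convention might look like the following sketch (module, class, and type key are hypothetical, and the key would also need an entry in PROCESSOR_TYPES):

# hypothetical new module, e.g. docx_processor.py
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry

class DocxProcessor(BaseProcessor):
    def process(self):
        return "", ""  # (markdown, title)

# runs at import time; raises ValueError if "DOCX_PROCESSOR" is not in PROCESSOR_TYPES
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

The new module would then be added to the import and __all__ lists above so that importing the package triggers the registration.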

View File

@@ -8,20 +8,20 @@ import tempfile
from common.extensions import minio_client
import subprocess
from .transcription_processor import TranscriptionProcessor
from .processor_registry import ProcessorRegistry
from .transcription_processor import TranscriptionBaseProcessor
from common.utils.business_event_context import current_event
class AudioProcessor(TranscriptionProcessor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.transcription_client = model_variables['transcription_client']
self.transcription_model = model_variables['transcription_model']
class AudioProcessor(TranscriptionBaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.transcription_model = model_variables.transcription_model
self.ffmpeg_path = 'ffmpeg'
self.max_compression_duration = model_variables['max_compression_duration']
self.max_transcription_duration = model_variables['max_transcription_duration']
self.compression_cpu_limit = model_variables.get('compression_cpu_limit', 50) # CPU usage limit in percentage
self.compression_process_delay = model_variables.get('compression_process_delay', 0.1) # Delay between processing chunks in seconds
self.max_compression_duration = model_variables.max_compression_duration
self.max_transcription_duration = model_variables.max_transcription_duration
self.compression_cpu_limit = model_variables.compression_cpu_limit # CPU usage limit in percentage
self.compression_process_delay = model_variables.compression_process_delay # Delay between processing chunks in seconds
self.file_type = document_version.file_type
def _get_transcription(self):
@@ -39,26 +39,25 @@ class AudioProcessor(TranscriptionProcessor):
return transcription
def _compress_audio(self, audio_data):
self._log("Compressing audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
temp_file.write(audio_data)
temp_file_path = temp_file.name
try:
self._log("Creating AudioSegment from file")
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
self._log("Finished creating AudioSegment from file")
total_duration = len(audio_info)
self._log(f"Audio duration: {total_duration / 1000} seconds")
self._log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
total_chunks = (total_duration + segment_length - 1) // segment_length
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log(f"Compressing segment {i + 1} of {total_chunks}")
self._log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
start_time = i * segment_length
end_time = min((i + 1) * segment_length, total_duration)
@@ -88,7 +87,9 @@ class AudioProcessor(TranscriptionProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
self._log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
return compressed_segments
@@ -131,7 +132,6 @@ class AudioProcessor(TranscriptionProcessor):
return compressed_segment
def _transcribe_audio(self, audio_data):
self._log("Starting audio transcription")
# audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio = audio_data
@@ -140,7 +140,6 @@ class AudioProcessor(TranscriptionProcessor):
total_chunks = len(audio) // segment_length + 1
for i, chunk in enumerate(audio[::segment_length]):
self._log(f'Processing chunk {i + 1} of {total_chunks}')
segment_duration = 0
if i == total_chunks - 1:
segment_duration = (len(audio) % segment_length) // 1000
@@ -153,37 +152,34 @@ class AudioProcessor(TranscriptionProcessor):
try:
file_size = os.path.getsize(temp_audio.name)
self._log(f"Temporary audio file size: {file_size} bytes")
with open(temp_audio.name, 'rb') as audio_file:
file_start = audio_file.read(100)
self._log(f"First 100 bytes of audio file: {file_start}")
audio_file.seek(0) # Reset file pointer to the beginning
self._log("Calling transcription API")
transcription = self.model_variables.transcribe(
transcription = self.model_variables.transcription_model.transcribe(
file=audio_file,
model=self.transcription_model,
language=self.document_version.language,
response_format='verbose_json',
duration=segment_duration,
duration=segment_duration
)
self._log("Transcription API call completed")
if transcription:
trans = ""
# Handle the transcription result based on its type
if isinstance(transcription, str):
self._log(f"Transcription result (string): {transcription[:100]}...")
transcriptions.append(transcription)
trans = transcription
elif hasattr(transcription, 'text'):
self._log(
f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
transcriptions.append(transcription.text)
trans = transcription.text
else:
self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
transcriptions.append(str(transcription))
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -206,7 +202,10 @@ class AudioProcessor(TranscriptionProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log(f"Saved transcription to MinIO: {transcription_filename}")
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription
# Register the processor
ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)

View File

@@ -1,14 +1,42 @@
from abc import ABC, abstractmethod
from typing import Dict, Any
from flask import current_app
from common.extensions import minio_client
from config.logging_config import TuningLogger
class Processor(ABC):
def __init__(self, tenant, model_variables, document_version):
class BaseProcessor(ABC):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
self.tenant = tenant
self.model_variables = model_variables
self.document_version = document_version
self.embed_tuning = model_variables['embed_tuning']
self.catalog = catalog
self.processor = processor
self.tuning = processor.tuning if processor else False
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
})
def _setup_tuning_logger(self):
try:
self.tuning_logger = TuningLogger(
'tuning',
tenant_id=self.tenant.id if self.tenant else None,
catalog_id=self.catalog.id if self.catalog else None,
processor_id=self.processor.id if self.processor else None,
)
# Verify logger is working with a test message
if self.tuning:
self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
except Exception as e:
current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
raise
@abstractmethod
def process(self):
@@ -50,3 +78,11 @@ class Processor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)
except Exception as e:
current_app.logger.error(f"Processor: Error in tuning logging: {e}")

View File

@@ -4,21 +4,34 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
from common.utils.string_list_converter import StringListConverter as SLC
class HTMLProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.html_tags = model_variables['html_tags']
self.html_end_tags = model_variables['html_end_tags']
self.html_included_elements = model_variables['html_included_elements']
self.html_excluded_elements = model_variables['html_excluded_elements']
self.html_excluded_classes = model_variables['html_excluded_classes']
self.chunk_size = model_variables['processing_chunk_size'] # Adjust this based on your LLM's optimal input size
self.chunk_overlap = model_variables[
'processing_chunk_overlap'] # Adjust for context preservation between chunks
class HTMLProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
cat_conf = catalog.configuration
proc_conf = processor.configuration
self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
self.tuning = self.processor.tuning
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
"excluded_elements": self.html_excluded_elements
})
self.chunk_size = catalog.max_chunk_size
def process(self):
self._log("Starting HTML processing")
@@ -62,13 +75,14 @@ class HTMLProcessor(Processor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
llm = self.model_variables['llm']
template = self.model_variables['html_parse_template']
llm = self.model_variables.get_llm()
template = self.model_variables.get_template("html_parse")
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
@@ -79,13 +93,10 @@ class HTMLProcessor(Processor):
markdown_chunks = []
for chunk in chunks:
if self.embed_tuning:
self._log(f'Processing chunk: \n{chunk}\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if self.embed_tuning:
self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
@@ -146,3 +157,7 @@ class HTMLProcessor(Processor):
def _extract_element_content(self, element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
# Register the processor
ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)

View File

@@ -9,18 +9,18 @@ from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
class PDFProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
# PDF-specific initialization
self.chunk_size = model_variables['processing_chunk_size']
self.chunk_overlap = model_variables['processing_chunk_overlap']
self.min_chunk_size = model_variables['processing_min_chunk_size']
self.max_chunk_size = model_variables['processing_max_chunk_size']
class PDFProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
def process(self):
self._log("Starting PDF processing")
@@ -38,7 +38,8 @@ class PDFProcessor(Processor):
with current_event.create_span("Markdown Generation"):
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
self._save_markdown(markdown)
self._save_markdown(markdown)
self._log("Finished processing PDF")
return markdown, title
except Exception as e:
@@ -56,19 +57,10 @@ class PDFProcessor(Processor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
if self.embed_tuning:
self._log(f'Extracted PDF Content for page {page_num + 1}')
self._log(f"{page_content }")
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
# if self.embed_tuning:
# current_app.embed_tuning_logger.debug(f'Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'---------------------')
# current_app.embed_tuning_logger.debug(f'Page: {page_content}')
# current_app.embed_tuning_logger.debug(f'End of Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'----------------------------')
return extracted_content
def _extract_figures(self, page, page_num, figure_counter):
@@ -127,6 +119,7 @@ class PDFProcessor(Processor):
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables
@@ -202,7 +195,7 @@ class PDFProcessor(Processor):
for table in page['tables']:
structured_content += f"\n{table}\n"
if self.embed_tuning:
if self.tuning:
self._save_intermediate(structured_content, "structured_content.md")
return structured_content, title
@@ -217,8 +210,8 @@ class PDFProcessor(Processor):
return text_splitter.split_text(content)
def _process_chunks_with_llm(self, chunks):
llm = self.model_variables['llm']
template = self.model_variables['pdf_parse_template']
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('pdf_parse')
pdf_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
@@ -232,3 +225,7 @@ class PDFProcessor(Processor):
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)

View File

@@ -0,0 +1,92 @@
from typing import Dict, Type, Optional
from flask import current_app
from config.processor_types import PROCESSOR_TYPES
from .base_processor import BaseProcessor
class ProcessorRegistry:
"""Registry for processor types that aligns with PROCESSOR_TYPES configuration"""
_registry: Dict[str, Type[BaseProcessor]] = {}
@classmethod
def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
"""
Register a new processor type that must match a type in PROCESSOR_TYPES
Args:
processor_type: Type identifier from PROCESSOR_TYPES
processor_class: Processor implementation class
Raises:
ValueError: If processor_type isn't defined in PROCESSOR_TYPES
"""
if processor_type not in PROCESSOR_TYPES:
raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
cls._registry[processor_type] = processor_class
@classmethod
def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
"""
Get the processor class for a given processor type
Args:
processor_type: Type identifier from PROCESSOR_TYPES
Returns:
The registered processor class
Raises:
ValueError: If no processor is registered for the given type
"""
if processor_type not in cls._registry:
raise ValueError(f"No processor registered for type: {processor_type}")
return cls._registry[processor_type]
@classmethod
def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
"""
Find appropriate processor for a file type by checking PROCESSOR_TYPES definitions
Args:
file_type: File extension (e.g., 'html', 'pdf')
Returns:
Tuple of (processor_type, processor_class)
Raises:
ValueError: If no processor is found for the file type
"""
# First find which processor type handles this file type
for proc_type, config in PROCESSOR_TYPES.items():
# Check if file_type is in the supported file_types (handling both string and list formats)
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
# Get the registered processor class for this type
if proc_type in cls._registry:
return proc_type, cls._registry[proc_type]
else:
raise ValueError(
f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
raise ValueError(f"No processor type found for file type: {file_type}")
@classmethod
def validate_processor_registration(cls):
"""
Validate that all PROCESSOR_TYPES have registered processors
Raises:
ValueError: If any processor type lacks a registered processor
"""
missing_processors = []
for proc_type in PROCESSOR_TYPES.keys():
if proc_type not in cls._registry:
missing_processors.append(proc_type)
if missing_processors:
raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")

View File

@@ -1,9 +1,9 @@
from common.extensions import minio_client
from .transcription_processor import TranscriptionProcessor
from .transcription_processor import TranscriptionBaseProcessor
import re
class SRTProcessor(TranscriptionProcessor):
class SRTProcessor(TranscriptionBaseProcessor):
def _get_transcription(self):
file_data = minio_client.download_document_file(
self.tenant.id,

View File

@@ -5,15 +5,15 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
class TranscriptionProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.chunk_size = model_variables['processing_chunk_size']
self.chunk_overlap = model_variables['processing_chunk_overlap']
class TranscriptionBaseProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.annotation_chunk_size = model_variables.annotation_chunk_length
self.annotation_chunk_overlap = 0
def process(self):
self._log("Starting Transcription processing")
@@ -37,17 +37,17 @@ class TranscriptionProcessor(Processor):
def _chunk_transcription(self, transcription):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunk_size=self.annotation_chunk_size,
chunk_overlap=self.annotation_chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log("Generating markdown from transcription")
llm = self.model_variables['llm']
template = self.model_variables['transcript_template']
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -58,14 +58,18 @@ class TranscriptionProcessor(Processor):
markdown_chunks = []
previous_part = ""
for i, chunk in enumerate(chunks):
self._log(f"Processing chunk {i + 1} of {len(chunks)}")
self._log(f"Previous part: {previous_part}")
input_transcript = {
'transcript': chunk,
'previous_part': previous_part
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,
"Markdown": markdown,
})
markdown_chunks.append(markdown)
# Extract the last part for the next iteration
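For reference, the splitter configured in _chunk_transcription, restated standalone (the chunk-size value is assumed, and the import path varies across LangChain versions):

from langchain_text_splitters import RecursiveCharacterTextSplitter  # older versions: langchain.text_splitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,   # annotation_chunk_length from ModelVariables; value assumed
    chunk_overlap=0,   # overlap is now fixed at 0
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_text("...full transcription text...")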

View File

@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding, Document
from common.extensions import db, minio_client, template_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
from common.models.user import Tenant
from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
from common.utils.model_utils import create_language_template, get_model_variables
from common.utils.business_event import BusinessEvent
from common.utils.business_event_context import current_event
from config.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
# Healthcheck task
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
# Retrieve the Catalog ID
doc = Document.query.get_or_404(document_version.doc_id)
catalog_id = doc.catalog_id
catalog = Catalog.query.get_or_404(catalog_id)
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant, catalog_id=catalog_id)
current_app.logger.debug(f'Model variables: {model_variables}')
model_variables = get_model_variables(tenant_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
f'for non existing document version {document_version_id} '
f'for badly configured document version {document_version_id} '
f'for tenant {tenant_id}, '
f'error: {e}')
raise
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
match document_version.file_type:
case 'pdf':
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case 'srt':
process_srt(tenant, model_variables, document_version)
case 'mp4' | 'mp3' | 'ogg':
process_audio(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
with current_event.create_span(f"{processor_type} Processing"):
document_processor = processor_class(
tenant=tenant,
model_variables=model_variables,
document_version=document_version,
catalog=catalog,
processor=processor
)
markdown, title = document_processor.process()
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
current_event.log("Finished Embedding Creation Task")
except Exception as e:
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
raise
def process_pdf(tenant, model_variables, document_version):
with current_event.create_span("PDF Processing"):
processor = PDFProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_html(tenant, model_variables, document_version):
with current_event.create_span("HTML Processing"):
processor = HTMLProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_audio(tenant, model_variables, document_version):
with current_event.create_span("Audio Processing"):
processor = AudioProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_srt(tenant, model_variables, document_version):
with current_event.create_span("SRT Processing"):
processor = SRTProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def embed_markdown(tenant, model_variables, document_version, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = ''
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
enriched_chunk = f'{chunk_total_context}\n{chunk}'
enriched_chunks.append(enriched_chunk)
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
return enriched_chunks
def summarize_chunk(tenant, model_variables, document_version, chunk):
current_event.log("Starting Summarizing Chunk")
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['summary_template']
llm = model_variables.get_llm()
template = model_variables.get_template("summary")
language_template = create_language_template(template, document_version.language)
summary_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
chain = setup | summary_prompt | llm | output_parser
try:
current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = chain.invoke({"text": chunk})
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}.')
current_event.log("Finished Summarizing Chunk")
return summary
except LangChainException as e:
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']
embedding_model = model_variables.embedding_model
try:
embeddings = embedding_model.embed_documents(chunks)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} while calling OpenAI API'
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
# Add embeddings to the database
new_embeddings = []
for chunk, embedding in zip(chunks, embeddings):
new_embedding = model_variables['embedding_db_model']()
new_embedding = model_variables.embedding_model_class()
new_embedding.document_version = document_version
new_embedding.active = True
new_embedding.chunk = chunk
new_embedding.embedding = embedding
new_embeddings.append(new_embedding)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
return new_embeddings
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
return potential_chunks
except Exception as e:
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk)
return actual_chunks
def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
"""
Get the appropriate processor for a document based on catalog_id, file_type and optional sub_file_type.
Args:
catalog_id: ID of the catalog
file_type: Type of file (e.g., 'pdf', 'html')
sub_file_type: Optional sub-type for specialized processing
Returns:
Processor instance
Raises:
ValueError: If no matching processor is found
"""
try:
# Start with base query for catalog
query = Processor.query.filter_by(catalog_id=catalog_id)
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
matching_processor_type = proc_type
break
if not matching_processor_type:
raise ValueError(f"No processor type found for file type: {file_type}")
# Add processor type condition
query = query.filter_by(type=matching_processor_type)
# If sub_file_type is provided, add that condition
if sub_file_type:
query = query.filter_by(sub_file_type=sub_file_type)
else:
# If no sub_file_type, prefer processors without sub_file_type specification
query = query.filter(or_(Processor.sub_file_type.is_(None),
Processor.sub_file_type == ''))
# Get the first matching processor
processor = query.first()
if not processor:
if sub_file_type:
raise ValueError(
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
f"file type {file_type}, sub-type {sub_file_type}"
)
else:
raise ValueError(
f"No processor found for catalog {catalog_id}, "
f"file type {file_type}"
)
return processor
except Exception as e:
current_app.logger.error(f"Error finding processor: {str(e)}")
raise
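End to end, the dispatch that replaces the old match statement reduces to: resolve the processor row, resolve the registered class, instantiate, process, embed. Condensed (objects and IDs stand in for the real ORM instances):

# condensed flow from create_embeddings; variables are illustrative
processor = get_processor_for_document(catalog.id, document_version.file_type,
                                       document_version.sub_file_type)
proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
document_processor = proc_cls(tenant=tenant, model_variables=model_variables,
                              document_version=document_version, catalog=catalog,
                              processor=processor)
markdown, title = document_processor.process()
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)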