- Introduction of dynamic Retrievers & Specialists
- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
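A pattern that recurs through this commit: dict-style lookups on model_variables (model_variables['llm'], model_variables['summary_template']) give way to attribute and getter access (model_variables.get_llm(), model_variables.get_template("summary"), model_variables.embedding_model). A minimal sketch of the interface the new call sites rely on; the method names come from the diff, the internals here are placeholders, not the actual implementation:

```python
# Sketch only: the ModelVariables surface implied by the call sites below.
# The real class lives elsewhere in the codebase and will differ.
class ModelVariablesSketch:
    def __init__(self, llm, templates, embedding_model):
        self.llm = llm
        self.embedding_model = embedding_model
        self._templates = templates  # e.g. {"summary": "...", "pdf_parse": "..."}

    def get_llm(self):
        # Replaces model_variables['llm']
        return self.llm

    def get_template(self, name):
        # Replaces model_variables[f'{name}_template'] style lookups
        return self._templates[name]
```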
eveai_workers/app.py

@@ -4,10 +4,12 @@ from flask import Flask
 import os
 from common.utils.celery_utils import make_celery, init_celery
-from common.extensions import db, minio_client
-from config.logging_config import LOGGING
+from common.extensions import db, minio_client, template_manager, cache_manager
+import config.logging_config as logging_config
 from config.config import get_config
+
+from . import processors


 def create_app(config_file=None):
     app = Flask(__name__)
@@ -22,8 +24,7 @@ def create_app(config_file=None):
         case _:
             app.config.from_object(get_config('dev'))

-    logging.config.dictConfig(LOGGING)
-    app.embed_tuning_logger = logging.getLogger('embed_tuning')
+    logging.config.dictConfig(logging_config.LOGGING)

     register_extensions(app)

@@ -41,6 +42,8 @@ def create_app(config_file=None):
 def register_extensions(app):
     db.init_app(app)
     minio_client.init_app(app)
+    cache_manager.init_app(app)
+    template_manager.init_app(app)


 app, celery = create_app()
eveai_workers/processors/__init__.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+# Import all processor implementations to ensure registration
+from . import audio_processor, html_processor, pdf_processor
+
+# List of all available processor implementations
+__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
eveai_workers/processors/audio_processor.py

@@ -8,20 +8,20 @@ import tempfile
 from common.extensions import minio_client
 import subprocess

-from .transcription_processor import TranscriptionProcessor
+from .processor_registry import ProcessorRegistry
+from .transcription_processor import TranscriptionBaseProcessor
 from common.utils.business_event_context import current_event


-class AudioProcessor(TranscriptionProcessor):
-    def __init__(self, tenant, model_variables, document_version):
-        super().__init__(tenant, model_variables, document_version)
-        self.transcription_client = model_variables['transcription_client']
-        self.transcription_model = model_variables['transcription_model']
+class AudioProcessor(TranscriptionBaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+        self.transcription_model = model_variables.transcription_model
         self.ffmpeg_path = 'ffmpeg'
-        self.max_compression_duration = model_variables['max_compression_duration']
-        self.max_transcription_duration = model_variables['max_transcription_duration']
-        self.compression_cpu_limit = model_variables.get('compression_cpu_limit', 50)  # CPU usage limit in percentage
-        self.compression_process_delay = model_variables.get('compression_process_delay', 0.1)  # Delay between processing chunks in seconds
+        self.max_compression_duration = model_variables.max_compression_duration
+        self.max_transcription_duration = model_variables.max_transcription_duration
+        self.compression_cpu_limit = model_variables.compression_cpu_limit  # CPU usage limit in percentage
+        self.compression_process_delay = model_variables.compression_process_delay  # Delay between processing chunks in seconds
         self.file_type = document_version.file_type

     def _get_transcription(self):
@@ -39,26 +39,25 @@ class AudioProcessor(TranscriptionProcessor):
         return transcription

     def _compress_audio(self, audio_data):
         self._log("Compressing audio")

         with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
             temp_file.write(audio_data)
             temp_file_path = temp_file.name

         try:
             self._log("Creating AudioSegment from file")
             audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
             self._log("Finished creating AudioSegment from file")
             total_duration = len(audio_info)
             self._log(f"Audio duration: {total_duration / 1000} seconds")

+            self._log_tuning("_compress_audio", {
+                "Audio Duration (ms)": total_duration,
+            })
             segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
             total_chunks = (total_duration + segment_length - 1) // segment_length

             compressed_segments = AudioSegment.empty()

             for i in range(total_chunks):
                 self._log(f"Compressing segment {i + 1} of {total_chunks}")
+                self._log_tuning("_compress_audio", {
+                    "Segment Nr": f"{i + 1} of {total_chunks}"
+                })

                 start_time = i * segment_length
                 end_time = min((i + 1) * segment_length, total_duration)
@@ -88,7 +87,9 @@ class AudioProcessor(TranscriptionProcessor):
                 compressed_filename,
                 compressed_buffer.read()
             )
             self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
+            self._log_tuning("_compress_audio", {
+                "Compressed audio to MinIO": compressed_filename
+            })

         return compressed_segments

@@ -131,7 +132,6 @@ class AudioProcessor(TranscriptionProcessor):
         return compressed_segment

     def _transcribe_audio(self, audio_data):
         self._log("Starting audio transcription")
-        # audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
         audio = audio_data

@@ -140,7 +140,6 @@ class AudioProcessor(TranscriptionProcessor):
         total_chunks = len(audio) // segment_length + 1

         for i, chunk in enumerate(audio[::segment_length]):
-            self._log(f'Processing chunk {i + 1} of {total_chunks}')
             segment_duration = 0
             if i == total_chunks - 1:
                 segment_duration = (len(audio) % segment_length) // 1000
@@ -153,37 +152,34 @@ class AudioProcessor(TranscriptionProcessor):

                 try:
                     file_size = os.path.getsize(temp_audio.name)
                     self._log(f"Temporary audio file size: {file_size} bytes")

                     with open(temp_audio.name, 'rb') as audio_file:
                         file_start = audio_file.read(100)
                         self._log(f"First 100 bytes of audio file: {file_start}")
                         audio_file.seek(0)  # Reset file pointer to the beginning

                         self._log("Calling transcription API")
-                        transcription = self.model_variables.transcribe(
+                        transcription = self.model_variables.transcription_model.transcribe(
                             file=audio_file,
                             model=self.transcription_model,
                             language=self.document_version.language,
                             response_format='verbose_json',
-                            duration=segment_duration,
+                            duration=segment_duration
                         )
                         self._log("Transcription API call completed")

                         if transcription:
+                            trans = ""
                             # Handle the transcription result based on its type
                             if isinstance(transcription, str):
                                 self._log(f"Transcription result (string): {transcription[:100]}...")
-                                transcriptions.append(transcription)
+                                trans = transcription
                             elif hasattr(transcription, 'text'):
                                 self._log(
                                     f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
-                                transcriptions.append(transcription.text)
+                                trans = transcription.text
                             else:
                                 self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
-                                transcriptions.append(str(transcription))

+                            transcriptions.append(trans)
+
+                            self._log_tuning("_transcribe_audio", {
+                                "Chunk Nr": f"{i + 1} of {total_chunks}",
+                                "Segment Duration": segment_duration,
+                                "Transcription": trans,
+                            })
                         else:
                             self._log("Warning: Received empty transcription", level='warning')
+                            self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})

                 except Exception as e:
                     self._log(f"Error during transcription: {str(e)}", level='error')
@@ -206,7 +202,10 @@ class AudioProcessor(TranscriptionProcessor):
             transcription_filename,
             full_transcription.encode('utf-8')
         )
         self._log(f"Saved transcription to MinIO: {transcription_filename}")
+        self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")

         return full_transcription


+# Register the processor
+ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)
eveai_workers/processors/base_processor.py

@@ -1,14 +1,42 @@
 from abc import ABC, abstractmethod
+from typing import Dict, Any

+from flask import current_app
 from common.extensions import minio_client
+from config.logging_config import TuningLogger


-class Processor(ABC):
-    def __init__(self, tenant, model_variables, document_version):
+class BaseProcessor(ABC):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
         self.tenant = tenant
         self.model_variables = model_variables
         self.document_version = document_version
-        self.embed_tuning = model_variables['embed_tuning']
+        self.catalog = catalog
+        self.processor = processor
+        self.tuning = processor.tuning if processor else False
+        self.tuning_logger = None
+        self._setup_tuning_logger()
+
+        self._log_tuning("Processor initialized", {
+            "processor_type": processor.type if processor else None,
+            "document_version": document_version.id if document_version else None,
+            "catalog": catalog.id if catalog else None
+        })
+
+    def _setup_tuning_logger(self):
+        try:
+            self.tuning_logger = TuningLogger(
+                'tuning',
+                tenant_id=self.tenant.id if self.tenant else None,
+                catalog_id=self.catalog.id if self.catalog else None,
+                processor_id=self.processor.id if self.processor else None,
+            )
+            # Verify logger is working with a test message
+            if self.tuning:
+                self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
+        except Exception as e:
+            current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
+            raise

     @abstractmethod
     def process(self):
@@ -50,3 +78,11 @@ class Processor(ABC):

         return markdown

+    def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
+        if self.tuning and self.tuning_logger:
+            try:
+                self.tuning_logger.log_tuning('processor', message, data)
+            except Exception as e:
+                current_app.logger.error(f"Processor: Error in tuning logging: {e}")
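To make the new contract concrete, here is a hypothetical BaseProcessor subclass (TextProcessor is an invented name; the constructor signature, _log_tuning usage, and registration pattern mirror the code in this commit):

```python
from eveai_workers.processors.base_processor import BaseProcessor


class TextProcessor(BaseProcessor):
    """Illustrative subclass, not part of this commit."""

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        # Chunking limits now come from the Catalog row rather than model_variables
        self.chunk_size = catalog.max_chunk_size

    def process(self):
        markdown = "# placeholder"
        # A no-op unless tuning is enabled on the Processor row
        self._log_tuning("process", {"chunks": 1})
        return markdown, "title"


# Registration would only succeed for a type present in PROCESSOR_TYPES:
# ProcessorRegistry.register("TEXT_PROCESSOR", TextProcessor)
```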
eveai_workers/processors/html_processor.py

@@ -4,21 +4,34 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from common.extensions import db, minio_client
 from common.utils.model_utils import create_language_template
-from .processor import Processor
+from .base_processor import BaseProcessor
 from common.utils.business_event_context import current_event
+from .processor_registry import ProcessorRegistry
+from common.utils.string_list_converter import StringListConverter as SLC


-class HTMLProcessor(Processor):
-    def __init__(self, tenant, model_variables, document_version):
-        super().__init__(tenant, model_variables, document_version)
-        self.html_tags = model_variables['html_tags']
-        self.html_end_tags = model_variables['html_end_tags']
-        self.html_included_elements = model_variables['html_included_elements']
-        self.html_excluded_elements = model_variables['html_excluded_elements']
-        self.html_excluded_classes = model_variables['html_excluded_classes']
-        self.chunk_size = model_variables['processing_chunk_size']  # Adjust this based on your LLM's optimal input size
-        self.chunk_overlap = model_variables[
-            'processing_chunk_overlap']  # Adjust for context preservation between chunks
+class HTMLProcessor(BaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+        cat_conf = catalog.configuration
+        proc_conf = processor.configuration
+        self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
+        self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
+        self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
+        self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
+        self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
+        self.tuning = self.processor.tuning
+        # Add verification logging
+        self._log(f"HTML Processor initialized with tuning={self.tuning}")
+        if self.tuning:
+            self._log_tuning("HTML Processor initialized", {
+                "html_tags": self.html_tags,
+                "html_end_tags": self.html_end_tags,
+                "included_elements": self.html_included_elements,
+                "excluded_elements": self.html_excluded_elements
+            })
+
+        self.chunk_size = catalog.max_chunk_size

     def process(self):
         self._log("Starting HTML processing")
@@ -62,13 +75,14 @@ class HTMLProcessor(Processor):
         title = soup.find('title').get_text(strip=True) if soup.find('title') else ''

         self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
+        self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
         return extracted_html, title

     def _generate_markdown_from_html(self, html_content):
         self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')

-        llm = self.model_variables['llm']
-        template = self.model_variables['html_parse_template']
+        llm = self.model_variables.get_llm()
+        template = self.model_variables.get_template("html_parse")
         parse_prompt = ChatPromptTemplate.from_template(template)
         setup = RunnablePassthrough()
         output_parser = StrOutputParser()
@@ -79,13 +93,10 @@ class HTMLProcessor(Processor):

         markdown_chunks = []
         for chunk in chunks:
-            if self.embed_tuning:
-                self._log(f'Processing chunk: \n{chunk}\n')
             input_html = {"html": chunk}
             markdown_chunk = chain.invoke(input_html)
             markdown_chunks.append(markdown_chunk)
-            if self.embed_tuning:
-                self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')
+            self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})

         markdown = "\n\n".join(markdown_chunks)
         self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
@@ -146,3 +157,7 @@ class HTMLProcessor(Processor):
     def _extract_element_content(self, element):
         content = ' '.join(child.strip() for child in element.stripped_strings)
         return f'<{element.name}>{content}</{element.name}>\n'


+# Register the processor
+ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)
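The HTML tag lists now arrive as comma-separated strings on the Processor row's configuration and are converted with StringListConverter.string_to_list. Assumed behaviour of that helper, inferred from its use above (the project's actual implementation in common/utils/string_list_converter.py may differ):

```python
# Stand-in for common.utils.string_list_converter.StringListConverter,
# inferred from calls like SLC.string_to_list(proc_conf['html_tags']).
class StringListConverter:
    @staticmethod
    def string_to_list(value):
        if not value:
            return []
        return [item.strip() for item in value.split(',') if item.strip()]


print(StringListConverter.string_to_list("p, h1, h2"))  # ['p', 'h1', 'h2']
```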
eveai_workers/processors/pdf_processor.py

@@ -9,18 +9,18 @@ from langchain_core.runnables import RunnablePassthrough

 from common.extensions import minio_client
 from common.utils.model_utils import create_language_template
-from .processor import Processor
+from .base_processor import BaseProcessor
 from common.utils.business_event_context import current_event
+from .processor_registry import ProcessorRegistry


-class PDFProcessor(Processor):
-    def __init__(self, tenant, model_variables, document_version):
-        super().__init__(tenant, model_variables, document_version)
-        # PDF-specific initialization
-        self.chunk_size = model_variables['processing_chunk_size']
-        self.chunk_overlap = model_variables['processing_chunk_overlap']
-        self.min_chunk_size = model_variables['processing_min_chunk_size']
-        self.max_chunk_size = model_variables['processing_max_chunk_size']
+class PDFProcessor(BaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+
+        self.chunk_size = catalog.max_chunk_size
+        self.chunk_overlap = 0
+        self.tuning = self.processor.tuning

     def process(self):
         self._log("Starting PDF processing")
@@ -38,7 +38,8 @@ class PDFProcessor(Processor):
             with current_event.create_span("Markdown Generation"):
                 llm_chunks = self._split_content_for_llm(structured_content)
                 markdown = self._process_chunks_with_llm(llm_chunks)
-                self._save_markdown(markdown)

+            self._save_markdown(markdown)
             self._log("Finished processing PDF")
             return markdown, title
         except Exception as e:
@@ -56,19 +57,10 @@ class PDFProcessor(Processor):
                 'figures': self._extract_figures(page, page_num, figure_counter),
                 'tables': self._extract_tables(page)
             }
-            if self.embed_tuning:
-                self._log(f'Extracted PDF Content for page {page_num + 1}')
-                self._log(f"{page_content }")
+            self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
             figure_counter += len(page_content['figures'])
             extracted_content.append(page_content)

-        # if self.embed_tuning:
-        #     current_app.embed_tuning_logger.debug(f'Extracted PDF Content')
-        #     current_app.embed_tuning_logger.debug(f'---------------------')
-        #     current_app.embed_tuning_logger.debug(f'Page: {page_content}')
-        #     current_app.embed_tuning_logger.debug(f'End of Extracted PDF Content')
-        #     current_app.embed_tuning_logger.debug(f'----------------------------')
-
         return extracted_content

     def _extract_figures(self, page, page_num, figure_counter):
@@ -127,6 +119,7 @@ class PDFProcessor(Processor):
                     markdown_table = self._table_to_markdown(table)
                     if markdown_table:  # Only add non-empty tables
                         tables.append(markdown_table)
+                        self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
         except Exception as e:
             self._log(f"Error extracting tables from page: {str(e)}", level='error')
         return tables
@@ -202,7 +195,7 @@ class PDFProcessor(Processor):
             for table in page['tables']:
                 structured_content += f"\n{table}\n"

-        if self.embed_tuning:
+        if self.tuning:
             self._save_intermediate(structured_content, "structured_content.md")

         return structured_content, title
@@ -217,8 +210,8 @@ class PDFProcessor(Processor):
         return text_splitter.split_text(content)

     def _process_chunks_with_llm(self, chunks):
-        llm = self.model_variables['llm']
-        template = self.model_variables['pdf_parse_template']
+        llm = self.model_variables.get_llm()
+        template = self.model_variables.get_template('pdf_parse')
         pdf_prompt = ChatPromptTemplate.from_template(template)
         setup = RunnablePassthrough()
         output_parser = StrOutputParser()
@@ -232,3 +225,7 @@ class PDFProcessor(Processor):
             markdown_chunks.append(result)

         return "\n\n".join(markdown_chunks)


+# Register the processor
+ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
eveai_workers/processors/processor_registry.py (new file, 92 lines)
@@ -0,0 +1,92 @@
+from typing import Dict, Type, Optional
+from flask import current_app
+from config.processor_types import PROCESSOR_TYPES
+from .base_processor import BaseProcessor
+
+
+class ProcessorRegistry:
+    """Registry for processor types that aligns with PROCESSOR_TYPES configuration"""
+
+    _registry: Dict[str, Type[BaseProcessor]] = {}
+
+    @classmethod
+    def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
+        """
+        Register a new processor type that must match a type in PROCESSOR_TYPES
+
+        Args:
+            processor_type: Type identifier from PROCESSOR_TYPES
+            processor_class: Processor implementation class
+
+        Raises:
+            ValueError: If processor_type isn't defined in PROCESSOR_TYPES
+        """
+        if processor_type not in PROCESSOR_TYPES:
+            raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
+
+        cls._registry[processor_type] = processor_class
+
+    @classmethod
+    def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
+        """
+        Get the processor class for a given processor type
+
+        Args:
+            processor_type: Type identifier from PROCESSOR_TYPES
+
+        Returns:
+            The registered processor class
+
+        Raises:
+            ValueError: If no processor is registered for the given type
+        """
+        if processor_type not in cls._registry:
+            raise ValueError(f"No processor registered for type: {processor_type}")
+        return cls._registry[processor_type]
+
+    @classmethod
+    def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
+        """
+        Find appropriate processor for a file type by checking PROCESSOR_TYPES definitions
+
+        Args:
+            file_type: File extension (e.g., 'html', 'pdf')
+
+        Returns:
+            Tuple of (processor_type, processor_class)
+
+        Raises:
+            ValueError: If no processor is found for the file type
+        """
+        # First find which processor type handles this file type
+        for proc_type, config in PROCESSOR_TYPES.items():
+            # Check if file_type is in the supported file_types (handling both string and list formats)
+            supported_types = config['file_types']
+            if isinstance(supported_types, str):
+                supported_types = [t.strip() for t in supported_types.split(',')]
+
+            if file_type in supported_types:
+                # Get the registered processor class for this type
+                if proc_type in cls._registry:
+                    return proc_type, cls._registry[proc_type]
+                else:
+                    raise ValueError(
+                        f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
+
+        raise ValueError(f"No processor type found for file type: {file_type}")
+
+    @classmethod
+    def validate_processor_registration(cls):
+        """
+        Validate that all PROCESSOR_TYPES have registered processors
+
+        Raises:
+            ValueError: If any processor type lacks a registered processor
+        """
+        missing_processors = []
+        for proc_type in PROCESSOR_TYPES.keys():
+            if proc_type not in cls._registry:
+                missing_processors.append(proc_type)
+
+        if missing_processors:
+            raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")
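A short usage sketch for the registry (driver code written for illustration; it assumes PROCESSOR_TYPES maps 'pdf' to PDF_PROCESSOR, as the registration calls in this commit suggest):

```python
import eveai_workers.processors  # noqa: F401  (side effect: processors register themselves)
from eveai_workers.processors.processor_registry import ProcessorRegistry

# Resolve the implementation class for a file extension
proc_type, proc_class = ProcessorRegistry.get_processor_for_file_type('pdf')
print(proc_type, proc_class.__name__)  # expected: PDF_PROCESSOR PDFProcessor

# Optional startup guard: raises if any configured type lacks an implementation
ProcessorRegistry.validate_processor_registration()
```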
eveai_workers/processors/srt_processor.py

@@ -1,9 +1,9 @@
 from common.extensions import minio_client
-from .transcription_processor import TranscriptionProcessor
+from .transcription_processor import TranscriptionBaseProcessor
 import re


-class SRTProcessor(TranscriptionProcessor):
+class SRTProcessor(TranscriptionBaseProcessor):
     def _get_transcription(self):
         file_data = minio_client.download_document_file(
             self.tenant.id,
eveai_workers/processors/transcription_processor.py

@@ -5,15 +5,15 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough

 from common.utils.model_utils import create_language_template
-from .processor import Processor
+from .base_processor import BaseProcessor
 from common.utils.business_event_context import current_event


-class TranscriptionProcessor(Processor):
-    def __init__(self, tenant, model_variables, document_version):
-        super().__init__(tenant, model_variables, document_version)
-        self.chunk_size = model_variables['processing_chunk_size']
-        self.chunk_overlap = model_variables['processing_chunk_overlap']
+class TranscriptionBaseProcessor(BaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+        self.annotation_chunk_size = model_variables.annotation_chunk_length
+        self.annotation_chunk_overlap = 0

     def process(self):
         self._log("Starting Transcription processing")
@@ -37,17 +37,17 @@ class TranscriptionProcessor(Processor):

     def _chunk_transcription(self, transcription):
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=self.chunk_size,
-            chunk_overlap=self.chunk_overlap,
+            chunk_size=self.annotation_chunk_size,
+            chunk_overlap=self.annotation_chunk_overlap,
             length_function=len,
             separators=["\n\n", "\n", " ", ""]
         )
         return text_splitter.split_text(transcription)

     def _process_chunks(self, chunks):
         self._log("Generating markdown from transcription")
-        llm = self.model_variables['llm']
-        template = self.model_variables['transcript_template']
+        self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
+        llm = self.model_variables.get_llm()
+        template = self.model_variables.get_template('transcript')
         language_template = create_language_template(template, self.document_version.language)
         transcript_prompt = ChatPromptTemplate.from_template(language_template)
         setup = RunnablePassthrough()
@@ -58,14 +58,18 @@ class TranscriptionProcessor(Processor):
         markdown_chunks = []
         previous_part = ""
         for i, chunk in enumerate(chunks):
-            self._log(f"Processing chunk {i + 1} of {len(chunks)}")
-            self._log(f"Previous part: {previous_part}")
             input_transcript = {
                 'transcript': chunk,
                 'previous_part': previous_part
             }
             markdown = chain.invoke(input_transcript)
             markdown = self._clean_markdown(markdown)
+            self._log_tuning("_process_chunks", {
+                "Chunk Number": f"{i + 1} of {len(chunks)}",
+                "Chunk": chunk,
+                "Previous Chunk": previous_part,
+                "Markdown": markdown,
+            })
             markdown_chunks.append(markdown)

             # Extract the last part for the next iteration
@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
+from sqlalchemy import or_
 from sqlalchemy.exc import SQLAlchemyError

-from common.extensions import db, minio_client
-from common.models.document import DocumentVersion, Embedding, Document
+from common.extensions import db, minio_client, template_manager
+from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
 from common.models.user import Tenant
 from common.utils.celery_utils import current_celery
 from common.utils.database import Database
-from common.utils.model_utils import select_model_variables, create_language_template
 from common.utils.os_utils import safe_remove, sync_folder
-from eveai_workers.Processors.audio_processor import AudioProcessor
-from eveai_workers.Processors.html_processor import HTMLProcessor
-from eveai_workers.Processors.pdf_processor import PDFProcessor
-from eveai_workers.Processors.srt_processor import SRTProcessor
+from common.utils.model_utils import create_language_template, get_model_variables

 from common.utils.business_event import BusinessEvent
 from common.utils.business_event_context import current_event
+from config.processor_types import PROCESSOR_TYPES
+from eveai_workers.processors.processor_registry import ProcessorRegistry


 # Healthcheck task
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
         # Retrieve the Catalog ID
         doc = Document.query.get_or_404(document_version.doc_id)
         catalog_id = doc.catalog_id
+        catalog = Catalog.query.get_or_404(catalog_id)

         # Select variables to work with depending on tenant and model
-        model_variables = select_model_variables(tenant, catalog_id=catalog_id)
-        current_app.logger.debug(f'Model variables: {model_variables}')
+        model_variables = get_model_variables(tenant_id)
+
+        # Define processor related information
+        processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
+        processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)

     except Exception as e:
         current_app.logger.error(f'Create Embeddings request received '
-                                 f'for non existing document version {document_version_id} '
+                                 f'for badly configured document version {document_version_id} '
                                  f'for tenant {tenant_id}, '
                                  f'error: {e}')
         raise
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
     delete_embeddings_for_document_version(document_version)

     try:
-        match document_version.file_type:
-            case 'pdf':
-                process_pdf(tenant, model_variables, document_version)
-            case 'html':
-                process_html(tenant, model_variables, document_version)
-            case 'srt':
-                process_srt(tenant, model_variables, document_version)
-            case 'mp4' | 'mp3' | 'ogg':
-                process_audio(tenant, model_variables, document_version)
-            case _:
-                raise Exception(f'No functionality defined for file type {document_version.file_type} '
-                                f'for tenant {tenant_id} '
-                                f'while creating embeddings for document version {document_version_id}')
+        with current_event.create_span(f"{processor_type} Processing"):
+            document_processor = processor_class(
+                tenant=tenant,
+                model_variables=model_variables,
+                document_version=document_version,
+                catalog=catalog,
+                processor=processor
+            )
+            markdown, title = document_processor.process()
+
+        with current_event.create_span("Embedding"):
+            embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)

         current_event.log("Finished Embedding Creation Task")

     except Exception as e:
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
         raise


-def process_pdf(tenant, model_variables, document_version):
-    with current_event.create_span("PDF Processing"):
-        processor = PDFProcessor(tenant, model_variables, document_version)
-        markdown, title = processor.process()
-
-        # Process markdown and embed
-        with current_event.create_span("Embedding"):
-            embed_markdown(tenant, model_variables, document_version, markdown, title)
-
-
-def process_html(tenant, model_variables, document_version):
-    with current_event.create_span("HTML Processing"):
-        processor = HTMLProcessor(tenant, model_variables, document_version)
-        markdown, title = processor.process()
-
-        # Process markdown and embed
-        with current_event.create_span("Embedding"):
-            embed_markdown(tenant, model_variables, document_version, markdown, title)
-
-
-def process_audio(tenant, model_variables, document_version):
-    with current_event.create_span("Audio Processing"):
-        processor = AudioProcessor(tenant, model_variables, document_version)
-        markdown, title = processor.process()
-
-        # Process markdown and embed
-        with current_event.create_span("Embedding"):
-            embed_markdown(tenant, model_variables, document_version, markdown, title)
-
-
-def process_srt(tenant, model_variables, document_version):
-    with current_event.create_span("SRT Processing"):
-        processor = SRTProcessor(tenant, model_variables, document_version)
-        markdown, title = processor.process()
-
-        # Process markdown and embed
-        with current_event.create_span("Embedding"):
-            embed_markdown(tenant, model_variables, document_version, markdown, title)
-
-
-def embed_markdown(tenant, model_variables, document_version, markdown, title):
+def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
     # Create potential chunks
     potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")

     # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
-                                         model_variables['max_chunk_size'])
+    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)

     # Enrich chunks
     with current_event.create_span("Enrich Chunks"):
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):


 def enrich_chunks(tenant, model_variables, document_version, title, chunks):
-    current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-
     summary = ''
     if len(chunks) > 1:
         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
         enriched_chunk = f'{chunk_total_context}\n{chunk}'
         enriched_chunks.append(enriched_chunk)

-    current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-
     return enriched_chunks


 def summarize_chunk(tenant, model_variables, document_version, chunk):
     current_event.log("Starting Summarizing Chunk")
-    current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-    llm = model_variables['llm']
-    template = model_variables['summary_template']
+    llm = model_variables.get_llm()
+    template = model_variables.get_template("summary")
     language_template = create_language_template(template, document_version.language)
     summary_prompt = ChatPromptTemplate.from_template(language_template)
     setup = RunnablePassthrough()
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
     chain = setup | summary_prompt | llm | output_parser

     try:
-        current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
-                                 f'on document version {document_version.id}')
         summary = chain.invoke({"text": chunk})
-        current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
-                                 f'on document version {document_version.id}.')
         current_event.log("Finished Summarizing Chunk")
         return summary
     except LangChainException as e:
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):


 def embed_chunks(tenant, model_variables, document_version, chunks):
-    current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-    embedding_model = model_variables['embedding_model']
+    embedding_model = model_variables.embedding_model

     try:
         embeddings = embedding_model.embed_documents(chunks)
-        current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
-                                 f'on document version {document_version.id}')
     except LangChainException as e:
         current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
                                  f'on document version {document_version.id} while calling OpenAI API'
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     # Add embeddings to the database
     new_embeddings = []
     for chunk, embedding in zip(chunks, embeddings):
-        new_embedding = model_variables['embedding_db_model']()
+        new_embedding = model_variables.embedding_model_class()
         new_embedding.document_version = document_version
         new_embedding.active = True
         new_embedding.chunk = chunk
         new_embedding.embedding = embedding
         new_embeddings.append(new_embedding)

-    current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
-
     return new_embeddings


-def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
-    if tenant.embed_tuning:
-        current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
-        current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
-        current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
-        current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
-        current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
-        current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
-
-
 def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
     try:
         current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
         md_header_splits = markdown_splitter.split_text(markdown)
         potential_chunks = [doc.page_content for doc in md_header_splits]

-        current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
         return potential_chunks
     except Exception as e:
         current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
         actual_chunks.append(current_chunk)

     return actual_chunks
+
+
+def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
+    """
+    Get the appropriate processor for a document based on catalog_id, file_type and optional sub_file_type.
+
+    Args:
+        catalog_id: ID of the catalog
+        file_type: Type of file (e.g., 'pdf', 'html')
+        sub_file_type: Optional sub-type for specialized processing
+
+    Returns:
+        Processor instance
+
+    Raises:
+        ValueError: If no matching processor is found
+    """
+    try:
+        # Start with base query for catalog
+        query = Processor.query.filter_by(catalog_id=catalog_id)
+
+        # Find processor type that handles this file type
+        matching_processor_type = None
+        for proc_type, config in PROCESSOR_TYPES.items():
+            supported_types = config['file_types']
+            if isinstance(supported_types, str):
+                supported_types = [t.strip() for t in supported_types.split(',')]
+
+            if file_type in supported_types:
+                matching_processor_type = proc_type
+                break
+
+        if not matching_processor_type:
+            raise ValueError(f"No processor type found for file type: {file_type}")
+
+        # Add processor type condition
+        query = query.filter_by(type=matching_processor_type)
+
+        # If sub_file_type is provided, add that condition
+        if sub_file_type:
+            query = query.filter_by(sub_file_type=sub_file_type)
+        else:
+            # If no sub_file_type, prefer processors without sub_file_type specification
+            query = query.filter(or_(Processor.sub_file_type.is_(None),
+                                     Processor.sub_file_type == ''))
+
+        # Get the first matching processor
+        processor = query.first()
+
+        if not processor:
+            if sub_file_type:
+                raise ValueError(
+                    f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
+                    f"file type {file_type}, sub-type {sub_file_type}"
+                )
+            else:
+                raise ValueError(
+                    f"No processor found for catalog {catalog_id}, "
+                    f"file type {file_type}"
+                )
+
+        return processor
+
+    except Exception as e:
+        current_app.logger.error(f"Error finding processor: {str(e)}")
+        raise
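Taken together, the dispatch in create_embeddings reduces to roughly the following (a condensed sketch of the flow in this file, written for illustration; spans, logging, and error handling omitted, and the tenant, document version, and catalog objects are assumed to be loaded already):

```python
# Condensed sketch of the new create_embeddings dispatch (illustrative only).
def run_processor_sketch(tenant, document_version, catalog):
    model_variables = get_model_variables(tenant.id)  # replaces select_model_variables

    # The class comes from the in-memory registry; the row comes from the database
    processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(
        document_version.file_type)
    processor = get_processor_for_document(
        catalog.id, document_version.file_type, document_version.sub_file_type)

    document_processor = processor_class(
        tenant=tenant,
        model_variables=model_variables,
        document_version=document_version,
        catalog=catalog,
        processor=processor,
    )
    markdown, title = document_processor.process()
    embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
```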