- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Initial adaptation of the chat client
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -4,10 +4,12 @@ from flask import Flask
import os
from common.utils.celery_utils import make_celery, init_celery
from common.extensions import db, minio_client
from config.logging_config import LOGGING
from common.extensions import db, minio_client, template_manager, cache_manager
import config.logging_config as logging_config
from config.config import get_config
from . import processors
def create_app(config_file=None):
app = Flask(__name__)
@@ -22,8 +24,7 @@ def create_app(config_file=None):
case _:
app.config.from_object(get_config('dev'))
logging.config.dictConfig(LOGGING)
app.embed_tuning_logger = logging.getLogger('embed_tuning')
logging.config.dictConfig(logging_config.LOGGING)
register_extensions(app)
@@ -41,6 +42,8 @@ def create_app(config_file=None):
def register_extensions(app):
db.init_app(app)
minio_client.init_app(app)
cache_manager.init_app(app)
template_manager.init_app(app)
app, celery = create_app()
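Note: cache_manager and template_manager appear to follow the standard Flask extension pattern, i.e. module-level singletons in common.extensions whose init_app binds them to the application. Their implementation is not part of this diff; a sketch of the assumed pattern:

# sketch of the init_app extension pattern assumed for cache_manager / template_manager
class TemplateManager:
    def __init__(self, app=None):
        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        # pull settings from app.config and register this instance on the app
        app.extensions['template_manager'] = self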

View File

@@ -0,0 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
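Registration here is an import side effect: each module calls ProcessorRegistry.register(...) at the bottom of the file, so importing this package is what populates the registry. Adding a processor under that convention might look like the following sketch (module, class, and type key are hypothetical, and the key would also need an entry in PROCESSOR_TYPES):

# hypothetical new module, e.g. docx_processor.py
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry

class DocxProcessor(BaseProcessor):
    def process(self):
        return "", ""  # (markdown, title)

# runs at import time; raises ValueError if "DOCX_PROCESSOR" is not in PROCESSOR_TYPES
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

The new module would then be added to the import and __all__ lists above so that importing the package triggers the registration.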

View File

@@ -8,20 +8,20 @@ import tempfile
from common.extensions import minio_client
import subprocess
from .transcription_processor import TranscriptionProcessor
from .processor_registry import ProcessorRegistry
from .transcription_processor import TranscriptionBaseProcessor
from common.utils.business_event_context import current_event
class AudioProcessor(TranscriptionProcessor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.transcription_client = model_variables['transcription_client']
self.transcription_model = model_variables['transcription_model']
class AudioProcessor(TranscriptionBaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.transcription_model = model_variables.transcription_model
self.ffmpeg_path = 'ffmpeg'
self.max_compression_duration = model_variables['max_compression_duration']
self.max_transcription_duration = model_variables['max_transcription_duration']
self.compression_cpu_limit = model_variables.get('compression_cpu_limit', 50) # CPU usage limit in percentage
self.compression_process_delay = model_variables.get('compression_process_delay', 0.1) # Delay between processing chunks in seconds
self.max_compression_duration = model_variables.max_compression_duration
self.max_transcription_duration = model_variables.max_transcription_duration
self.compression_cpu_limit = model_variables.compression_cpu_limit # CPU usage limit in percentage
self.compression_process_delay = model_variables.compression_process_delay # Delay between processing chunks in seconds
self.file_type = document_version.file_type
def _get_transcription(self):
@@ -39,26 +39,25 @@ class AudioProcessor(TranscriptionProcessor):
return transcription
def _compress_audio(self, audio_data):
self._log("Compressing audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
temp_file.write(audio_data)
temp_file_path = temp_file.name
try:
self._log("Creating AudioSegment from file")
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
self._log("Finished creating AudioSegment from file")
total_duration = len(audio_info)
self._log(f"Audio duration: {total_duration / 1000} seconds")
self._log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
total_chunks = (total_duration + segment_length - 1) // segment_length
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log(f"Compressing segment {i + 1} of {total_chunks}")
self._log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
start_time = i * segment_length
end_time = min((i + 1) * segment_length, total_duration)
@@ -88,7 +87,9 @@ class AudioProcessor(TranscriptionProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
self._log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
return compressed_segments
@@ -131,7 +132,6 @@ class AudioProcessor(TranscriptionProcessor):
return compressed_segment
def _transcribe_audio(self, audio_data):
self._log("Starting audio transcription")
# audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio = audio_data
@@ -140,7 +140,6 @@ class AudioProcessor(TranscriptionProcessor):
total_chunks = len(audio) // segment_length + 1
for i, chunk in enumerate(audio[::segment_length]):
self._log(f'Processing chunk {i + 1} of {total_chunks}')
segment_duration = 0
if i == total_chunks - 1:
segment_duration = (len(audio) % segment_length) // 1000
@@ -153,37 +152,34 @@ class AudioProcessor(TranscriptionProcessor):
try:
file_size = os.path.getsize(temp_audio.name)
self._log(f"Temporary audio file size: {file_size} bytes")
with open(temp_audio.name, 'rb') as audio_file:
file_start = audio_file.read(100)
self._log(f"First 100 bytes of audio file: {file_start}")
audio_file.seek(0) # Reset file pointer to the beginning
self._log("Calling transcription API")
transcription = self.model_variables.transcribe(
transcription = self.model_variables.transcription_model.transcribe(
file=audio_file,
model=self.transcription_model,
language=self.document_version.language,
response_format='verbose_json',
duration=segment_duration,
duration=segment_duration
)
self._log("Transcription API call completed")
if transcription:
trans = ""
# Handle the transcription result based on its type
if isinstance(transcription, str):
self._log(f"Transcription result (string): {transcription[:100]}...")
transcriptions.append(transcription)
trans = transcription
elif hasattr(transcription, 'text'):
self._log(
f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
transcriptions.append(transcription.text)
trans = transcription.text
else:
self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
transcriptions.append(str(transcription))
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -206,7 +202,10 @@ class AudioProcessor(TranscriptionProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log(f"Saved transcription to MinIO: {transcription_filename}")
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription
# Register the processor
ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)

View File

@@ -1,14 +1,42 @@
from abc import ABC, abstractmethod
from typing import Dict, Any
from flask import current_app
from common.extensions import minio_client
from config.logging_config import TuningLogger
class Processor(ABC):
def __init__(self, tenant, model_variables, document_version):
class BaseProcessor(ABC):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
self.tenant = tenant
self.model_variables = model_variables
self.document_version = document_version
self.embed_tuning = model_variables['embed_tuning']
self.catalog = catalog
self.processor = processor
self.tuning = processor.tuning if processor else False
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
})
def _setup_tuning_logger(self):
try:
self.tuning_logger = TuningLogger(
'tuning',
tenant_id=self.tenant.id if self.tenant else None,
catalog_id=self.catalog.id if self.catalog else None,
processor_id=self.processor.id if self.processor else None,
)
# Verify logger is working with a test message
if self.tuning:
self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
except Exception as e:
current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
raise
@abstractmethod
def process(self):
@@ -50,3 +78,11 @@ class Processor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)
except Exception as e:
current_app.logger.error(f"Processor: Error in tuning logging: {e}")

View File

@@ -4,21 +4,34 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
from common.utils.string_list_converter import StringListConverter as SLC
class HTMLProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.html_tags = model_variables['html_tags']
self.html_end_tags = model_variables['html_end_tags']
self.html_included_elements = model_variables['html_included_elements']
self.html_excluded_elements = model_variables['html_excluded_elements']
self.html_excluded_classes = model_variables['html_excluded_classes']
self.chunk_size = model_variables['processing_chunk_size'] # Adjust this based on your LLM's optimal input size
self.chunk_overlap = model_variables[
'processing_chunk_overlap'] # Adjust for context preservation between chunks
class HTMLProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
cat_conf = catalog.configuration
proc_conf = processor.configuration
self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
self.tuning = self.processor.tuning
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
"excluded_elements": self.html_excluded_elements
})
self.chunk_size = catalog.max_chunk_size
def process(self):
self._log("Starting HTML processing")
@@ -62,13 +75,14 @@ class HTMLProcessor(Processor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
llm = self.model_variables['llm']
template = self.model_variables['html_parse_template']
llm = self.model_variables.get_llm()
template = self.model_variables.get_template("html_parse")
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
@@ -79,13 +93,10 @@ class HTMLProcessor(Processor):
markdown_chunks = []
for chunk in chunks:
if self.embed_tuning:
self._log(f'Processing chunk: \n{chunk}\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if self.embed_tuning:
self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
@@ -146,3 +157,7 @@ class HTMLProcessor(Processor):
def _extract_element_content(self, element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
# Register the processor
ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)

View File

@@ -9,18 +9,18 @@ from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
class PDFProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
# PDF-specific initialization
self.chunk_size = model_variables['processing_chunk_size']
self.chunk_overlap = model_variables['processing_chunk_overlap']
self.min_chunk_size = model_variables['processing_min_chunk_size']
self.max_chunk_size = model_variables['processing_max_chunk_size']
class PDFProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
def process(self):
self._log("Starting PDF processing")
@@ -38,7 +38,8 @@ class PDFProcessor(Processor):
with current_event.create_span("Markdown Generation"):
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
self._save_markdown(markdown)
self._save_markdown(markdown)
self._log("Finished processing PDF")
return markdown, title
except Exception as e:
@@ -56,19 +57,10 @@ class PDFProcessor(Processor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
if self.embed_tuning:
self._log(f'Extracted PDF Content for page {page_num + 1}')
self._log(f"{page_content }")
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
# if self.embed_tuning:
# current_app.embed_tuning_logger.debug(f'Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'---------------------')
# current_app.embed_tuning_logger.debug(f'Page: {page_content}')
# current_app.embed_tuning_logger.debug(f'End of Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'----------------------------')
return extracted_content
def _extract_figures(self, page, page_num, figure_counter):
@@ -127,6 +119,7 @@ class PDFProcessor(Processor):
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables
@@ -202,7 +195,7 @@ class PDFProcessor(Processor):
for table in page['tables']:
structured_content += f"\n{table}\n"
if self.embed_tuning:
if self.tuning:
self._save_intermediate(structured_content, "structured_content.md")
return structured_content, title
@@ -217,8 +210,8 @@ class PDFProcessor(Processor):
return text_splitter.split_text(content)
def _process_chunks_with_llm(self, chunks):
llm = self.model_variables['llm']
template = self.model_variables['pdf_parse_template']
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('pdf_parse')
pdf_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
@@ -232,3 +225,7 @@ class PDFProcessor(Processor):
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)

View File

@@ -0,0 +1,92 @@
from typing import Dict, Type, Optional
from flask import current_app
from config.processor_types import PROCESSOR_TYPES
from .base_processor import BaseProcessor
class ProcessorRegistry:
"""Registry for processor types that aligns with PROCESSOR_TYPES configuration"""
_registry: Dict[str, Type[BaseProcessor]] = {}
@classmethod
def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
"""
Register a new processor type that must match a type in PROCESSOR_TYPES
Args:
processor_type: Type identifier from PROCESSOR_TYPES
processor_class: Processor implementation class
Raises:
ValueError: If processor_type isn't defined in PROCESSOR_TYPES
"""
if processor_type not in PROCESSOR_TYPES:
raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
cls._registry[processor_type] = processor_class
@classmethod
def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
"""
Get the processor class for a given processor type
Args:
processor_type: Type identifier from PROCESSOR_TYPES
Returns:
The registered processor class
Raises:
ValueError: If no processor is registered for the given type
"""
if processor_type not in cls._registry:
raise ValueError(f"No processor registered for type: {processor_type}")
return cls._registry[processor_type]
@classmethod
def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
"""
Find appropriate processor for a file type by checking PROCESSOR_TYPES definitions
Args:
file_type: File extension (e.g., 'html', 'pdf')
Returns:
Tuple of (processor_type, processor_class)
Raises:
ValueError: If no processor is found for the file type
"""
# First find which processor type handles this file type
for proc_type, config in PROCESSOR_TYPES.items():
# Check if file_type is in the supported file_types (handling both string and list formats)
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
# Get the registered processor class for this type
if proc_type in cls._registry:
return proc_type, cls._registry[proc_type]
else:
raise ValueError(
f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
raise ValueError(f"No processor type found for file type: {file_type}")
@classmethod
def validate_processor_registration(cls):
"""
Validate that all PROCESSOR_TYPES have registered processors
Raises:
ValueError: If any processor type lacks a registered processor
"""
missing_processors = []
for proc_type in PROCESSOR_TYPES.keys():
if proc_type not in cls._registry:
missing_processors.append(proc_type)
if missing_processors:
raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")

View File

@@ -1,9 +1,9 @@
from common.extensions import minio_client
from .transcription_processor import TranscriptionProcessor
from .transcription_processor import TranscriptionBaseProcessor
import re
class SRTProcessor(TranscriptionProcessor):
class SRTProcessor(TranscriptionBaseProcessor):
def _get_transcription(self):
file_data = minio_client.download_document_file(
self.tenant.id,

View File

@@ -5,15 +5,15 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.utils.model_utils import create_language_template
from .processor import Processor
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
class TranscriptionProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.chunk_size = model_variables['processing_chunk_size']
self.chunk_overlap = model_variables['processing_chunk_overlap']
class TranscriptionBaseProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.annotation_chunk_size = model_variables.annotation_chunk_length
self.annotation_chunk_overlap = 0
def process(self):
self._log("Starting Transcription processing")
@@ -37,17 +37,17 @@ class TranscriptionProcessor(Processor):
def _chunk_transcription(self, transcription):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunk_size=self.annotation_chunk_size,
chunk_overlap=self.annotation_chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log("Generating markdown from transcription")
llm = self.model_variables['llm']
template = self.model_variables['transcript_template']
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -58,14 +58,18 @@ class TranscriptionProcessor(Processor):
markdown_chunks = []
previous_part = ""
for i, chunk in enumerate(chunks):
self._log(f"Processing chunk {i + 1} of {len(chunks)}")
self._log(f"Previous part: {previous_part}")
input_transcript = {
'transcript': chunk,
'previous_part': previous_part
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,
"Markdown": markdown,
})
markdown_chunks.append(markdown)
# Extract the last part for the next iteration
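For reference, the splitter configured in _chunk_transcription, restated standalone (the chunk-size value is assumed, and the import path varies across LangChain versions):

from langchain_text_splitters import RecursiveCharacterTextSplitter  # older versions: langchain.text_splitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,   # annotation_chunk_length from ModelVariables; value assumed
    chunk_overlap=0,   # overlap is now fixed at 0
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_text("...full transcription text...")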

View File

@@ -10,22 +10,20 @@ from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy import or_
from sqlalchemy.exc import SQLAlchemyError
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding, Document
from common.extensions import db, minio_client, template_manager
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
from common.models.user import Tenant
from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
from common.utils.model_utils import create_language_template, get_model_variables
from common.utils.business_event import BusinessEvent
from common.utils.business_event_context import current_event
from config.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
# Healthcheck task
@@ -53,14 +51,18 @@ def create_embeddings(tenant_id, document_version_id):
# Retrieve the Catalog ID
doc = Document.query.get_or_404(document_version.doc_id)
catalog_id = doc.catalog_id
catalog = Catalog.query.get_or_404(catalog_id)
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant, catalog_id=catalog_id)
current_app.logger.debug(f'Model variables: {model_variables}')
model_variables = get_model_variables(tenant_id)
# Define processor related information
processor_type, processor_class = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
processor = get_processor_for_document(catalog_id, document_version.file_type, document_version.sub_file_type)
except Exception as e:
current_app.logger.error(f'Create Embeddings request received '
f'for non existing document version {document_version_id} '
f'for badly configured document version {document_version_id} '
f'for tenant {tenant_id}, '
f'error: {e}')
raise
@@ -90,19 +92,19 @@ def create_embeddings(tenant_id, document_version_id):
delete_embeddings_for_document_version(document_version)
try:
match document_version.file_type:
case 'pdf':
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case 'srt':
process_srt(tenant, model_variables, document_version)
case 'mp4' | 'mp3' | 'ogg':
process_audio(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
f'while creating embeddings for document version {document_version_id}')
with current_event.create_span(f"{processor_type} Processing"):
document_processor = processor_class(
tenant=tenant,
model_variables=model_variables,
document_version=document_version,
catalog=catalog,
processor=processor
)
markdown, title = document_processor.process()
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
current_event.log("Finished Embedding Creation Task")
except Exception as e:
@@ -129,53 +131,12 @@ def delete_embeddings_for_document_version(document_version):
raise
def process_pdf(tenant, model_variables, document_version):
with current_event.create_span("PDF Processing"):
processor = PDFProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_html(tenant, model_variables, document_version):
with current_event.create_span("HTML Processing"):
processor = HTMLProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_audio(tenant, model_variables, document_version):
with current_event.create_span("Audio Processing"):
processor = AudioProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_srt(tenant, model_variables, document_version):
with current_event.create_span("SRT Processing"):
processor = SRTProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, markdown, title)
def embed_markdown(tenant, model_variables, document_version, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
@@ -203,9 +164,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = ''
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -233,18 +191,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
enriched_chunk = f'{chunk_total_context}\n{chunk}'
enriched_chunks.append(enriched_chunk)
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
return enriched_chunks
def summarize_chunk(tenant, model_variables, document_version, chunk):
current_event.log("Starting Summarizing Chunk")
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['summary_template']
llm = model_variables.get_llm()
template = model_variables.get_template("summary")
language_template = create_language_template(template, document_version.language)
summary_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
@@ -253,11 +206,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
chain = setup | summary_prompt | llm | output_parser
try:
current_app.logger.debug(f'Starting summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
summary = chain.invoke({"text": chunk})
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}.')
current_event.log("Finished Summarizing Chunk")
return summary
except LangChainException as e:
@@ -268,14 +217,10 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']
embedding_model = model_variables.embedding_model
try:
embeddings = embedding_model.embed_documents(chunks)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
except LangChainException as e:
current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
f'on document version {document_version.id} while calling OpenAI API'
@@ -285,28 +230,16 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
# Add embeddings to the database
new_embeddings = []
for chunk, embedding in zip(chunks, embeddings):
new_embedding = model_variables['embedding_db_model']()
new_embedding = model_variables.embedding_model_class()
new_embedding.document_version = document_version
new_embedding.active = True
new_embedding.chunk = chunk
new_embedding.embedding = embedding
new_embeddings.append(new_embedding)
current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
return new_embeddings
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
@@ -328,7 +261,6 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
return potential_chunks
except Exception as e:
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
@@ -361,3 +293,69 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk)
return actual_chunks
def get_processor_for_document(catalog_id: int, file_type: str, sub_file_type: str = None) -> Processor:
"""
Get the appropriate processor for a document based on catalog_id, file_type and optional sub_file_type.
Args:
catalog_id: ID of the catalog
file_type: Type of file (e.g., 'pdf', 'html')
sub_file_type: Optional sub-type for specialized processing
Returns:
Processor instance
Raises:
ValueError: If no matching processor is found
"""
try:
# Start with base query for catalog
query = Processor.query.filter_by(catalog_id=catalog_id)
# Find processor type that handles this file type
matching_processor_type = None
for proc_type, config in PROCESSOR_TYPES.items():
supported_types = config['file_types']
if isinstance(supported_types, str):
supported_types = [t.strip() for t in supported_types.split(',')]
if file_type in supported_types:
matching_processor_type = proc_type
break
if not matching_processor_type:
raise ValueError(f"No processor type found for file type: {file_type}")
# Add processor type condition
query = query.filter_by(type=matching_processor_type)
# If sub_file_type is provided, add that condition
if sub_file_type:
query = query.filter_by(sub_file_type=sub_file_type)
else:
# If no sub_file_type, prefer processors without sub_file_type specification
query = query.filter(or_(Processor.sub_file_type.is_(None),
Processor.sub_file_type == ''))
# Get the first matching processor
processor = query.first()
if not processor:
if sub_file_type:
raise ValueError(
f"No processor found for catalog {catalog_id} of type {matching_processor_type}, "
f"file type {file_type}, sub-type {sub_file_type}"
)
else:
raise ValueError(
f"No processor found for catalog {catalog_id}, "
f"file type {file_type}"
)
return processor
except Exception as e:
current_app.logger.error(f"Error finding processor: {str(e)}")
raise
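End to end, the dispatch that replaces the old match statement reduces to: resolve the processor row, resolve the registered class, instantiate, process, embed. Condensed (objects and IDs stand in for the real ORM instances):

# condensed flow from create_embeddings; variables are illustrative
processor = get_processor_for_document(catalog.id, document_version.file_type,
                                       document_version.sub_file_type)
proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type(document_version.file_type)
document_processor = proc_cls(tenant=tenant, model_variables=model_variables,
                              document_version=document_version, catalog=catalog,
                              processor=processor)
markdown, title = document_processor.process()
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)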