- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -0,0 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
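
Each processor module registers itself with ProcessorRegistry at import time, so importing this package is what populates the registry. A minimal caller-side sketch; the top-level package name "processors" is an assumption, not confirmed by the diff:

    # Hypothetical caller (assumed package name "processors"): importing the
    # package runs each module's ProcessorRegistry.register(...) call.
    import processors
    from processors.processor_registry import ProcessorRegistry

    proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type("pdf")
    print(proc_type)  # -> "PDF_PROCESSOR"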

View File

@@ -0,0 +1,211 @@
import io
import os
import time
import psutil
from pydub import AudioSegment
import tempfile
from common.extensions import minio_client
import subprocess
from .processor_registry import ProcessorRegistry
from .transcription_processor import TranscriptionBaseProcessor
from common.utils.business_event_context import current_event


class AudioProcessor(TranscriptionBaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.transcription_model = model_variables.transcription_model
        self.ffmpeg_path = 'ffmpeg'
        self.max_compression_duration = model_variables.max_compression_duration
        self.max_transcription_duration = model_variables.max_transcription_duration
        self.compression_cpu_limit = model_variables.compression_cpu_limit  # CPU usage limit in percentage
        self.compression_process_delay = model_variables.compression_process_delay  # Delay between processing chunks in seconds
        self.file_type = document_version.file_type

    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        with current_event.create_span("Audio Compression"):
            compressed_audio = self._compress_audio(file_data)
        with current_event.create_span("Audio Transcription"):
            transcription = self._transcribe_audio(compressed_audio)
        return transcription

    def _compress_audio(self, audio_data):
        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
            temp_file.write(audio_data)
            temp_file_path = temp_file.name
        try:
            audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
            total_duration = len(audio_info)
            self._log_tuning("_compress_audio", {
                "Audio Duration (ms)": total_duration,
            })
            segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
            total_chunks = (total_duration + segment_length - 1) // segment_length
            compressed_segments = AudioSegment.empty()
            for i in range(total_chunks):
                self._log_tuning("_compress_audio", {
                    "Segment Nr": f"{i + 1} of {total_chunks}"
                })
                start_time = i * segment_length
                end_time = min((i + 1) * segment_length, total_duration)
                chunk = AudioSegment.from_file(
                    temp_file_path,
                    format=self.document_version.file_type,
                    start_second=start_time / 1000,
                    duration=(end_time - start_time) / 1000
                )
                compressed_chunk = self._compress_segment(chunk)
                compressed_segments += compressed_chunk
                time.sleep(self.compression_process_delay)
            # Save compressed audio to MinIO
            compressed_filename = f"{self.document_version.id}_compressed.mp3"
            with io.BytesIO() as compressed_buffer:
                compressed_segments.export(compressed_buffer, format="mp3")
                compressed_buffer.seek(0)
                minio_client.upload_document_file(
                    self.tenant.id,
                    self.document_version.doc_id,
                    self.document_version.language,
                    self.document_version.id,
                    compressed_filename,
                    compressed_buffer.read()
                )
            self._log_tuning("_compress_audio", {
                "Compressed audio to MinIO": compressed_filename
            })
            return compressed_segments
        except Exception as e:
            self._log(f"Error during audio processing: {str(e)}", level='error')
            raise
        finally:
            os.unlink(temp_file_path)  # Ensure the temporary file is deleted

    def _compress_segment(self, audio_segment):
        with io.BytesIO() as segment_buffer:
            audio_segment.export(segment_buffer, format="wav")
            segment_buffer.seek(0)
            with io.BytesIO() as output_buffer:
                command = [
                    'nice', '-n', '19',
                    'ffmpeg',
                    '-i', 'pipe:0',
                    '-ar', '16000',
                    '-ac', '1',
                    '-b:a', '32k',
                    '-filter:a', 'loudnorm',
                    '-f', 'mp3',
                    'pipe:1'
                ]
                process = psutil.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = process.communicate(input=segment_buffer.read())
                if process.returncode != 0:
                    self._log(f"FFmpeg error: {stderr.decode()}", level='error')
                    raise Exception("FFmpeg compression failed")
                output_buffer.write(stdout)
                output_buffer.seek(0)
                compressed_segment = AudioSegment.from_mp3(output_buffer)
                return compressed_segment

    def _transcribe_audio(self, audio_data):
        # audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
        audio = audio_data
        segment_length = self.max_transcription_duration * 1000  # calculate milliseconds
        transcriptions = []
        total_chunks = (len(audio) + segment_length - 1) // segment_length  # ceiling division, as in _compress_audio
        for i, chunk in enumerate(audio[::segment_length]):
            if i == total_chunks - 1:
                # Last chunk: only the remaining audio (avoids a zero duration when
                # the total length is an exact multiple of segment_length)
                segment_duration = (len(audio) - i * segment_length) // 1000
            else:
                segment_duration = self.max_transcription_duration
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
                chunk.export(temp_audio.name, format="mp3")
                temp_audio.flush()
            try:
                file_size = os.path.getsize(temp_audio.name)
                with open(temp_audio.name, 'rb') as audio_file:
                    transcription = self.model_variables.transcription_model.transcribe(
                        file=audio_file,
                        language=self.document_version.language,
                        response_format='verbose_json',
                        duration=segment_duration
                    )
                if transcription:
                    # Normalize the transcription result based on its type
                    if isinstance(transcription, str):
                        trans = transcription
                    elif hasattr(transcription, 'text'):
                        trans = transcription.text
                    else:
                        trans = str(transcription)
                    transcriptions.append(trans)
                    self._log_tuning("_transcribe_audio", {
                        "Chunk Nr": f"{i + 1} of {total_chunks}",
                        "Segment Duration": segment_duration,
                        "Transcription": trans,
                    })
                else:
                    self._log("Warning: Received empty transcription", level='warning')
                    self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
            except Exception as e:
                self._log(f"Error during transcription: {str(e)}", level='error')
            finally:
                os.unlink(temp_audio.name)
        full_transcription = " ".join(filter(None, transcriptions))
        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."
        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
        return full_transcription


# Register the processor
ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)
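
_transcribe_audio relies on pydub's slice-with-step idiom, where audio[::segment_length] yields consecutive chunks of at most segment_length milliseconds. A small self-contained illustration of that idiom on a synthetic tone (values made up):

    from pydub.generators import Sine

    audio = Sine(440).to_audio_segment(duration=25_000)  # 25 s test tone
    segment_length = 10_000  # 10 s, in milliseconds

    # Slicing with a step yields consecutive chunks, not a stride over samples.
    chunks = list(audio[::segment_length])
    print([len(c) for c in chunks])  # -> [10000, 10000, 5000]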

View File

@@ -0,0 +1,88 @@
from abc import ABC, abstractmethod
from typing import Dict, Any
from flask import current_app
from common.extensions import minio_client
from config.logging_config import TuningLogger


class BaseProcessor(ABC):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        self.tenant = tenant
        self.model_variables = model_variables
        self.document_version = document_version
        self.catalog = catalog
        self.processor = processor
        self.tuning = processor.tuning if processor else False
        self.tuning_logger = None
        self._setup_tuning_logger()
        self._log_tuning("Processor initialized", {
            "processor_type": processor.type if processor else None,
            "document_version": document_version.id if document_version else None,
            "catalog": catalog.id if catalog else None
        })

    def _setup_tuning_logger(self):
        try:
            self.tuning_logger = TuningLogger(
                'tuning',
                tenant_id=self.tenant.id if self.tenant else None,
                catalog_id=self.catalog.id if self.catalog else None,
                processor_id=self.processor.id if self.processor else None,
            )
            # Verify logger is working with a test message
            if self.tuning:
                self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
        except Exception as e:
            current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
            raise

    @abstractmethod
    def process(self):
        pass

    def _save_markdown(self, markdown):
        markdown_filename = f"{self.document_version.id}.md"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            markdown_filename,
            markdown.encode('utf-8')
        )

    def _log(self, message, level='debug'):
        logger = current_app.logger
        log_method = getattr(logger, level)
        log_method(
            f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")

    def _save_intermediate(self, content, filename):
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            filename,
            content.encode('utf-8')
        )

    def _clean_markdown(self, markdown):
        markdown = markdown.strip()
        if markdown.startswith("```markdown"):
            markdown = markdown[len("```markdown"):].strip()
        if markdown.endswith("```"):
            markdown = markdown[:-3].strip()
        return markdown

    def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
        if self.tuning and self.tuning_logger:
            try:
                self.tuning_logger.log_tuning('processor', message, data)
            except Exception as e:
                current_app.logger.error(f"Processor: Error in tuning logging: {e}")
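
For reference, a minimal concrete subclass showing the contract BaseProcessor imposes: implement process(), use the inherited helpers for persistence. TextProcessor is illustrative only and not part of this commit:

    # Hypothetical subclass, assuming BaseProcessor is importable from the module above.
    from common.extensions import minio_client

    class TextProcessor(BaseProcessor):
        def process(self):
            # Download the raw file, strip any stray markdown fences, and
            # persist the result via the helpers inherited from BaseProcessor.
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            markdown = self._clean_markdown(file_data.decode('utf-8'))
            self._save_markdown(markdown)
            return markdown, "Untitled Document"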

View File

@@ -0,0 +1,163 @@
from bs4 import BeautifulSoup
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
from common.utils.string_list_converter import StringListConverter as SLC


class HTMLProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        cat_conf = catalog.configuration
        proc_conf = processor.configuration
        self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
        self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
        self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
        self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
        self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
        self.tuning = self.processor.tuning
        # Add verification logging
        self._log(f"HTML Processor initialized with tuning={self.tuning}")
        if self.tuning:
            self._log_tuning("HTML Processor initialized", {
                "html_tags": self.html_tags,
                "html_end_tags": self.html_end_tags,
                "included_elements": self.html_included_elements,
                "excluded_elements": self.html_excluded_elements
            })
        self.chunk_size = catalog.max_chunk_size

    def process(self):
        self._log("Starting HTML processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            html_content = file_data.decode('utf-8')
            with current_event.create_span("HTML Content Extraction"):
                extracted_html, title = self._parse_html(html_content)
            with current_event.create_span("Markdown Generation"):
                markdown = self._generate_markdown_from_html(extracted_html)
            self._save_markdown(markdown)
            self._log("Finished processing HTML")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing HTML: {str(e)}", level='error')
            raise

    def _parse_html(self, html_content):
        self._log(f'Parsing HTML for tenant {self.tenant.id}')
        soup = BeautifulSoup(html_content, 'html.parser')
        extracted_html = ''
        excluded_classes = self._parse_excluded_classes(self.html_excluded_classes)
        if self.html_included_elements:
            elements_to_parse = soup.find_all(self.html_included_elements)
        else:
            elements_to_parse = [soup]
        for element in elements_to_parse:
            for sub_element in element.find_all(self.html_tags):
                if self._should_exclude_element(sub_element, excluded_classes):
                    continue
                extracted_html += self._extract_element_content(sub_element)
        title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
        self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
        self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
        return extracted_html, title

    def _generate_markdown_from_html(self, html_content):
        self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template("html_parse")
        parse_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | parse_prompt | llm | output_parser
        soup = BeautifulSoup(html_content, 'lxml')
        chunks = self._split_content(soup, self.chunk_size)
        markdown_chunks = []
        for chunk in chunks:
            input_html = {"html": chunk}
            markdown_chunk = chain.invoke(input_html)
            markdown_chunks.append(markdown_chunk)
            self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
        markdown = "\n\n".join(markdown_chunks)
        self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
        return markdown

    def _split_content(self, soup, max_size=20000):
        chunks = []
        current_chunk = []
        current_size = 0
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
            element_html = str(element)
            element_size = len(element_html)
            if current_size + element_size > max_size and current_chunk:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0
            current_chunk.append(element)
            current_size += element_size
            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0
        if current_chunk:
            chunks.append(''.join(map(str, current_chunk)))
        return chunks

    def _parse_excluded_classes(self, excluded_classes):
        parsed = {}
        if excluded_classes:
            for rule in excluded_classes:
                element, cls = rule.split('.', 1)
                parsed.setdefault(element, set()).add(cls)
        return parsed

    def _should_exclude_element(self, element, excluded_classes):
        if self.html_excluded_elements and element.find_parent(self.html_excluded_elements):
            return True
        return self._is_element_excluded_by_class(element, excluded_classes)

    def _is_element_excluded_by_class(self, element, excluded_classes):
        for parent in element.parents:
            if self._element_matches_exclusion(parent, excluded_classes):
                return True
        return self._element_matches_exclusion(element, excluded_classes)

    def _element_matches_exclusion(self, element, excluded_classes):
        if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
            return True
        return element.name in excluded_classes and \
            any(cls in excluded_classes[element.name] for cls in element.get('class', []))

    def _extract_element_content(self, element):
        content = ' '.join(child.strip() for child in element.stripped_strings)
        return f'<{element.name}>{content}</{element.name}>\n'


# Register the processor
ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)
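
The exclusion rules consumed by _parse_excluded_classes are "element.class" strings, with "*" acting as an element wildcard. A standalone illustration of the same parsing on made-up rule values:

    rules = ["div.advertisement", "span.tooltip", "*.no-index"]

    parsed = {}
    for rule in rules:
        element, cls = rule.split('.', 1)  # same parsing as _parse_excluded_classes
        parsed.setdefault(element, set()).add(cls)

    print(parsed)
    # {'div': {'advertisement'}, 'span': {'tooltip'}, '*': {'no-index'}}
    # A '*' rule excludes the class on any element; the others only on the named tag.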

View File

@@ -0,0 +1,231 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry


class PDFProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

    def process(self):
        self._log("Starting PDF processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            with current_event.create_span("PDF Extraction"):
                extracted_content = self._extract_content(file_data)
                structured_content, title = self._structure_content(extracted_content)
            with current_event.create_span("Markdown Generation"):
                llm_chunks = self._split_content_for_llm(structured_content)
                markdown = self._process_chunks_with_llm(llm_chunks)
            self._save_markdown(markdown)
            self._log("Finished processing PDF")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing PDF: {str(e)}", level='error')
            raise

    def _extract_content(self, file_data):
        extracted_content = []
        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
            figure_counter = 1
            for page_num, page in enumerate(pdf.pages):
                self._log(f"Extracting content from page {page_num + 1}")
                page_content = {
                    'text': page.extract_text(),
                    'figures': self._extract_figures(page, page_num, figure_counter),
                    'tables': self._extract_tables(page)
                }
                self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
                figure_counter += len(page_content['figures'])
                extracted_content.append(page_content)
        return extracted_content

    def _extract_figures(self, page, page_num, figure_counter):
        figures = []
        # Omit figure processing for now!
        # for img in page.images:
        #     try:
        #         # Try to get the bbox, use full page dimensions if not available
        #         bbox = img.get('bbox', (0, 0, page.width, page.height))
        #
        #         figure = {
        #             'figure_number': figure_counter,
        #             'filename': f"figure_{page_num + 1}_{figure_counter}.png",
        #             'caption': self._find_figure_caption(page, bbox)
        #         }
        #
        #         # Extract the figure as an image
        #         figure_image = page.within_bbox(bbox).to_image()
        #
        #         # Save the figure using MinIO
        #         with io.BytesIO() as output:
        #             figure_image.save(output, format='PNG')
        #             output.seek(0)
        #             minio_client.upload_document_file(
        #                 self.tenant.id,
        #                 self.document_version.doc_id,
        #                 self.document_version.language,
        #                 self.document_version.id,
        #                 figure['filename'],
        #                 output.getvalue()
        #             )
        #
        #         figures.append(figure)
        #         figure_counter += 1
        #     except Exception as e:
        #         self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
        return figures

    def _find_figure_caption(self, page, bbox):
        try:
            # Look for text below the figure
            caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
            caption_text = page.crop(caption_bbox).extract_text()
            if caption_text and caption_text.lower().startswith('figure'):
                return caption_text
        except Exception as e:
            self._log(f"Error finding figure caption: {str(e)}", level='error')
        return None

    def _extract_tables(self, page):
        tables = []
        try:
            for table in page.extract_tables():
                if table:
                    markdown_table = self._table_to_markdown(table)
                    if markdown_table:  # Only add non-empty tables
                        tables.append(markdown_table)
                        self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
        except Exception as e:
            self._log(f"Error extracting tables from page: {str(e)}", level='error')
        return tables

    def _table_to_markdown(self, table):
        if not table or not table[0]:  # Check if table is empty or first row is empty
            return ""  # Return empty string for empty tables

        def clean_cell(cell):
            if cell is None:
                return ""  # Convert None to empty string
            return str(cell).replace("|", "\\|")  # Escape pipe characters and convert to string

        header = [clean_cell(cell) for cell in table[0]]
        markdown = "| " + " | ".join(header) + " |\n"
        markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
        for row in table[1:]:
            cleaned_row = [clean_cell(cell) for cell in row]
            markdown += "| " + " | ".join(cleaned_row) + " |\n"
        return markdown

    def _structure_content(self, extracted_content):
        structured_content = ""
        title = "Untitled Document"
        heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')

        def identify_heading(text):
            match = heading_pattern.match(text.strip())
            if match:
                numbering, _, content = match.groups()
                if numbering:
                    level = numbering.count('.') + 1
                    return level, f"{numbering}{content}"
                else:
                    return 1, content  # Assume a top-level heading if there is no numbering
            return 0, text  # Not a heading

        for page_index, page in enumerate(extracted_content):
            page_text = page.get('text') or ''  # extract_text() can return None
            # Assume the title is on the first page
            if page_index == 0:
                lines = page_text.split('\n')
                if lines:
                    title = lines[0].strip()  # Use the first line as the title
            # Process text
            paragraphs = page_text.split('\n\n')
            for para in paragraphs:
                lines = para.strip().split('\n')
                if len(lines) == 1:  # Potential heading
                    level, text = identify_heading(lines[0])
                    if level > 0:
                        heading_marks = '#' * level
                        structured_content += f"\n\n{heading_marks} {text}\n\n"
                        if level == 1 and (not title or title == "Untitled Document"):
                            title = text  # Use the first top-level heading as the title if none was set
                    else:
                        structured_content += f"{para}\n\n"  # Treat as normal paragraph
                else:
                    structured_content += f"{para}\n\n"  # Multi-line paragraph
            # Process figures
            for figure in page.get('figures', []):
                structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
                if figure['caption']:
                    structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
            # Add tables
            if 'tables' in page:
                for table in page['tables']:
                    structured_content += f"\n{table}\n"
        if self.tuning:
            self._save_intermediate(structured_content, "structured_content.md")
        return structured_content, title

    def _split_content_for_llm(self, content):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(content)

    def _process_chunks_with_llm(self, chunks):
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template('pdf_parse')
        pdf_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | pdf_prompt | llm | output_parser
        markdown_chunks = []
        for chunk in chunks:
            input_data = {"pdf_content": chunk}  # renamed from "input" to avoid shadowing the built-in
            result = chain.invoke(input_data)
            result = self._clean_markdown(result)
            markdown_chunks.append(result)
        return "\n\n".join(markdown_chunks)


# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
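
pdfplumber returns each table as a list of row lists in which cells may be None; _table_to_markdown turns that into a pipe table with "|" escaped. A standalone sketch of the same conversion on made-up data:

    table = [
        ["Name", "Qty", None],            # header row; None becomes ""
        ["Bolt M4", "12", "steel|zinc"],  # "|" must be escaped to keep the table valid
    ]

    def clean_cell(cell):
        return "" if cell is None else str(cell).replace("|", "\\|")

    header = [clean_cell(c) for c in table[0]]
    lines = ["| " + " | ".join(header) + " |",
             "| " + " | ".join(["---"] * len(header)) + " |"]
    for row in table[1:]:
        lines.append("| " + " | ".join(clean_cell(c) for c in row) + " |")
    print("\n".join(lines))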

View File

@@ -0,0 +1,92 @@
from typing import Dict, Type, Optional
from flask import current_app
from config.processor_types import PROCESSOR_TYPES
from .base_processor import BaseProcessor


class ProcessorRegistry:
    """Registry for processor types that aligns with PROCESSOR_TYPES configuration"""

    _registry: Dict[str, Type[BaseProcessor]] = {}

    @classmethod
    def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
        """
        Register a new processor type that must match a type in PROCESSOR_TYPES

        Args:
            processor_type: Type identifier from PROCESSOR_TYPES
            processor_class: Processor implementation class

        Raises:
            ValueError: If processor_type isn't defined in PROCESSOR_TYPES
        """
        if processor_type not in PROCESSOR_TYPES:
            raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
        cls._registry[processor_type] = processor_class

    @classmethod
    def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
        """
        Get the processor class for a given processor type

        Args:
            processor_type: Type identifier from PROCESSOR_TYPES

        Returns:
            The registered processor class

        Raises:
            ValueError: If no processor is registered for the given type
        """
        if processor_type not in cls._registry:
            raise ValueError(f"No processor registered for type: {processor_type}")
        return cls._registry[processor_type]

    @classmethod
    def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
        """
        Find the appropriate processor for a file type by checking PROCESSOR_TYPES definitions

        Args:
            file_type: File extension (e.g., 'html', 'pdf')

        Returns:
            Tuple of (processor_type, processor_class)

        Raises:
            ValueError: If no processor is found for the file type
        """
        # First find which processor type handles this file type
        for proc_type, config in PROCESSOR_TYPES.items():
            # Check if file_type is in the supported file_types (handling both string and list formats)
            supported_types = config['file_types']
            if isinstance(supported_types, str):
                supported_types = [t.strip() for t in supported_types.split(',')]
            if file_type in supported_types:
                # Get the registered processor class for this type
                if proc_type in cls._registry:
                    return proc_type, cls._registry[proc_type]
                else:
                    raise ValueError(
                        f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
        raise ValueError(f"No processor type found for file type: {file_type}")

    @classmethod
    def validate_processor_registration(cls):
        """
        Validate that all PROCESSOR_TYPES have registered processors

        Raises:
            ValueError: If any processor type lacks a registered processor
        """
        missing_processors = []
        for proc_type in PROCESSOR_TYPES.keys():
            if proc_type not in cls._registry:
                missing_processors.append(proc_type)
        if missing_processors:
            raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")
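
Taken together, a typical startup-and-dispatch sequence against this registry might look as follows. This is a sketch: the package path is assumed, and the context objects are taken as parameters because the diff does not show where they come from:

    from processors.processor_registry import ProcessorRegistry  # assumed package path

    def process_document(tenant, model_variables, document_version, catalog, processor):
        # At startup the processor modules must already have been imported,
        # and validate_processor_registration() can be used to fail fast.
        proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type(
            document_version.file_type)
        # Every registered processor shares the BaseProcessor constructor and
        # returns (markdown, title) from process().
        return proc_cls(tenant, model_variables, document_version, catalog, processor).process()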

View File

@@ -0,0 +1,32 @@
from common.extensions import minio_client
from .transcription_processor import TranscriptionBaseProcessor
import re


class SRTProcessor(TranscriptionBaseProcessor):
    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        srt_content = file_data.decode('utf-8')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        # Remove timecodes and subtitle numbers
        cleaned_lines = []
        for line in srt_content.split('\n'):
            # Skip empty lines, subtitle numbers, and timecodes
            if line.strip() and not line.strip().isdigit() and not re.match(
                    r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
                cleaned_lines.append(line.strip())
        # Join the cleaned lines
        cleaned_text = ' '.join(cleaned_lines)
        # Remove any extra spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text
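
What _clean_srt removes, shown on a made-up two-cue snippet with the same filtering logic inlined:

    import re

    srt = (
        "1\n00:00:01,000 --> 00:00:03,500\nHello and welcome.\n\n"
        "2\n00:00:03,600 --> 00:00:06,000\nToday we talk about processors.\n"
    )
    timecode = re.compile(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    kept = [ln.strip() for ln in srt.split('\n')
            if ln.strip() and not ln.strip().isdigit() and not timecode.match(ln)]
    print(re.sub(r'\s+', ' ', ' '.join(kept)).strip())
    # -> Hello and welcome. Today we talk about processors.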

View File

@@ -0,0 +1,98 @@
# transcription_processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event


class TranscriptionBaseProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.annotation_chunk_size = model_variables.annotation_chunk_length
        self.annotation_chunk_overlap = 0

    def process(self):
        self._log("Starting Transcription processing")
        try:
            with current_event.create_span("Transcription Generation"):
                transcription = self._get_transcription()
            with current_event.create_span("Markdown Generation"):
                chunks = self._chunk_transcription(transcription)
                markdown_chunks = self._process_chunks(chunks)
                full_markdown = self._combine_markdown_chunks(markdown_chunks)
            self._save_markdown(full_markdown)
            self._log("Finished processing Transcription")
            return full_markdown, self._extract_title_from_markdown(full_markdown)
        except Exception as e:
            self._log(f"Error processing Transcription: {str(e)}", level='error')
            raise

    def _get_transcription(self):
        # This method must be implemented by child classes
        raise NotImplementedError

    def _chunk_transcription(self, transcription):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.annotation_chunk_size,
            chunk_overlap=self.annotation_chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(transcription)

    def _process_chunks(self, chunks):
        self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template('transcript')
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | transcript_prompt | llm | output_parser
        markdown_chunks = []
        previous_part = ""
        for i, chunk in enumerate(chunks):
            input_transcript = {
                'transcript': chunk,
                'previous_part': previous_part
            }
            markdown = chain.invoke(input_transcript)
            markdown = self._clean_markdown(markdown)
            self._log_tuning("_process_chunks", {
                "Chunk Number": f"{i + 1} of {len(chunks)}",
                "Chunk": chunk,
                "Previous Chunk": previous_part,
                "Markdown": markdown,
            })
            markdown_chunks.append(markdown)
            # Carry the last section (from the last header onward) into the next iteration
            lines = markdown.split('\n')
            header_index = None
            for idx in range(len(lines) - 1, -1, -1):
                # Track the index directly; lines.index() would find an earlier
                # occurrence if the same header text appears twice
                if lines[idx].startswith('#'):
                    header_index = idx
                    break
            if header_index is not None:
                previous_part = '\n'.join(lines[header_index:])
            else:
                previous_part = lines[-1] if lines else ""
        return markdown_chunks

    def _combine_markdown_chunks(self, markdown_chunks):
        return "\n\n".join(markdown_chunks)

    def _extract_title_from_markdown(self, markdown):
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled Transcription"
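
The previous_part handed to each prompt is everything from the last markdown header of the prior chunk onward. A made-up chunk shows the reverse scan used in _process_chunks:

    markdown = "intro text\n\n## Setup\nstep one\nstep two"
    lines = markdown.split('\n')

    header_index = None
    for idx in range(len(lines) - 1, -1, -1):  # same reverse scan as _process_chunks
        if lines[idx].startswith('#'):
            header_index = idx
            break

    previous_part = ('\n'.join(lines[header_index:]) if header_index is not None
                     else (lines[-1] if lines else ""))
    print(previous_part)  # -> "## Setup" plus the two steps that follow it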