- Improvements on document uploads (accept other files than html-files when entering a URL)

- Introduction of API-functionality (to be continued). Deduplication of document and url uploads between views and api. - Improvements on document processing - introduction of processor classes to streamline document inputs - Removed pure Youtube functionality, as Youtube retrieval of documents continuously changes. But added upload of srt, mp3, ogg and mp4
2024-09-02 12:37:44 +02:00
parent a158655247
commit 914c265afe
21 changed files with 1425 additions and 852 deletions
--- a/eveai_workers/Processors/audio_processor.py
+++ b/eveai_workers/Processors/audio_processor.py
@@ -0,0 +1,187 @@
+import io
+import os
+from pydub import AudioSegment
+import tempfile
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from common.extensions import minio_client
+from common.utils.model_utils import create_language_template
+from .processor import Processor
+import subprocess
+
+
+class AudioProcessor(Processor):
+    def __init__(self, tenant, model_variables, document_version):
+        super().__init__(tenant, model_variables, document_version)
+        self.transcription_client = model_variables['transcription_client']
+        self.transcription_model = model_variables['transcription_model']
+        self.ffmpeg_path = 'ffmpeg'
+
+
+    def process(self):
+        self._log("Starting Audio processing")
+        try:
+            file_data = minio_client.download_document_file(
+                self.tenant.id,
+                self.document_version.doc_id,
+                self.document_version.language,
+                self.document_version.id,
+                self.document_version.file_name
+            )
+
+            compressed_audio = self._compress_audio(file_data)
+            transcription = self._transcribe_audio(compressed_audio)
+            markdown, title = self._generate_markdown_from_transcription(transcription)
+
+            self._save_markdown(markdown)
+            self._log("Finished processing Audio")
+            return markdown, title
+        except Exception as e:
+            self._log(f"Error processing Audio: {str(e)}", level='error')
+            raise
+
+    def _compress_audio(self, audio_data):
+        self._log("Compressing audio")
+        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_input:
+            temp_input.write(audio_data)
+            temp_input.flush()
+
+            # Use a unique filename for the output to avoid conflicts
+            output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
+            output_path = os.path.join(tempfile.gettempdir(), output_filename)
+
+            try:
+                result = subprocess.run(
+                    [self.ffmpeg_path, '-y', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', output_path],
+                    capture_output=True,
+                    text=True,
+                    check=True
+                )
+
+                with open(output_path, 'rb') as f:
+                    compressed_data = f.read()
+
+                # Save compressed audio to MinIO
+                compressed_filename = f"{self.document_version.id}_compressed.mp3"
+                minio_client.upload_document_file(
+                    self.tenant.id,
+                    self.document_version.doc_id,
+                    self.document_version.language,
+                    self.document_version.id,
+                    compressed_filename,
+                    compressed_data
+                )
+                self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
+
+                return compressed_data
+
+            except subprocess.CalledProcessError as e:
+                error_message = f"Compression failed: {e.stderr}"
+                self._log(error_message, level='error')
+                raise Exception(error_message)
+
+            finally:
+                # Clean up temporary files
+                os.unlink(temp_input.name)
+                if os.path.exists(output_path):
+                    os.unlink(output_path)
+
+    def _transcribe_audio(self, audio_data):
+        self._log("Starting audio transcription")
+        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
+
+        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
+        transcriptions = []
+
+        for i, chunk in enumerate(audio[::segment_length]):
+            self._log(f'Processing chunk {i + 1} of {len(audio) // segment_length + 1}')
+
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
+                chunk.export(temp_audio.name, format="mp3")
+                temp_audio.flush()
+
+                try:
+                    file_size = os.path.getsize(temp_audio.name)
+                    self._log(f"Temporary audio file size: {file_size} bytes")
+
+                    with open(temp_audio.name, 'rb') as audio_file:
+                        file_start = audio_file.read(100)
+                        self._log(f"First 100 bytes of audio file: {file_start}")
+                        audio_file.seek(0)  # Reset file pointer to the beginning
+
+                        self._log("Calling transcription API")
+                        transcription = self.transcription_client.audio.transcriptions.create(
+                            file=audio_file,
+                            model=self.transcription_model,
+                            language=self.document_version.language,
+                            response_format='verbose_json',
+                        )
+                        self._log("Transcription API call completed")
+
+                    if transcription:
+                        # Handle the transcription result based on its type
+                        if isinstance(transcription, str):
+                            self._log(f"Transcription result (string): {transcription[:100]}...")
+                            transcriptions.append(transcription)
+                        elif hasattr(transcription, 'text'):
+                            self._log(
+                                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
+                            transcriptions.append(transcription.text)
+                        else:
+                            self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
+                            transcriptions.append(str(transcription))
+                    else:
+                        self._log("Warning: Received empty transcription", level='warning')
+
+                except Exception as e:
+                    self._log(f"Error during transcription: {str(e)}", level='error')
+                finally:
+                    os.unlink(temp_audio.name)
+
+        full_transcription = " ".join(filter(None, transcriptions))
+
+        if not full_transcription:
+            self._log("Warning: No transcription was generated", level='warning')
+            full_transcription = "No transcription available."
+
+        # Save transcription to MinIO
+        transcription_filename = f"{self.document_version.id}_transcription.txt"
+        minio_client.upload_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            transcription_filename,
+            full_transcription.encode('utf-8')
+        )
+        self._log(f"Saved transcription to MinIO: {transcription_filename}")
+
+        return full_transcription
+
+    def _generate_markdown_from_transcription(self, transcription):
+        self._log("Generating markdown from transcription")
+        llm = self.model_variables['llm']
+        template = self.model_variables['transcript_template']
+        language_template = create_language_template(template, self.document_version.language)
+        transcript_prompt = ChatPromptTemplate.from_template(language_template)
+        setup = RunnablePassthrough()
+        output_parser = StrOutputParser()
+
+        chain = setup | transcript_prompt | llm | output_parser
+
+        input_transcript = {'transcript': transcription}
+        markdown = chain.invoke(input_transcript)
+
+        # Extract title from the markdown
+        title = self._extract_title_from_markdown(markdown)
+
+        return markdown, title
+
+    def _extract_title_from_markdown(self, markdown):
+        # Simple extraction of the first header as the title
+        lines = markdown.split('\n')
+        for line in lines:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled Audio Transcription"
--- a/eveai_workers/Processors/html_processor.py
+++ b/eveai_workers/Processors/html_processor.py
@@ -0,0 +1,142 @@
+from bs4 import BeautifulSoup
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from common.extensions import db, minio_client
+from common.utils.model_utils import create_language_template
+from .processor import Processor
+
+
+class HTMLProcessor(Processor):
+    def __init__(self, tenant, model_variables, document_version):
+        super().__init__(tenant, model_variables, document_version)
+        self.html_tags = model_variables['html_tags']
+        self.html_end_tags = model_variables['html_end_tags']
+        self.html_included_elements = model_variables['html_included_elements']
+        self.html_excluded_elements = model_variables['html_excluded_elements']
+
+    def process(self):
+        self._log("Starting HTML processing")
+        try:
+            file_data = minio_client.download_document_file(
+                self.tenant.id,
+                self.document_version.doc_id,
+                self.document_version.language,
+                self.document_version.id,
+                self.document_version.file_name
+            )
+            html_content = file_data.decode('utf-8')
+
+            extracted_html, title = self._parse_html(html_content)
+            markdown = self._generate_markdown_from_html(extracted_html)
+
+            self._save_markdown(markdown)
+            self._log("Finished processing HTML")
+            return markdown, title
+        except Exception as e:
+            self._log(f"Error processing HTML: {str(e)}", level='error')
+            raise
+
+    def _parse_html(self, html_content):
+        self._log(f'Parsing HTML for tenant {self.tenant.id}')
+        soup = BeautifulSoup(html_content, 'html.parser')
+        extracted_html = ''
+        excluded_classes = self._parse_excluded_classes(self.tenant.html_excluded_classes)
+
+        if self.html_included_elements:
+            elements_to_parse = soup.find_all(self.html_included_elements)
+        else:
+            elements_to_parse = [soup]
+
+        for element in elements_to_parse:
+            for sub_element in element.find_all(self.html_tags):
+                if self._should_exclude_element(sub_element, excluded_classes):
+                    continue
+                extracted_html += self._extract_element_content(sub_element)
+
+        title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
+
+        self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
+        return extracted_html, title
+
+    def _generate_markdown_from_html(self, html_content):
+        self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
+
+        llm = self.model_variables['llm']
+        template = self.model_variables['html_parse_template']
+        parse_prompt = ChatPromptTemplate.from_template(template)
+        setup = RunnablePassthrough()
+        output_parser = StrOutputParser()
+        chain = setup | parse_prompt | llm | output_parser
+
+        soup = BeautifulSoup(html_content, 'lxml')
+        chunks = self._split_content(soup)
+
+        markdown_chunks = []
+        for chunk in chunks:
+            if self.embed_tuning:
+                self._log(f'Processing chunk: \n{chunk}\n')
+            input_html = {"html": chunk}
+            markdown_chunk = chain.invoke(input_html)
+            markdown_chunks.append(markdown_chunk)
+            if self.embed_tuning:
+                self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')
+
+        markdown = "\n\n".join(markdown_chunks)
+        self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
+        return markdown
+
+    def _split_content(self, soup, max_size=20000):
+        chunks = []
+        current_chunk = []
+        current_size = 0
+
+        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
+            element_html = str(element)
+            element_size = len(element_html)
+
+            if current_size + element_size > max_size and current_chunk:
+                chunks.append(''.join(map(str, current_chunk)))
+                current_chunk = []
+                current_size = 0
+
+            current_chunk.append(element)
+            current_size += element_size
+
+            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
+                chunks.append(''.join(map(str, current_chunk)))
+                current_chunk = []
+                current_size = 0
+
+        if current_chunk:
+            chunks.append(''.join(map(str, current_chunk)))
+
+        return chunks
+
+    def _parse_excluded_classes(self, excluded_classes):
+        parsed = {}
+        for rule in excluded_classes:
+            element, cls = rule.split('.', 1)
+            parsed.setdefault(element, set()).add(cls)
+        return parsed
+
+    def _should_exclude_element(self, element, excluded_classes):
+        if self.html_excluded_elements and element.find_parent(self.html_excluded_elements):
+            return True
+        return self._is_element_excluded_by_class(element, excluded_classes)
+
+    def _is_element_excluded_by_class(self, element, excluded_classes):
+        for parent in element.parents:
+            if self._element_matches_exclusion(parent, excluded_classes):
+                return True
+        return self._element_matches_exclusion(element, excluded_classes)
+
+    def _element_matches_exclusion(self, element, excluded_classes):
+        if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
+            return True
+        return element.name in excluded_classes and \
+            any(cls in excluded_classes[element.name] for cls in element.get('class', []))
+
+    def _extract_element_content(self, element):
+        content = ' '.join(child.strip() for child in element.stripped_strings)
+        return f'<{element.name}>{content}</{element.name}>\n'
--- a/eveai_workers/Processors/pdf_processor.py
+++ b/eveai_workers/Processors/pdf_processor.py
@@ -5,29 +5,23 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 import re
-
 from langchain_core.runnables import RunnablePassthrough

 from common.extensions import minio_client
 from common.utils.model_utils import create_language_template
+from .processor import Processor


-class PDFProcessor:
+class PDFProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
-        self.tenant = tenant
-        self.model_variables = model_variables
-        self.document_version = document_version
-
-        # Configuration parameters from model_variables
+        super().__init__(tenant, model_variables, document_version)
+        # PDF-specific initialization
        self.chunk_size = model_variables['PDF_chunk_size']
        self.chunk_overlap = model_variables['PDF_chunk_overlap']
        self.min_chunk_size = model_variables['PDF_min_chunk_size']
        self.max_chunk_size = model_variables['PDF_max_chunk_size']

-        # Set tuning variable for easy use
-        self.embed_tuning = model_variables['embed_tuning']
-
-    def process_pdf(self):
+    def process(self):
        self._log("Starting PDF processing")
        try:
            file_data = minio_client.download_document_file(
@@ -51,11 +45,6 @@ class PDFProcessor:
            self._log(f"Error processing PDF: {str(e)}", level='error')
            raise

-    def _log(self, message, level='debug'):
-        logger = current_app.logger
-        log_method = getattr(logger, level)
-        log_method(f"PDFProcessor - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
-
    def _extract_content(self, file_data):
        extracted_content = []
        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
@@ -248,24 +237,3 @@ class PDFProcessor:
            markdown_chunks.append(result)

        return "\n\n".join(markdown_chunks)
-
-    def _save_markdown(self, markdown):
-        markdown_filename = f"{self.document_version.id}.md"
-        minio_client.upload_document_file(
-            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            markdown_filename,
-            markdown.encode('utf-8')
-        )
-
-    def _save_intermediate(self, content, filename):
-        minio_client.upload_document_file(
-            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            filename,
-            content.encode('utf-8')
-        )
--- a/eveai_workers/Processors/processor.py
+++ b/eveai_workers/Processors/processor.py
@@ -0,0 +1,42 @@
+from abc import ABC, abstractmethod
+from flask import current_app
+from common.extensions import minio_client
+
+
+class Processor(ABC):
+    def __init__(self, tenant, model_variables, document_version):
+        self.tenant = tenant
+        self.model_variables = model_variables
+        self.document_version = document_version
+        self.embed_tuning = model_variables['embed_tuning']
+
+    @abstractmethod
+    def process(self):
+        pass
+
+    def _save_markdown(self, markdown):
+        markdown_filename = f"{self.document_version.id}.md"
+        minio_client.upload_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            markdown_filename,
+            markdown.encode('utf-8')
+        )
+
+    def _log(self, message, level='debug'):
+        logger = current_app.logger
+        log_method = getattr(logger, level)
+        log_method(
+            f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
+
+    def _save_intermediate(self, content, filename):
+        minio_client.upload_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            filename,
+            content.encode('utf-8')
+        )
--- a/eveai_workers/Processors/srt_processor.py
+++ b/eveai_workers/Processors/srt_processor.py
@@ -0,0 +1,80 @@
+import re
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from common.extensions import minio_client
+from common.utils.model_utils import create_language_template
+from .processor import Processor
+
+
+class SRTProcessor(Processor):
+    def __init__(self, tenant, model_variables, document_version):
+        super().__init__(tenant, model_variables, document_version)
+
+    def process(self):
+        self._log("Starting SRT processing")
+        try:
+            file_data = minio_client.download_document_file(
+                self.tenant.id,
+                self.document_version.doc_id,
+                self.document_version.language,
+                self.document_version.id,
+                self.document_version.file_name
+            )
+
+            srt_content = file_data.decode('utf-8')
+            cleaned_transcription = self._clean_srt(srt_content)
+            markdown, title = self._generate_markdown_from_transcription(cleaned_transcription)
+
+            self._save_markdown(markdown)
+            self._log("Finished processing SRT")
+            return markdown, title
+        except Exception as e:
+            self._log(f"Error processing SRT: {str(e)}", level='error')
+            raise
+
+    def _clean_srt(self, srt_content):
+        # Remove timecodes and subtitle numbers
+        cleaned_lines = []
+        for line in srt_content.split('\n'):
+            # Skip empty lines, subtitle numbers, and timecodes
+            if line.strip() and not line.strip().isdigit() and not re.match(
+                    r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
+                cleaned_lines.append(line.strip())
+
+        # Join the cleaned lines
+        cleaned_text = ' '.join(cleaned_lines)
+
+        # Remove any extra spaces
+        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+
+        return cleaned_text
+
+    def _generate_markdown_from_transcription(self, transcription):
+        self._log("Generating markdown from transcription")
+        llm = self.model_variables['llm']
+        template = self.model_variables['transcript_template']
+        language_template = create_language_template(template, self.document_version.language)
+        transcript_prompt = ChatPromptTemplate.from_template(language_template)
+        setup = RunnablePassthrough()
+        output_parser = StrOutputParser()
+
+        chain = setup | transcript_prompt | llm | output_parser
+
+        input_transcript = {'transcript': transcription}
+        markdown = chain.invoke(input_transcript)
+
+        # Extract title from the markdown
+        title = self._extract_title_from_markdown(markdown)
+
+        return markdown, title
+
+    def _extract_title_from_markdown(self, markdown):
+        # Simple extraction of the first header as the title
+        lines = markdown.split('\n')
+        for line in lines:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled SRT Transcription"
+
+
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -1,26 +1,16 @@
 import io
 import os
 from datetime import datetime as dt, timezone as tz
-import subprocess

-import gevent
-from bs4 import BeautifulSoup
-import html
 from celery import states
 from flask import current_app
 # OpenAI imports
-from langchain.chains.summarize import load_summarize_chain
-from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
-from langchain_core.documents import Document
+from langchain.text_splitter import MarkdownHeaderTextSplitter
 from langchain_core.exceptions import LangChainException
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy.exc import SQLAlchemyError
-from pytube import YouTube
-import PyPDF2
-from pydub import AudioSegment
-import tempfile

 from common.extensions import db, minio_client
 from common.models.document import DocumentVersion, Embedding
@@ -29,7 +19,10 @@ from common.utils.celery_utils import current_celery
 from common.utils.database import Database
 from common.utils.model_utils import select_model_variables, create_language_template
 from common.utils.os_utils import safe_remove, sync_folder
-from eveai_workers.Processors.PDF_Processor import PDFProcessor
+from eveai_workers.Processors.audio_processor import AudioProcessor
+from eveai_workers.Processors.html_processor import HTMLProcessor
+from eveai_workers.Processors.pdf_processor import PDFProcessor
+from eveai_workers.Processors.srt_processor import SRTProcessor


@current_celery.task(name='create_embeddings', queue='embeddings')
@@ -85,8 +78,10 @@ def create_embeddings(tenant_id, document_version_id):
                process_pdf(tenant, model_variables, document_version)
            case 'html':
                process_html(tenant, model_variables, document_version)
-            case 'youtube':
-                process_youtube(tenant, model_variables, document_version)
+            case 'srt':
+                process_srt(tenant, model_variables, document_version)
+            case 'mp4' | 'mp3' | 'ogg':
+                process_audio(tenant, model_variables, document_version)
            case _:
                raise Exception(f'No functionality defined for file type {document_version.file_type} '
                                f'for tenant {tenant_id} '
@@ -104,83 +99,6 @@ def create_embeddings(tenant_id, document_version_id):
        raise


-# def process_pdf(tenant, model_variables, document_version):
-#     file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-#                                                     document_version.id, document_version.file_name)
-#
-#     pdf_text = ''
-#     pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
-#     for page in pdf_reader.pages:
-#         pdf_text += page.extract_text()
-#
-#     markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
-#     markdown_file_name = f'{document_version.id}.md'
-#     minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
-#                                       document_version.id,
-#                                       markdown_file_name, markdown.encode())
-#
-#     potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
-#     chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
-#                                          model_variables['max_chunk_size'])
-#
-#     if len(chunks) > 1:
-#         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-#         document_version.system_context = f'Summary: {summary}\n'
-#     else:
-#         document_version.system_context = ''
-#
-#     enriched_chunks = enrich_chunks(tenant, document_version, chunks)
-#     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
-#
-#     try:
-#         db.session.add(document_version)
-#         document_version.processing_finished_at = dt.now(tz.utc)
-#         document_version.processing = False
-#         db.session.add_all(embeddings)
-#         db.session.commit()
-#     except SQLAlchemyError as e:
-#         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-#                                  f'on HTML, document version {document_version.id}'
-#                                  f'error: {e}')
-#         raise
-#
-#     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
-#                             f'on document version {document_version.id} :-)')
-
-def process_pdf(tenant, model_variables, document_version):
-    processor = PDFProcessor(tenant, model_variables, document_version)
-    markdown, title = processor.process_pdf()
-
-    # Create potential chunks for embedding
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
-
-    # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
-                                         model_variables['max_chunk_size'])
-
-    # Enrich chunks
-    enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
-
-    # Create embeddings
-    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
-
-    # Update document version and save embeddings
-    try:
-        db.session.add(document_version)
-        document_version.processing_finished_at = dt.now(tz.utc)
-        document_version.processing = False
-        db.session.add_all(embeddings)
-        db.session.commit()
-    except SQLAlchemyError as e:
-        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                 f'on PDF, document version {document_version.id}'
-                                 f'error: {e}')
-        raise
-
-    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
-                            f'on document version {document_version.id} :-)')
-
-
 def delete_embeddings_for_document_version(document_version):
    embeddings_to_delete = db.session.query(Embedding).filter_by(doc_vers_id=document_version.id).all()
    for embedding in embeddings_to_delete:
@@ -193,45 +111,53 @@ def delete_embeddings_for_document_version(document_version):
        raise


+def process_pdf(tenant, model_variables, document_version):
+    processor = PDFProcessor(tenant, model_variables, document_version)
+    markdown, title = processor.process()
+
+    # Process markdown and embed
+    embed_markdown(tenant, model_variables, document_version, markdown, title)
+
+
 def process_html(tenant, model_variables, document_version):
-    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                                    document_version.id, document_version.file_name)
-    html_content = file_data.decode('utf-8')
+    processor = HTMLProcessor(tenant, model_variables, document_version)
+    markdown, title = processor.process()

-    # The tags to be considered can be dependent on the tenant
-    html_tags = model_variables['html_tags']
-    html_end_tags = model_variables['html_end_tags']
-    html_included_elements = model_variables['html_included_elements']
-    html_excluded_elements = model_variables['html_excluded_elements']
+    # Process markdown and embed
+    embed_markdown(tenant, model_variables, document_version, markdown, title)

-    extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
-                                       excluded_elements=html_excluded_elements)

-    extracted_file_name = f'{document_version.id}-extracted.html'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id,
-                                      extracted_file_name, extracted_html.encode())
+def process_audio(tenant, model_variables, document_version):
+    processor = AudioProcessor(tenant, model_variables, document_version)
+    markdown, title = processor.process()

-    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
-    markdown_file_name = f'{document_version.id}.md'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id,
-                                      markdown_file_name, markdown.encode())
+    # Process markdown and embed
+    embed_markdown(tenant, model_variables, document_version, markdown, title)

-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
+
+def process_srt(tenant, model_variables, document_version):
+    processor = SRTProcessor(tenant, model_variables, document_version)
+    markdown, title = processor.process()
+
+    # Process markdown and embed
+    embed_markdown(tenant, model_variables, document_version, markdown, title)
+
+
+def embed_markdown(tenant, model_variables, document_version, markdown, title):
+    # Create potential chunks
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
+
+    # Combine chunks for embedding
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                         model_variables['max_chunk_size'])

-    if len(chunks) > 1:
-        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-        document_version.system_context = (f'Title: {title}\n'
-                                           f'Summary: {summary}\n')
-    else:
-        document_version.system_context = (f'Title: {title}\n')
+    # Enrich chunks
+    enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)

-    enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
+    # Create embeddings
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

+    # Update document version and save embeddings
    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
@@ -248,12 +174,18 @@ def process_html(tenant, model_variables, document_version):
                            f'on document version {document_version.id} :-)')


-def enrich_chunks(tenant, document_version, title, chunks):
+def enrich_chunks(tenant, model_variables, document_version, title, chunks):
    current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
                             f'on document version {document_version.id}')
-    current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
+
+    summary = ''
+    if len(chunks) > 1:
+        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
+
    chunk_total_context = (f'Filename: {document_version.file_name}\n'
                           f'User Context:\n{document_version.user_context}\n\n'
+                           f'Title: {title}\n'
+                           f'{summary}\n'
                           f'{document_version.system_context}\n\n')
    enriched_chunks = []
    initial_chunk = (f'Filename: {document_version.file_name}\n'
@@ -272,95 +204,6 @@ def enrich_chunks(tenant, document_version, title, chunks):
    return enriched_chunks


-def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
-    current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-
-    llm = model_variables['llm']
-    template = model_variables['html_parse_template']
-    parse_prompt = ChatPromptTemplate.from_template(template)
-    setup = RunnablePassthrough()
-    output_parser = StrOutputParser()
-    chain = setup | parse_prompt | llm | output_parser
-
-    soup = BeautifulSoup(html_content, 'lxml')
-
-    def split_content(soup, max_size=20000):
-        chunks = []
-        current_chunk = []
-        current_size = 0
-
-        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
-            element_html = str(element)
-            element_size = len(element_html)
-
-            if current_size + element_size > max_size and current_chunk:
-                chunks.append(''.join(map(str, current_chunk)))
-                current_chunk = []
-                current_size = 0
-
-            current_chunk.append(element)
-            current_size += element_size
-
-            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
-                chunks.append(''.join(map(str, current_chunk)))
-                current_chunk = []
-                current_size = 0
-
-        if current_chunk:
-            chunks.append(''.join(map(str, current_chunk)))
-
-        return chunks
-
-    chunks = split_content(soup)
-
-    markdown_chunks = []
-
-    for chunk in chunks:
-        current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
-                                 f'on document version {document_version.id}')
-        if tenant.embed_tuning:
-            current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
-                                                  f'------------------\n'
-                                                  f'{chunk}\n'
-                                                  f'------------------\n')
-        input_html = {"html": chunk}
-        markdown_chunk = chain.invoke(input_html)
-        markdown_chunks.append(markdown_chunk)
-        if tenant.embed_tuning:
-            current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
-                                                  f'-------------------------\n'
-                                                  f'{markdown_chunk}\n'
-                                                  f'-------------------------\n')
-        current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
-                                 f'on document version {document_version.id}')
-
-    # Combine all markdown chunks
-    markdown = "\n\n".join(markdown_chunks)
-
-    current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-
-    return markdown
-
-
-def generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_content):
-    current_app.logger.debug(f'Generating Markdown from PDF for tenant {tenant.id} '
-                             f'on document version {document_version.id}')
-    llm = model_variables['llm']
-    template = model_variables['pdf_parse_template']
-    parse_prompt = ChatPromptTemplate.from_template(template)
-    setup = RunnablePassthrough()
-    output_parser = StrOutputParser()
-
-    chain = setup | parse_prompt | llm | output_parser
-    input_pdf = {"pdf_content": pdf_content}
-
-    markdown = chain.invoke(input_pdf)
-
-    return markdown
-
-
 def summarize_chunk(tenant, model_variables, document_version, chunk):
    current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
                             f'on document version {document_version.id}')
@@ -415,65 +258,6 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
    return new_embeddings


-def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
-    current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
-    soup = BeautifulSoup(html_content, 'html.parser')
-    extracted_html = ''
-    excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)
-
-    if included_elements:
-        elements_to_parse = soup.find_all(included_elements)
-    else:
-        elements_to_parse = [soup]
-
-    log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
-
-    for element in elements_to_parse:
-        for sub_element in element.find_all(tags):
-            if should_exclude_element(sub_element, excluded_elements, excluded_classes):
-                continue
-            extracted_html += extract_element_content(sub_element)
-
-    title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
-
-    current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
-
-    return extracted_html, title
-
-
-def parse_excluded_classes(excluded_classes):
-    parsed = {}
-    for rule in excluded_classes:
-        element, cls = rule.split('.', 1)
-        parsed.setdefault(element, set()).add(cls)
-    return parsed
-
-
-def should_exclude_element(element, excluded_elements, excluded_classes):
-    if excluded_elements and element.find_parent(excluded_elements):
-        return True
-    return is_element_excluded_by_class(element, excluded_classes)
-
-
-def is_element_excluded_by_class(element, excluded_classes):
-    for parent in element.parents:
-        if element_matches_exclusion(parent, excluded_classes):
-            return True
-    return element_matches_exclusion(element, excluded_classes)
-
-
-def element_matches_exclusion(element, excluded_classes):
-    if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
-        return True
-    return element.name in excluded_classes and \
-        any(cls in excluded_classes[element.name] for cls in element.get('class', []))
-
-
-def extract_element_content(element):
-    content = ' '.join(child.strip() for child in element.stripped_strings)
-    return f'<{element.name}>{content}</{element.name}>\n'
-
-
 def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    if tenant.embed_tuning:
        current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
@@ -484,244 +268,242 @@ def log_parsing_info(tenant, tags, included_elements, excluded_elements, exclude
        current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')


-def process_youtube(tenant, model_variables, document_version):
-    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
-                             document_version.file_location)
-    download_file_name = f'{document_version.id}.mp4'
-    compressed_file_name = f'{document_version.id}.mp3'
-    transcription_file_name = f'{document_version.id}.txt'
-    markdown_file_name = f'{document_version.id}.md'
-
-    # Remove existing files (in case of a re-processing of the file
-    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id, download_file_name)
-    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id, compressed_file_name)
-    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id, transcription_file_name)
-    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id, markdown_file_name)
-
-    of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
-                                                      download_file_name)
-    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
-    compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
-    transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
-    annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
-
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
-    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
-                                                model_variables['max_chunk_size'])
-
-    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
-    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
-
-    try:
-        db.session.add(document_version)
-        document_version.processing_finished_at = dt.now(tz.utc)
-        document_version.processing = False
-        db.session.add_all(embeddings)
-        db.session.commit()
-    except SQLAlchemyError as e:
-        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                 f'on Youtube document version {document_version.id}'
-                                 f'error: {e}')
-        raise
-
-    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
-                            f'on Youtube document version {document_version.id} :-)')
-
-
-def download_youtube(url, tenant_id, document_version, file_name):
-    try:
-        current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
-        yt = YouTube(url)
-        stream = yt.streams.get_audio_only()
-
-        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-            stream.download(output_path=temp_file.name)
-            with open(temp_file.name, 'rb') as f:
-                file_data = f.read()
-
-        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                          document_version.id,
-                                          file_name, file_data)
-
-        current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
-        return file_name, yt.title, yt.description, yt.author
-    except Exception as e:
-        current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
-        raise
-
-
-def compress_audio(tenant_id, document_version, input_file, output_file):
-    try:
-        current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
-
-        input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                                         document_version.id, input_file)
-
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
-            temp_input.write(input_data)
-            temp_input.flush()
-
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
-                result = subprocess.run(
-                    ['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
-                    capture_output=True,
-                    text=True
-                )
-
-                if result.returncode != 0:
-                    raise Exception(f"Compression failed: {result.stderr}")
-
-                with open(temp_output.name, 'rb') as f:
-                    compressed_data = f.read()
-
-        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                          document_version.id,
-                                          output_file, compressed_data)
-
-        current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
-    except Exception as e:
-        current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
-        raise
-
-
-def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
-    try:
-        current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
-        client = model_variables['transcription_client']
-        model = model_variables['transcription_model']
-
-        # Download the audio file from MinIO
-        audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                                         document_version.id, input_file)
-
-        # Load the audio data into pydub
-        audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
-
-        # Define segment length (e.g., 10 minutes)
-        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
-
-        transcriptions = []
-
-        # Split audio into segments and transcribe each
-        for i, chunk in enumerate(audio[::segment_length]):
-            current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
-
-            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
-                chunk.export(temp_audio.name, format="mp3")
-
-                with open(temp_audio.name, 'rb') as audio_segment:
-                    transcription = client.audio.transcriptions.create(
-                        file=audio_segment,
-                        model=model,
-                        language=document_version.language,
-                        response_format='verbose_json',
-                    )
-
-                transcriptions.append(transcription.text)
-
-            os.unlink(temp_audio.name)  # Delete the temporary file
-
-        # Combine all transcriptions
-        full_transcription = " ".join(transcriptions)
-
-        # Upload the full transcription to MinIO
-        minio_client.upload_document_file(
-            tenant_id,
-            document_version.doc_id,
-            document_version.language,
-            document_version.id,
-            output_file,
-            full_transcription.encode('utf-8')
-        )
-
-        current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
-    except Exception as e:
-        current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
-        raise
-
-
-def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
-    try:
-        current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
-
-        char_splitter = CharacterTextSplitter(separator='.',
-                                              chunk_size=model_variables['annotation_chunk_length'],
-                                              chunk_overlap=0)
-
-        headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
-        ]
-        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
-
-        llm = model_variables['llm']
-        template = model_variables['transcript_template']
-        language_template = create_language_template(template, document_version.language)
-        transcript_prompt = ChatPromptTemplate.from_template(language_template)
-        setup = RunnablePassthrough()
-        output_parser = StrOutputParser()
-
-        # Download the transcription file from MinIO
-        transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
-                                                              document_version.language, document_version.id,
-                                                              input_file)
-        transcript = transcript_data.decode('utf-8')
-
-        chain = setup | transcript_prompt | llm | output_parser
-
-        chunks = char_splitter.split_text(transcript)
-        all_markdown_chunks = []
-        last_markdown_chunk = ''
-        for chunk in chunks:
-            current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
-            full_input = last_markdown_chunk + '\n' + chunk
-            if tenant.embed_tuning:
-                current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
-                                                      f'------------------\n'
-                                                      f'{full_input}\n'
-                                                      f'------------------\n')
-            input_transcript = {'transcript': full_input}
-            markdown = chain.invoke(input_transcript)
-            # GPT-4o returns some kind of content description: ```markdown <text> ```
-            if markdown.startswith("```markdown"):
-                markdown = "\n".join(markdown.strip().split("\n")[1:-1])
-            if tenant.embed_tuning:
-                current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
-                                                      f'------------------\n'
-                                                      f'{markdown}\n'
-                                                      f'------------------\n')
-            md_header_splits = markdown_splitter.split_text(markdown)
-            markdown_chunks = [doc.page_content for doc in md_header_splits]
-            # claude-3.5-sonnet returns introductory text
-            if not markdown_chunks[0].startswith('#'):
-                markdown_chunks.pop(0)
-            last_markdown_chunk = markdown_chunks[-1]
-            last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
-            markdown_chunks.pop()
-            all_markdown_chunks += markdown_chunks
-
-        all_markdown_chunks += [last_markdown_chunk]
-
-        annotated_transcript = '\n'.join(all_markdown_chunks)
-
-        # Upload the annotated transcript to MinIO
-        minio_client.upload_document_file(
-            tenant.id,
-            document_version.doc_id,
-            document_version.language,
-            document_version.id,
-            output_file,
-            annotated_transcript.encode('utf-8')
-        )
-
-        current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
-    except Exception as e:
-        current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
-        raise
+# def process_youtube(tenant, model_variables, document_version):
+#     download_file_name = f'{document_version.id}.mp4'
+#     compressed_file_name = f'{document_version.id}.mp3'
+#     transcription_file_name = f'{document_version.id}.txt'
+#     markdown_file_name = f'{document_version.id}.md'
+#
+#     # Remove existing files (in case of a re-processing of the file
+#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                       document_version.id, download_file_name)
+#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                       document_version.id, compressed_file_name)
+#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                       document_version.id, transcription_file_name)
+#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                       document_version.id, markdown_file_name)
+#
+#     of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
+#                                                       download_file_name)
+#     document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
+#     compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
+#     transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
+#     annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
+#
+#     potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
+#     actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+#                                                 model_variables['max_chunk_size'])
+#
+#     enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
+#     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
+#
+#     try:
+#         db.session.add(document_version)
+#         document_version.processing_finished_at = dt.now(tz.utc)
+#         document_version.processing = False
+#         db.session.add_all(embeddings)
+#         db.session.commit()
+#     except SQLAlchemyError as e:
+#         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
+#                                  f'on Youtube document version {document_version.id}'
+#                                  f'error: {e}')
+#         raise
+#
+#     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
+#                             f'on Youtube document version {document_version.id} :-)')
+#
+#
+# def download_youtube(url, tenant_id, document_version, file_name):
+#     try:
+#         current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
+#         yt = YouTube(url)
+#         stream = yt.streams.get_audio_only()
+#
+#         with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+#             stream.download(output_path=temp_file.name)
+#             with open(temp_file.name, 'rb') as f:
+#                 file_data = f.read()
+#
+#         minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+#                                           document_version.id,
+#                                           file_name, file_data)
+#
+#         current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
+#         return file_name, yt.title, yt.description, yt.author
+#     except Exception as e:
+#         current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
+#         raise
+#
+#
+# def compress_audio(tenant_id, document_version, input_file, output_file):
+#     try:
+#         current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
+#
+#         input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
+#                                                          document_version.id, input_file)
+#
+#         with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
+#             temp_input.write(input_data)
+#             temp_input.flush()
+#
+#             with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
+#                 result = subprocess.run(
+#                     ['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
+#                     capture_output=True,
+#                     text=True
+#                 )
+#
+#                 if result.returncode != 0:
+#                     raise Exception(f"Compression failed: {result.stderr}")
+#
+#                 with open(temp_output.name, 'rb') as f:
+#                     compressed_data = f.read()
+#
+#         minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+#                                           document_version.id,
+#                                           output_file, compressed_data)
+#
+#         current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
+#     except Exception as e:
+#         current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
+#         raise
+#
+#
+# def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
+#     try:
+#         current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
+#         client = model_variables['transcription_client']
+#         model = model_variables['transcription_model']
+#
+#         # Download the audio file from MinIO
+#         audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
+#                                                          document_version.id, input_file)
+#
+#         # Load the audio data into pydub
+#         audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
+#
+#         # Define segment length (e.g., 10 minutes)
+#         segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
+#
+#         transcriptions = []
+#
+#         # Split audio into segments and transcribe each
+#         for i, chunk in enumerate(audio[::segment_length]):
+#             current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
+#
+#             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
+#                 chunk.export(temp_audio.name, format="mp3")
+#
+#                 with open(temp_audio.name, 'rb') as audio_segment:
+#                     transcription = client.audio.transcriptions.create(
+#                         file=audio_segment,
+#                         model=model,
+#                         language=document_version.language,
+#                         response_format='verbose_json',
+#                     )
+#
+#                 transcriptions.append(transcription.text)
+#
+#             os.unlink(temp_audio.name)  # Delete the temporary file
+#
+#         # Combine all transcriptions
+#         full_transcription = " ".join(transcriptions)
+#
+#         # Upload the full transcription to MinIO
+#         minio_client.upload_document_file(
+#             tenant_id,
+#             document_version.doc_id,
+#             document_version.language,
+#             document_version.id,
+#             output_file,
+#             full_transcription.encode('utf-8')
+#         )
+#
+#         current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
+#     except Exception as e:
+#         current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
+#         raise
+#
+#
+# def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
+#     try:
+#         current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
+#
+#         char_splitter = CharacterTextSplitter(separator='.',
+#                                               chunk_size=model_variables['annotation_chunk_length'],
+#                                               chunk_overlap=0)
+#
+#         headers_to_split_on = [
+#             ("#", "Header 1"),
+#             ("##", "Header 2"),
+#         ]
+#         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
+#
+#         llm = model_variables['llm']
+#         template = model_variables['transcript_template']
+#         language_template = create_language_template(template, document_version.language)
+#         transcript_prompt = ChatPromptTemplate.from_template(language_template)
+#         setup = RunnablePassthrough()
+#         output_parser = StrOutputParser()
+#
+#         # Download the transcription file from MinIO
+#         transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
+#                                                               document_version.language, document_version.id,
+#                                                               input_file)
+#         transcript = transcript_data.decode('utf-8')
+#
+#         chain = setup | transcript_prompt | llm | output_parser
+#
+#         chunks = char_splitter.split_text(transcript)
+#         all_markdown_chunks = []
+#         last_markdown_chunk = ''
+#         for chunk in chunks:
+#             current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
+#             full_input = last_markdown_chunk + '\n' + chunk
+#             if tenant.embed_tuning:
+#                 current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
+#                                                       f'------------------\n'
+#                                                       f'{full_input}\n'
+#                                                       f'------------------\n')
+#             input_transcript = {'transcript': full_input}
+#             markdown = chain.invoke(input_transcript)
+#             # GPT-4o returns some kind of content description: ```markdown <text> ```
+#             if markdown.startswith("```markdown"):
+#                 markdown = "\n".join(markdown.strip().split("\n")[1:-1])
+#             if tenant.embed_tuning:
+#                 current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
+#                                                       f'------------------\n'
+#                                                       f'{markdown}\n'
+#                                                       f'------------------\n')
+#             md_header_splits = markdown_splitter.split_text(markdown)
+#             markdown_chunks = [doc.page_content for doc in md_header_splits]
+#             # claude-3.5-sonnet returns introductory text
+#             if not markdown_chunks[0].startswith('#'):
+#                 markdown_chunks.pop(0)
+#             last_markdown_chunk = markdown_chunks[-1]
+#             last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
+#             markdown_chunks.pop()
+#             all_markdown_chunks += markdown_chunks
+#
+#         all_markdown_chunks += [last_markdown_chunk]
+#
+#         annotated_transcript = '\n'.join(all_markdown_chunks)
+#
+#         # Upload the annotated transcript to MinIO
+#         minio_client.upload_document_file(
+#             tenant.id,
+#             document_version.doc_id,
+#             document_version.language,
+#             document_version.id,
+#             output_file,
+#             annotated_transcript.encode('utf-8')
+#         )
+#
+#         current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
+#     except Exception as e:
+#         current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
+#         raise


 def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
@@ -779,4 +561,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
        actual_chunks.append(current_chunk)

    return actual_chunks
-