- Adding a Tenant Type

- Allow filtering on Tenant Types & searching for parts of Tenant names
- Implement health checks
- Start Prometheus monitoring (needs to be finalized)
- Refine audio_processor and srt_processor to reduce duplicate code and support larger files
- Introduce repopack to let LLMs reason about the code
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions
--- a/eveai_workers/Processors/audio_processor.py
+++ b/eveai_workers/Processors/audio_processor.py
@@ -1,45 +1,31 @@
 import io
 import os
+
 from pydub import AudioSegment
 import tempfile
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnablePassthrough
 from common.extensions import minio_client
-from common.utils.model_utils import create_language_template
-from .processor import Processor
 import subprocess

+from .transcription_processor import TranscriptionProcessor

-class AudioProcessor(Processor):
+
+class AudioProcessor(TranscriptionProcessor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        self.ffmpeg_path = 'ffmpeg'

-
-    def process(self):
-        self._log("Starting Audio processing")
-        try:
-            file_data = minio_client.download_document_file(
-                self.tenant.id,
-                self.document_version.doc_id,
-                self.document_version.language,
-                self.document_version.id,
-                self.document_version.file_name
-            )
-
-            compressed_audio = self._compress_audio(file_data)
-            transcription = self._transcribe_audio(compressed_audio)
-            markdown, title = self._generate_markdown_from_transcription(transcription)
-
-            self._save_markdown(markdown)
-            self._log("Finished processing Audio")
-            return markdown, title
-        except Exception as e:
-            self._log(f"Error processing Audio: {str(e)}", level='error')
-            raise
+    def _get_transcription(self):
+        file_data = minio_client.download_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            self.document_version.file_name
+        )
+        compressed_audio = self._compress_audio(file_data)
+        return self._transcribe_audio(compressed_audio)

    def _compress_audio(self, audio_data):
        self._log("Compressing audio")
@@ -159,29 +145,3 @@ class AudioProcessor(Processor):

        return full_transcription

-    def _generate_markdown_from_transcription(self, transcription):
-        self._log("Generating markdown from transcription")
-        llm = self.model_variables['llm']
-        template = self.model_variables['transcript_template']
-        language_template = create_language_template(template, self.document_version.language)
-        transcript_prompt = ChatPromptTemplate.from_template(language_template)
-        setup = RunnablePassthrough()
-        output_parser = StrOutputParser()
-
-        chain = setup | transcript_prompt | llm | output_parser
-
-        input_transcript = {'transcript': transcription}
-        markdown = chain.invoke(input_transcript)
-
-        # Extract title from the markdown
-        title = self._extract_title_from_markdown(markdown)
-
-        return markdown, title
-
-    def _extract_title_from_markdown(self, markdown):
-        # Simple extraction of the first header as the title
-        lines = markdown.split('\n')
-        for line in lines:
-            if line.startswith('# '):
-                return line[2:].strip()
-        return "Untitled Audio Transcription"
--- a/eveai_workers/Processors/html_processor.py
+++ b/eveai_workers/Processors/html_processor.py
@@ -14,6 +14,9 @@ class HTMLProcessor(Processor):
        self.html_end_tags = model_variables['html_end_tags']
        self.html_included_elements = model_variables['html_included_elements']
        self.html_excluded_elements = model_variables['html_excluded_elements']
+        self.chunk_size = model_variables['processing_chunk_size']  # Adjust this based on your LLM's optimal input size
+        self.chunk_overlap = model_variables[
+            'processing_chunk_overlap']  # Adjust for context preservation between chunks

    def process(self):
        self._log("Starting HTML processing")
@@ -70,7 +73,7 @@ class HTMLProcessor(Processor):
        chain = setup | parse_prompt | llm | output_parser

        soup = BeautifulSoup(html_content, 'lxml')
-        chunks = self._split_content(soup)
+        chunks = self._split_content(soup, self.chunk_size)

        markdown_chunks = []
        for chunk in chunks:
--- a/eveai_workers/Processors/pdf_processor.py
+++ b/eveai_workers/Processors/pdf_processor.py
@@ -16,10 +16,10 @@ class PDFProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        # PDF-specific initialization
-        self.chunk_size = model_variables['PDF_chunk_size']
-        self.chunk_overlap = model_variables['PDF_chunk_overlap']
-        self.min_chunk_size = model_variables['PDF_min_chunk_size']
-        self.max_chunk_size = model_variables['PDF_max_chunk_size']
+        self.chunk_size = model_variables['processing_chunk_size']
+        self.chunk_overlap = model_variables['processing_chunk_overlap']
+        self.min_chunk_size = model_variables['processing_min_chunk_size']
+        self.max_chunk_size = model_variables['processing_max_chunk_size']

    def process(self):
        self._log("Starting PDF processing")
@@ -228,12 +228,7 @@ class PDFProcessor(Processor):
        for chunk in chunks:
            input = {"pdf_content": chunk}
            result = chain.invoke(input)
-            # Remove Markdown code block delimiters if present
-            result = result.strip()
-            if result.startswith("```markdown"):
-                result = result[len("```markdown"):].strip()
-            if result.endswith("```"):
-                result = result[:-3].strip()
+            result = self._clean_markdown(result)
            markdown_chunks.append(result)

        return "\n\n".join(markdown_chunks)
--- a/eveai_workers/Processors/processor.py
+++ b/eveai_workers/Processors/processor.py
@@ -40,3 +40,13 @@ class Processor(ABC):
            filename,
            content.encode('utf-8')
        )
+
+    def _clean_markdown(self, markdown):
+        markdown = markdown.strip()
+        if markdown.startswith("```markdown"):
+            markdown = markdown[len("```markdown"):].strip()
+        if markdown.endswith("```"):
+            markdown = markdown[:-3].strip()
+
+        return markdown
+
--- a/eveai_workers/Processors/srt_processor.py
+++ b/eveai_workers/Processors/srt_processor.py
@@ -1,37 +1,19 @@
-import re
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnablePassthrough
 from common.extensions import minio_client
-from common.utils.model_utils import create_language_template
-from .processor import Processor
+from .transcription_processor import TranscriptionProcessor
+import re


-class SRTProcessor(Processor):
-    def __init__(self, tenant, model_variables, document_version):
-        super().__init__(tenant, model_variables, document_version)
-
-    def process(self):
-        self._log("Starting SRT processing")
-        try:
-            file_data = minio_client.download_document_file(
-                self.tenant.id,
-                self.document_version.doc_id,
-                self.document_version.language,
-                self.document_version.id,
-                self.document_version.file_name
-            )
-
-            srt_content = file_data.decode('utf-8')
-            cleaned_transcription = self._clean_srt(srt_content)
-            markdown, title = self._generate_markdown_from_transcription(cleaned_transcription)
-
-            self._save_markdown(markdown)
-            self._log("Finished processing SRT")
-            return markdown, title
-        except Exception as e:
-            self._log(f"Error processing SRT: {str(e)}", level='error')
-            raise
+class SRTProcessor(TranscriptionProcessor):
+    def _get_transcription(self):
+        file_data = minio_client.download_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            self.document_version.file_name
+        )
+        srt_content = file_data.decode('utf-8')
+        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        # Remove timecodes and subtitle numbers
@@ -50,31 +32,3 @@ class SRTProcessor(Processor):

        return cleaned_text

-    def _generate_markdown_from_transcription(self, transcription):
-        self._log("Generating markdown from transcription")
-        llm = self.model_variables['llm']
-        template = self.model_variables['transcript_template']
-        language_template = create_language_template(template, self.document_version.language)
-        transcript_prompt = ChatPromptTemplate.from_template(language_template)
-        setup = RunnablePassthrough()
-        output_parser = StrOutputParser()
-
-        chain = setup | transcript_prompt | llm | output_parser
-
-        input_transcript = {'transcript': transcription}
-        markdown = chain.invoke(input_transcript)
-
-        # Extract title from the markdown
-        title = self._extract_title_from_markdown(markdown)
-
-        return markdown, title
-
-    def _extract_title_from_markdown(self, markdown):
-        # Simple extraction of the first header as the title
-        lines = markdown.split('\n')
-        for line in lines:
-            if line.startswith('# '):
-                return line[2:].strip()
-        return "Untitled SRT Transcription"
-
-
--- a/eveai_workers/Processors/transcription_processor.py
+++ b/eveai_workers/Processors/transcription_processor.py
@@ -0,0 +1,90 @@
+# transcription_processor.py
+from common.utils.model_utils import create_language_template
+from .processor import Processor
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+
+class TranscriptionProcessor(Processor):
+    def __init__(self, tenant, model_variables, document_version):
+        super().__init__(tenant, model_variables, document_version)
+        self.chunk_size = model_variables['processing_chunk_size']
+        self.chunk_overlap = model_variables['processing_chunk_overlap']
+
+    def process(self):
+        self._log("Starting Transcription processing")
+        try:
+            transcription = self._get_transcription()
+            chunks = self._chunk_transcription(transcription)
+            markdown_chunks = self._process_chunks(chunks)
+            full_markdown = self._combine_markdown_chunks(markdown_chunks)
+            self._save_markdown(full_markdown)
+            self._log("Finished processing Transcription")
+            return full_markdown, self._extract_title_from_markdown(full_markdown)
+        except Exception as e:
+            self._log(f"Error processing Transcription: {str(e)}", level='error')
+            raise
+
+    def _get_transcription(self):
+        # This method should be implemented by child classes
+        raise NotImplementedError
+
+    def _chunk_transcription(self, transcription):
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+        return text_splitter.split_text(transcription)
+
+    def _process_chunks(self, chunks):
+        self._log("Generating markdown from transcription")
+        llm = self.model_variables['llm']
+        template = self.model_variables['transcript_template']
+        language_template = create_language_template(template, self.document_version.language)
+        transcript_prompt = ChatPromptTemplate.from_template(language_template)
+        setup = RunnablePassthrough()
+        output_parser = StrOutputParser()
+
+        chain = setup | transcript_prompt | llm | output_parser
+
+        markdown_chunks = []
+        previous_part = ""
+        for i, chunk in enumerate(chunks):
+            self._log(f"Processing chunk {i + 1} of {len(chunks)}")
+            self._log(f"Previous part: {previous_part}")
+            input_transcript = {
+                'transcript': chunk,
+                'previous_part': previous_part
+            }
+            markdown = chain.invoke(input_transcript)
+            markdown = self._clean_markdown(markdown)
+            markdown_chunks.append(markdown)
+
+            # Extract the last part for the next iteration
+            lines = markdown.split('\n')
+            last_header = None
+            for line in reversed(lines):
+                if line.startswith('#'):
+                    last_header = line
+                    break
+            if last_header:
+                header_index = lines.index(last_header)
+                previous_part = '\n'.join(lines[header_index:])
+            else:
+                previous_part = lines[-1] if lines else ""
+
+        return markdown_chunks
+
+    def _combine_markdown_chunks(self, markdown_chunks):
+        return "\n\n".join(markdown_chunks)
+
+    def _extract_title_from_markdown(self, markdown):
+        lines = markdown.split('\n')
+        for line in lines:
+            if line.startswith('# '):
+                return line[2:].strip()
+        return "Untitled Transcription"
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -25,6 +25,12 @@ from eveai_workers.Processors.pdf_processor import PDFProcessor
 from eveai_workers.Processors.srt_processor import SRTProcessor


+# Healthcheck task
+@current_celery.task(name='ping', queue='embeddings')
+def ping():
+    return 'pong'
+
+
@current_celery.task(name='create_embeddings', queue='embeddings')
 def create_embeddings(tenant_id, document_version_id):
    current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
@@ -184,14 +190,21 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):

    chunk_total_context = (f'Filename: {document_version.file_name}\n'
                           f'User Context:\n{document_version.user_context}\n\n'
+                           f'User Metadata:\n{document_version.user_metadata}\n\n'
                           f'Title: {title}\n'
-                           f'{summary}\n'
-                           f'{document_version.system_context}\n\n')
+                           f'Summary:\n{summary}\n'
+                           f'System Context:\n{document_version.system_context}\n\n'
+                           f'System Metadata:\n{document_version.system_metadata}\n\n'
+                           )
    enriched_chunks = []
    initial_chunk = (f'Filename: {document_version.file_name}\n'
                     f'User Context:\n{document_version.user_context}\n\n'
+                     f'User Metadata:\n{document_version.user_metadata}\n\n'
                     f'Title: {title}\n'
-                     f'{chunks[0]}')
+                     f'System Context:\n{document_version.system_context}\n\n'
+                     f'System Metadata:\n{document_version.system_metadata}\n\n'
+                     f'{chunks[0]}'
+                     )

    enriched_chunks.append(initial_chunk)
    for chunk in chunks[1:]: