- Add a Tenant Type

- Allow filtering on Tenant Types & searching for parts of Tenant names - Implement health checks - Start Prometheus monitoring (needs to be finalized) - Refine audio_processor and srt_processor to reduce duplicate code and to support larger files - Introduce repopack so that LLMs can reason about the code
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions
--- a/eveai_workers/Processors/pdf_processor.py
+++ b/eveai_workers/Processors/pdf_processor.py
@@ -16,10 +16,10 @@ class PDFProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        # PDF-specific initialization
-        self.chunk_size = model_variables['PDF_chunk_size']
-        self.chunk_overlap = model_variables['PDF_chunk_overlap']
-        self.min_chunk_size = model_variables['PDF_min_chunk_size']
-        self.max_chunk_size = model_variables['PDF_max_chunk_size']
+        self.chunk_size = model_variables['processing_chunk_size']
+        self.chunk_overlap = model_variables['processing_chunk_overlap']
+        self.min_chunk_size = model_variables['processing_min_chunk_size']
+        self.max_chunk_size = model_variables['processing_max_chunk_size']

    def process(self):
        self._log("Starting PDF processing")
@@ -228,12 +228,7 @@ class PDFProcessor(Processor):
        for chunk in chunks:
            input = {"pdf_content": chunk}
            result = chain.invoke(input)
-            # Remove Markdown code block delimiters if present
-            result = result.strip()
-            if result.startswith("```markdown"):
-                result = result[len("```markdown"):].strip()
-            if result.endswith("```"):
-                result = result[:-3].strip()
+            result = self._clean_markdown(result)
            markdown_chunks.append(result)

        return "\n\n".join(markdown_chunks)