- Add a Tenant Type

- Allow filtering on Tenant Types & searching for parts of Tenant names
- Implement health checks
- Start Prometheus monitoring (needs to be finalized)
- Refine audio_processor and srt_processor to reduce duplicate code and add support for larger files
- Introduce repopack so that LLMs can reason about the code
This commit is contained in:
Josako
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions

View File

@@ -16,10 +16,10 @@ class PDFProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
# PDF-specific initialization
self.chunk_size = model_variables['PDF_chunk_size']
self.chunk_overlap = model_variables['PDF_chunk_overlap']
self.min_chunk_size = model_variables['PDF_min_chunk_size']
self.max_chunk_size = model_variables['PDF_max_chunk_size']
self.chunk_size = model_variables['processing_chunk_size']
self.chunk_overlap = model_variables['processing_chunk_overlap']
self.min_chunk_size = model_variables['processing_min_chunk_size']
self.max_chunk_size = model_variables['processing_max_chunk_size']
def process(self):
self._log("Starting PDF processing")
@@ -228,12 +228,7 @@ class PDFProcessor(Processor):
for chunk in chunks:
input = {"pdf_content": chunk}
result = chain.invoke(input)
# Remove Markdown code block delimiters if present
result = result.strip()
if result.startswith("```markdown"):
result = result[len("```markdown"):].strip()
if result.endswith("```"):
result = result[:-3].strip()
result = self._clean_markdown(result)
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)