- Adding a Tenant Type

- Allow filtering on Tenant Types & searching for parts of Tenant names
- Implement health checks
- Start Prometheus monitoring (needs to be finalized)
- Refine audio_processor and srt_processor to reduce duplicate code and support for larger files
- Introduce repopack to reason in LLMs about the code
This commit is contained in:
Josako
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions

View File

@@ -1,37 +1,19 @@
import re
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
from .transcription_processor import TranscriptionProcessor
import re
class SRTProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
def process(self):
self._log("Starting SRT processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
srt_content = file_data.decode('utf-8')
cleaned_transcription = self._clean_srt(srt_content)
markdown, title = self._generate_markdown_from_transcription(cleaned_transcription)
self._save_markdown(markdown)
self._log("Finished processing SRT")
return markdown, title
except Exception as e:
self._log(f"Error processing SRT: {str(e)}", level='error')
raise
class SRTProcessor(TranscriptionProcessor):
def _get_transcription(self):
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
srt_content = file_data.decode('utf-8')
return self._clean_srt(srt_content)
def _clean_srt(self, srt_content):
# Remove timecodes and subtitle numbers
@@ -50,31 +32,3 @@ class SRTProcessor(Processor):
return cleaned_text
def _generate_markdown_from_transcription(self, transcription):
self._log("Generating markdown from transcription")
llm = self.model_variables['llm']
template = self.model_variables['transcript_template']
language_template = create_language_template(template, self.document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | transcript_prompt | llm | output_parser
input_transcript = {'transcript': transcription}
markdown = chain.invoke(input_transcript)
# Extract title from the markdown
title = self._extract_title_from_markdown(markdown)
return markdown, title
def _extract_title_from_markdown(self, markdown):
# Simple extraction of the first header as the title
lines = markdown.split('\n')
for line in lines:
if line.startswith('# '):
return line[2:].strip()
return "Untitled SRT Transcription"