- Add a Tenant Type

- Allow filtering on Tenant Types and searching on parts of Tenant names
- Implement health checks
- Start Prometheus monitoring (still needs to be finalized)
- Refine audio_processor and srt_processor to reduce duplicated code and add support for larger files
- Introduce repopack so LLMs can reason about the codebase
Josako
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions

@@ -0,0 +1,90 @@
# transcription_processor.py
from common.utils.model_utils import create_language_template
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from .processor import Processor


class TranscriptionProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        self.chunk_size = model_variables['processing_chunk_size']
        self.chunk_overlap = model_variables['processing_chunk_overlap']

    def process(self):
        self._log("Starting Transcription processing")
        try:
            transcription = self._get_transcription()
            chunks = self._chunk_transcription(transcription)
            markdown_chunks = self._process_chunks(chunks)
            full_markdown = self._combine_markdown_chunks(markdown_chunks)
            self._save_markdown(full_markdown)
            self._log("Finished processing Transcription")
            return full_markdown, self._extract_title_from_markdown(full_markdown)
        except Exception as e:
            self._log(f"Error processing Transcription: {str(e)}", level='error')
            raise

    def _get_transcription(self):
        # Must be implemented by child classes to return the raw transcription text.
        raise NotImplementedError

    def _chunk_transcription(self, transcription):
        # Split the transcription into overlapping chunks sized for the model's context window.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(transcription)

    def _process_chunks(self, chunks):
        self._log("Generating markdown from transcription")
        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | transcript_prompt | llm | output_parser
        markdown_chunks = []
        previous_part = ""
        for i, chunk in enumerate(chunks):
            self._log(f"Processing chunk {i + 1} of {len(chunks)}")
            self._log(f"Previous part: {previous_part}")
            input_transcript = {
                'transcript': chunk,
                'previous_part': previous_part
            }
            markdown = chain.invoke(input_transcript)
            markdown = self._clean_markdown(markdown)
            markdown_chunks.append(markdown)
            # Pass the last header section to the next iteration so the model can
            # continue the document seamlessly. Scan by index from the end:
            # list.index() on the line text would match an earlier duplicate header.
            lines = markdown.split('\n')
            header_index = None
            for idx in range(len(lines) - 1, -1, -1):
                if lines[idx].startswith('#'):
                    header_index = idx
                    break
            if header_index is not None:
                previous_part = '\n'.join(lines[header_index:])
            else:
                previous_part = lines[-1] if lines else ""
        return markdown_chunks

    def _combine_markdown_chunks(self, markdown_chunks):
        return "\n\n".join(markdown_chunks)

    def _extract_title_from_markdown(self, markdown):
        # Use the first level-one heading as the document title, with a fallback.
        for line in markdown.split('\n'):
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled Transcription"