- Introduction of dynamic Retrievers & Specialists

- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
Josako
2024-11-15 10:00:53 +01:00
parent 55a8a95f79
commit 1807435339
101 changed files with 4181 additions and 1764 deletions

View File

@@ -0,0 +1,5 @@
# Import all processor implementations to ensure registration
from . import audio_processor, html_processor, pdf_processor
# List of all available processor implementations
__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
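
Each processor module registers itself with ProcessorRegistry at import time, so importing this package is what populates the registry. A minimal caller-side sketch; the top-level package name "processors" is an assumption, not confirmed by the diff:

    # Hypothetical caller (assumed package name "processors"): importing the
    # package runs each module's ProcessorRegistry.register(...) call.
    import processors
    from processors.processor_registry import ProcessorRegistry

    proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type("pdf")
    print(proc_type)  # -> "PDF_PROCESSOR"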

View File

@@ -0,0 +1,211 @@
import io
import os
import time
import psutil
from pydub import AudioSegment
import tempfile
from common.extensions import minio_client
import subprocess
from .processor_registry import ProcessorRegistry
from .transcription_processor import TranscriptionBaseProcessor
from common.utils.business_event_context import current_event


class AudioProcessor(TranscriptionBaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.transcription_model = model_variables.transcription_model
        self.ffmpeg_path = 'ffmpeg'
        self.max_compression_duration = model_variables.max_compression_duration
        self.max_transcription_duration = model_variables.max_transcription_duration
        self.compression_cpu_limit = model_variables.compression_cpu_limit  # CPU usage limit in percentage
        self.compression_process_delay = model_variables.compression_process_delay  # Delay between processing chunks in seconds
        self.file_type = document_version.file_type

    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        with current_event.create_span("Audio Compression"):
            compressed_audio = self._compress_audio(file_data)
        with current_event.create_span("Audio Transcription"):
            transcription = self._transcribe_audio(compressed_audio)
        return transcription

    def _compress_audio(self, audio_data):
        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
            temp_file.write(audio_data)
            temp_file_path = temp_file.name
        try:
            audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
            total_duration = len(audio_info)
            self._log_tuning("_compress_audio", {
                "Audio Duration (ms)": total_duration,
            })
            segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
            total_chunks = (total_duration + segment_length - 1) // segment_length
            compressed_segments = AudioSegment.empty()
            for i in range(total_chunks):
                self._log_tuning("_compress_audio", {
                    "Segment Nr": f"{i + 1} of {total_chunks}"
                })
                start_time = i * segment_length
                end_time = min((i + 1) * segment_length, total_duration)
                chunk = AudioSegment.from_file(
                    temp_file_path,
                    format=self.document_version.file_type,
                    start_second=start_time / 1000,
                    duration=(end_time - start_time) / 1000
                )
                compressed_chunk = self._compress_segment(chunk)
                compressed_segments += compressed_chunk
                time.sleep(self.compression_process_delay)
            # Save compressed audio to MinIO
            compressed_filename = f"{self.document_version.id}_compressed.mp3"
            with io.BytesIO() as compressed_buffer:
                compressed_segments.export(compressed_buffer, format="mp3")
                compressed_buffer.seek(0)
                minio_client.upload_document_file(
                    self.tenant.id,
                    self.document_version.doc_id,
                    self.document_version.language,
                    self.document_version.id,
                    compressed_filename,
                    compressed_buffer.read()
                )
            self._log_tuning("_compress_audio", {
                "Compressed audio to MinIO": compressed_filename
            })
            return compressed_segments
        except Exception as e:
            self._log(f"Error during audio processing: {str(e)}", level='error')
            raise
        finally:
            os.unlink(temp_file_path)  # Ensure the temporary file is deleted

    def _compress_segment(self, audio_segment):
        with io.BytesIO() as segment_buffer:
            audio_segment.export(segment_buffer, format="wav")
            segment_buffer.seek(0)
            with io.BytesIO() as output_buffer:
                command = [
                    'nice', '-n', '19',
                    'ffmpeg',
                    '-i', 'pipe:0',
                    '-ar', '16000',
                    '-ac', '1',
                    '-b:a', '32k',
                    '-filter:a', 'loudnorm',
                    '-f', 'mp3',
                    'pipe:1'
                ]
                process = psutil.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = process.communicate(input=segment_buffer.read())
                if process.returncode != 0:
                    self._log(f"FFmpeg error: {stderr.decode()}", level='error')
                    raise Exception("FFmpeg compression failed")
                output_buffer.write(stdout)
                output_buffer.seek(0)
                compressed_segment = AudioSegment.from_mp3(output_buffer)
                return compressed_segment

    def _transcribe_audio(self, audio_data):
        # audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
        audio = audio_data
        segment_length = self.max_transcription_duration * 1000  # calculate milliseconds
        transcriptions = []
        total_chunks = (len(audio) + segment_length - 1) // segment_length  # ceiling division, as in _compress_audio
        for i, chunk in enumerate(audio[::segment_length]):
            if i == total_chunks - 1:
                # Last chunk: only the remaining audio (avoids a zero duration when
                # the total length is an exact multiple of segment_length)
                segment_duration = (len(audio) - i * segment_length) // 1000
            else:
                segment_duration = self.max_transcription_duration
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
                chunk.export(temp_audio.name, format="mp3")
                temp_audio.flush()
            try:
                file_size = os.path.getsize(temp_audio.name)
                with open(temp_audio.name, 'rb') as audio_file:
                    transcription = self.model_variables.transcription_model.transcribe(
                        file=audio_file,
                        language=self.document_version.language,
                        response_format='verbose_json',
                        duration=segment_duration
                    )
                if transcription:
                    # Normalize the transcription result based on its type
                    if isinstance(transcription, str):
                        trans = transcription
                    elif hasattr(transcription, 'text'):
                        trans = transcription.text
                    else:
                        trans = str(transcription)
                    transcriptions.append(trans)
                    self._log_tuning("_transcribe_audio", {
                        "Chunk Nr": f"{i + 1} of {total_chunks}",
                        "Segment Duration": segment_duration,
                        "Transcription": trans,
                    })
                else:
                    self._log("Warning: Received empty transcription", level='warning')
                    self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
            except Exception as e:
                self._log(f"Error during transcription: {str(e)}", level='error')
            finally:
                os.unlink(temp_audio.name)
        full_transcription = " ".join(filter(None, transcriptions))
        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."
        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
        return full_transcription


# Register the processor
ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)
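
_transcribe_audio relies on pydub's slice-with-step idiom, where audio[::segment_length] yields consecutive chunks of at most segment_length milliseconds. A small self-contained illustration of that idiom on a synthetic tone (values made up):

    from pydub.generators import Sine

    audio = Sine(440).to_audio_segment(duration=25_000)  # 25 s test tone
    segment_length = 10_000  # 10 s, in milliseconds

    # Slicing with a step yields consecutive chunks, not a stride over samples.
    chunks = list(audio[::segment_length])
    print([len(c) for c in chunks])  # -> [10000, 10000, 5000]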

View File

@@ -0,0 +1,88 @@
from abc import ABC, abstractmethod
from typing import Dict, Any
from flask import current_app
from common.extensions import minio_client
from config.logging_config import TuningLogger


class BaseProcessor(ABC):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        self.tenant = tenant
        self.model_variables = model_variables
        self.document_version = document_version
        self.catalog = catalog
        self.processor = processor
        self.tuning = processor.tuning if processor else False
        self.tuning_logger = None
        self._setup_tuning_logger()
        self._log_tuning("Processor initialized", {
            "processor_type": processor.type if processor else None,
            "document_version": document_version.id if document_version else None,
            "catalog": catalog.id if catalog else None
        })

    def _setup_tuning_logger(self):
        try:
            self.tuning_logger = TuningLogger(
                'tuning',
                tenant_id=self.tenant.id if self.tenant else None,
                catalog_id=self.catalog.id if self.catalog else None,
                processor_id=self.processor.id if self.processor else None,
            )
            # Verify logger is working with a test message
            if self.tuning:
                self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
        except Exception as e:
            current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
            raise

    @abstractmethod
    def process(self):
        pass

    def _save_markdown(self, markdown):
        markdown_filename = f"{self.document_version.id}.md"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            markdown_filename,
            markdown.encode('utf-8')
        )

    def _log(self, message, level='debug'):
        logger = current_app.logger
        log_method = getattr(logger, level)
        log_method(
            f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")

    def _save_intermediate(self, content, filename):
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            filename,
            content.encode('utf-8')
        )

    def _clean_markdown(self, markdown):
        markdown = markdown.strip()
        if markdown.startswith("```markdown"):
            markdown = markdown[len("```markdown"):].strip()
        if markdown.endswith("```"):
            markdown = markdown[:-3].strip()
        return markdown

    def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
        if self.tuning and self.tuning_logger:
            try:
                self.tuning_logger.log_tuning('processor', message, data)
            except Exception as e:
                current_app.logger.error(f"Processor: Error in tuning logging: {e}")
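
For reference, a minimal concrete subclass showing the contract BaseProcessor imposes: implement process(), use the inherited helpers for persistence. TextProcessor is illustrative only and not part of this commit:

    # Hypothetical subclass, assuming BaseProcessor is importable from the module above.
    from common.extensions import minio_client

    class TextProcessor(BaseProcessor):
        def process(self):
            # Download the raw file, strip any stray markdown fences, and
            # persist the result via the helpers inherited from BaseProcessor.
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            markdown = self._clean_markdown(file_data.decode('utf-8'))
            self._save_markdown(markdown)
            return markdown, "Untitled Document"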

View File

@@ -0,0 +1,163 @@
from bs4 import BeautifulSoup
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
from common.utils.string_list_converter import StringListConverter as SLC


class HTMLProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        cat_conf = catalog.configuration
        proc_conf = processor.configuration
        self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
        self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
        self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
        self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
        self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
        self.tuning = self.processor.tuning
        # Add verification logging
        self._log(f"HTML Processor initialized with tuning={self.tuning}")
        if self.tuning:
            self._log_tuning("HTML Processor initialized", {
                "html_tags": self.html_tags,
                "html_end_tags": self.html_end_tags,
                "included_elements": self.html_included_elements,
                "excluded_elements": self.html_excluded_elements
            })
        self.chunk_size = catalog.max_chunk_size

    def process(self):
        self._log("Starting HTML processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            html_content = file_data.decode('utf-8')
            with current_event.create_span("HTML Content Extraction"):
                extracted_html, title = self._parse_html(html_content)
            with current_event.create_span("Markdown Generation"):
                markdown = self._generate_markdown_from_html(extracted_html)
            self._save_markdown(markdown)
            self._log("Finished processing HTML")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing HTML: {str(e)}", level='error')
            raise

    def _parse_html(self, html_content):
        self._log(f'Parsing HTML for tenant {self.tenant.id}')
        soup = BeautifulSoup(html_content, 'html.parser')
        extracted_html = ''
        excluded_classes = self._parse_excluded_classes(self.html_excluded_classes)
        if self.html_included_elements:
            elements_to_parse = soup.find_all(self.html_included_elements)
        else:
            elements_to_parse = [soup]
        for element in elements_to_parse:
            for sub_element in element.find_all(self.html_tags):
                if self._should_exclude_element(sub_element, excluded_classes):
                    continue
                extracted_html += self._extract_element_content(sub_element)
        title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
        self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
        self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
        return extracted_html, title

    def _generate_markdown_from_html(self, html_content):
        self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template("html_parse")
        parse_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | parse_prompt | llm | output_parser
        soup = BeautifulSoup(html_content, 'lxml')
        chunks = self._split_content(soup, self.chunk_size)
        markdown_chunks = []
        for chunk in chunks:
            input_html = {"html": chunk}
            markdown_chunk = chain.invoke(input_html)
            markdown_chunks.append(markdown_chunk)
            self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
        markdown = "\n\n".join(markdown_chunks)
        self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
        return markdown

    def _split_content(self, soup, max_size=20000):
        chunks = []
        current_chunk = []
        current_size = 0
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
            element_html = str(element)
            element_size = len(element_html)
            if current_size + element_size > max_size and current_chunk:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0
            current_chunk.append(element)
            current_size += element_size
            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0
        if current_chunk:
            chunks.append(''.join(map(str, current_chunk)))
        return chunks

    def _parse_excluded_classes(self, excluded_classes):
        parsed = {}
        if excluded_classes:
            for rule in excluded_classes:
                element, cls = rule.split('.', 1)
                parsed.setdefault(element, set()).add(cls)
        return parsed

    def _should_exclude_element(self, element, excluded_classes):
        if self.html_excluded_elements and element.find_parent(self.html_excluded_elements):
            return True
        return self._is_element_excluded_by_class(element, excluded_classes)

    def _is_element_excluded_by_class(self, element, excluded_classes):
        for parent in element.parents:
            if self._element_matches_exclusion(parent, excluded_classes):
                return True
        return self._element_matches_exclusion(element, excluded_classes)

    def _element_matches_exclusion(self, element, excluded_classes):
        if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
            return True
        return element.name in excluded_classes and \
            any(cls in excluded_classes[element.name] for cls in element.get('class', []))

    def _extract_element_content(self, element):
        content = ' '.join(child.strip() for child in element.stripped_strings)
        return f'<{element.name}>{content}</{element.name}>\n'


# Register the processor
ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)
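
The exclusion rules consumed by _parse_excluded_classes are "element.class" strings, with "*" acting as an element wildcard. A standalone illustration of the same parsing on made-up rule values:

    rules = ["div.advertisement", "span.tooltip", "*.no-index"]

    parsed = {}
    for rule in rules:
        element, cls = rule.split('.', 1)  # same parsing as _parse_excluded_classes
        parsed.setdefault(element, set()).add(cls)

    print(parsed)
    # {'div': {'advertisement'}, 'span': {'tooltip'}, '*': {'no-index'}}
    # A '*' rule excludes the class on any element; the others only on the named tag.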

View File

@@ -0,0 +1,231 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry


class PDFProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

    def process(self):
        self._log("Starting PDF processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            with current_event.create_span("PDF Extraction"):
                extracted_content = self._extract_content(file_data)
                structured_content, title = self._structure_content(extracted_content)
            with current_event.create_span("Markdown Generation"):
                llm_chunks = self._split_content_for_llm(structured_content)
                markdown = self._process_chunks_with_llm(llm_chunks)
            self._save_markdown(markdown)
            self._log("Finished processing PDF")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing PDF: {str(e)}", level='error')
            raise

    def _extract_content(self, file_data):
        extracted_content = []
        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
            figure_counter = 1
            for page_num, page in enumerate(pdf.pages):
                self._log(f"Extracting content from page {page_num + 1}")
                page_content = {
                    'text': page.extract_text(),
                    'figures': self._extract_figures(page, page_num, figure_counter),
                    'tables': self._extract_tables(page)
                }
                self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
                figure_counter += len(page_content['figures'])
                extracted_content.append(page_content)
        return extracted_content

    def _extract_figures(self, page, page_num, figure_counter):
        figures = []
        # Omit figure processing for now!
        # for img in page.images:
        #     try:
        #         # Try to get the bbox, use full page dimensions if not available
        #         bbox = img.get('bbox', (0, 0, page.width, page.height))
        #
        #         figure = {
        #             'figure_number': figure_counter,
        #             'filename': f"figure_{page_num + 1}_{figure_counter}.png",
        #             'caption': self._find_figure_caption(page, bbox)
        #         }
        #
        #         # Extract the figure as an image
        #         figure_image = page.within_bbox(bbox).to_image()
        #
        #         # Save the figure using MinIO
        #         with io.BytesIO() as output:
        #             figure_image.save(output, format='PNG')
        #             output.seek(0)
        #             minio_client.upload_document_file(
        #                 self.tenant.id,
        #                 self.document_version.doc_id,
        #                 self.document_version.language,
        #                 self.document_version.id,
        #                 figure['filename'],
        #                 output.getvalue()
        #             )
        #
        #         figures.append(figure)
        #         figure_counter += 1
        #     except Exception as e:
        #         self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
        return figures

    def _find_figure_caption(self, page, bbox):
        try:
            # Look for text below the figure
            caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
            caption_text = page.crop(caption_bbox).extract_text()
            if caption_text and caption_text.lower().startswith('figure'):
                return caption_text
        except Exception as e:
            self._log(f"Error finding figure caption: {str(e)}", level='error')
        return None

    def _extract_tables(self, page):
        tables = []
        try:
            for table in page.extract_tables():
                if table:
                    markdown_table = self._table_to_markdown(table)
                    if markdown_table:  # Only add non-empty tables
                        tables.append(markdown_table)
                        self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
        except Exception as e:
            self._log(f"Error extracting tables from page: {str(e)}", level='error')
        return tables

    def _table_to_markdown(self, table):
        if not table or not table[0]:  # Check if table is empty or first row is empty
            return ""  # Return empty string for empty tables

        def clean_cell(cell):
            if cell is None:
                return ""  # Convert None to empty string
            return str(cell).replace("|", "\\|")  # Escape pipe characters and convert to string

        header = [clean_cell(cell) for cell in table[0]]
        markdown = "| " + " | ".join(header) + " |\n"
        markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
        for row in table[1:]:
            cleaned_row = [clean_cell(cell) for cell in row]
            markdown += "| " + " | ".join(cleaned_row) + " |\n"
        return markdown

    def _structure_content(self, extracted_content):
        structured_content = ""
        title = "Untitled Document"
        heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')

        def identify_heading(text):
            match = heading_pattern.match(text.strip())
            if match:
                numbering, _, content = match.groups()
                if numbering:
                    level = numbering.count('.') + 1
                    return level, f"{numbering}{content}"
                else:
                    return 1, content  # Assume a top-level heading if there is no numbering
            return 0, text  # Not a heading

        for page_index, page in enumerate(extracted_content):
            page_text = page.get('text') or ''  # extract_text() can return None
            # Assume the title is on the first page
            if page_index == 0:
                lines = page_text.split('\n')
                if lines:
                    title = lines[0].strip()  # Use the first line as the title
            # Process text
            paragraphs = page_text.split('\n\n')
            for para in paragraphs:
                lines = para.strip().split('\n')
                if len(lines) == 1:  # Potential heading
                    level, text = identify_heading(lines[0])
                    if level > 0:
                        heading_marks = '#' * level
                        structured_content += f"\n\n{heading_marks} {text}\n\n"
                        if level == 1 and (not title or title == "Untitled Document"):
                            title = text  # Use the first top-level heading as the title if none was set
                    else:
                        structured_content += f"{para}\n\n"  # Treat as normal paragraph
                else:
                    structured_content += f"{para}\n\n"  # Multi-line paragraph
            # Process figures
            for figure in page.get('figures', []):
                structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
                if figure['caption']:
                    structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
            # Add tables
            if 'tables' in page:
                for table in page['tables']:
                    structured_content += f"\n{table}\n"
        if self.tuning:
            self._save_intermediate(structured_content, "structured_content.md")
        return structured_content, title

    def _split_content_for_llm(self, content):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(content)

    def _process_chunks_with_llm(self, chunks):
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template('pdf_parse')
        pdf_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | pdf_prompt | llm | output_parser
        markdown_chunks = []
        for chunk in chunks:
            input_data = {"pdf_content": chunk}  # renamed from "input" to avoid shadowing the built-in
            result = chain.invoke(input_data)
            result = self._clean_markdown(result)
            markdown_chunks.append(result)
        return "\n\n".join(markdown_chunks)


# Register the processor
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
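
pdfplumber returns each table as a list of row lists in which cells may be None; _table_to_markdown turns that into a pipe table with "|" escaped. A standalone sketch of the same conversion on made-up data:

    table = [
        ["Name", "Qty", None],            # header row; None becomes ""
        ["Bolt M4", "12", "steel|zinc"],  # "|" must be escaped to keep the table valid
    ]

    def clean_cell(cell):
        return "" if cell is None else str(cell).replace("|", "\\|")

    header = [clean_cell(c) for c in table[0]]
    lines = ["| " + " | ".join(header) + " |",
             "| " + " | ".join(["---"] * len(header)) + " |"]
    for row in table[1:]:
        lines.append("| " + " | ".join(clean_cell(c) for c in row) + " |")
    print("\n".join(lines))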

View File

@@ -0,0 +1,92 @@
from typing import Dict, Type, Optional
from flask import current_app
from config.processor_types import PROCESSOR_TYPES
from .base_processor import BaseProcessor


class ProcessorRegistry:
    """Registry for processor types that aligns with PROCESSOR_TYPES configuration"""

    _registry: Dict[str, Type[BaseProcessor]] = {}

    @classmethod
    def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
        """
        Register a new processor type that must match a type in PROCESSOR_TYPES

        Args:
            processor_type: Type identifier from PROCESSOR_TYPES
            processor_class: Processor implementation class

        Raises:
            ValueError: If processor_type isn't defined in PROCESSOR_TYPES
        """
        if processor_type not in PROCESSOR_TYPES:
            raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
        cls._registry[processor_type] = processor_class

    @classmethod
    def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
        """
        Get the processor class for a given processor type

        Args:
            processor_type: Type identifier from PROCESSOR_TYPES

        Returns:
            The registered processor class

        Raises:
            ValueError: If no processor is registered for the given type
        """
        if processor_type not in cls._registry:
            raise ValueError(f"No processor registered for type: {processor_type}")
        return cls._registry[processor_type]

    @classmethod
    def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
        """
        Find the appropriate processor for a file type by checking PROCESSOR_TYPES definitions

        Args:
            file_type: File extension (e.g., 'html', 'pdf')

        Returns:
            Tuple of (processor_type, processor_class)

        Raises:
            ValueError: If no processor is found for the file type
        """
        # First find which processor type handles this file type
        for proc_type, config in PROCESSOR_TYPES.items():
            # Check if file_type is in the supported file_types (handling both string and list formats)
            supported_types = config['file_types']
            if isinstance(supported_types, str):
                supported_types = [t.strip() for t in supported_types.split(',')]
            if file_type in supported_types:
                # Get the registered processor class for this type
                if proc_type in cls._registry:
                    return proc_type, cls._registry[proc_type]
                else:
                    raise ValueError(
                        f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
        raise ValueError(f"No processor type found for file type: {file_type}")

    @classmethod
    def validate_processor_registration(cls):
        """
        Validate that all PROCESSOR_TYPES have registered processors

        Raises:
            ValueError: If any processor type lacks a registered processor
        """
        missing_processors = []
        for proc_type in PROCESSOR_TYPES.keys():
            if proc_type not in cls._registry:
                missing_processors.append(proc_type)
        if missing_processors:
            raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")
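
Taken together, a typical startup-and-dispatch sequence against this registry might look as follows. This is a sketch: the package path is assumed, and the context objects are taken as parameters because the diff does not show where they come from:

    from processors.processor_registry import ProcessorRegistry  # assumed package path

    def process_document(tenant, model_variables, document_version, catalog, processor):
        # At startup the processor modules must already have been imported,
        # and validate_processor_registration() can be used to fail fast.
        proc_type, proc_cls = ProcessorRegistry.get_processor_for_file_type(
            document_version.file_type)
        # Every registered processor shares the BaseProcessor constructor and
        # returns (markdown, title) from process().
        return proc_cls(tenant, model_variables, document_version, catalog, processor).process()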

View File

@@ -0,0 +1,32 @@
from common.extensions import minio_client
from .transcription_processor import TranscriptionBaseProcessor
import re


class SRTProcessor(TranscriptionBaseProcessor):
    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        srt_content = file_data.decode('utf-8')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        # Remove timecodes and subtitle numbers
        cleaned_lines = []
        for line in srt_content.split('\n'):
            # Skip empty lines, subtitle numbers, and timecodes
            if line.strip() and not line.strip().isdigit() and not re.match(
                    r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
                cleaned_lines.append(line.strip())
        # Join the cleaned lines
        cleaned_text = ' '.join(cleaned_lines)
        # Remove any extra spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text
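
What _clean_srt removes, shown on a made-up two-cue snippet with the same filtering logic inlined:

    import re

    srt = (
        "1\n00:00:01,000 --> 00:00:03,500\nHello and welcome.\n\n"
        "2\n00:00:03,600 --> 00:00:06,000\nToday we talk about processors.\n"
    )
    timecode = re.compile(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    kept = [ln.strip() for ln in srt.split('\n')
            if ln.strip() and not ln.strip().isdigit() and not timecode.match(ln)]
    print(re.sub(r'\s+', ' ', ' '.join(kept)).strip())
    # -> Hello and welcome. Today we talk about processors.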

View File

@@ -0,0 +1,98 @@
# transcription_processor.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event


class TranscriptionBaseProcessor(BaseProcessor):
    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.annotation_chunk_size = model_variables.annotation_chunk_length
        self.annotation_chunk_overlap = 0

    def process(self):
        self._log("Starting Transcription processing")
        try:
            with current_event.create_span("Transcription Generation"):
                transcription = self._get_transcription()
            with current_event.create_span("Markdown Generation"):
                chunks = self._chunk_transcription(transcription)
                markdown_chunks = self._process_chunks(chunks)
                full_markdown = self._combine_markdown_chunks(markdown_chunks)
            self._save_markdown(full_markdown)
            self._log("Finished processing Transcription")
            return full_markdown, self._extract_title_from_markdown(full_markdown)
        except Exception as e:
            self._log(f"Error processing Transcription: {str(e)}", level='error')
            raise

    def _get_transcription(self):
        # This method must be implemented by child classes
        raise NotImplementedError

    def _chunk_transcription(self, transcription):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.annotation_chunk_size,
            chunk_overlap=self.annotation_chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(transcription)

    def _process_chunks(self, chunks):
        self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template('transcript')
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | transcript_prompt | llm | output_parser
        markdown_chunks = []
        previous_part = ""
        for i, chunk in enumerate(chunks):
            input_transcript = {
                'transcript': chunk,
                'previous_part': previous_part
            }
            markdown = chain.invoke(input_transcript)
            markdown = self._clean_markdown(markdown)
            self._log_tuning("_process_chunks", {
                "Chunk Number": f"{i + 1} of {len(chunks)}",
                "Chunk": chunk,
                "Previous Chunk": previous_part,
                "Markdown": markdown,
            })
            markdown_chunks.append(markdown)
            # Carry the last section (from the last header onward) into the next iteration
            lines = markdown.split('\n')
            header_index = None
            for idx in range(len(lines) - 1, -1, -1):
                # Track the index directly; lines.index() would find an earlier
                # occurrence if the same header text appears twice
                if lines[idx].startswith('#'):
                    header_index = idx
                    break
            if header_index is not None:
                previous_part = '\n'.join(lines[header_index:])
            else:
                previous_part = lines[-1] if lines else ""
        return markdown_chunks

    def _combine_markdown_chunks(self, markdown_chunks):
        return "\n\n".join(markdown_chunks)

    def _extract_title_from_markdown(self, markdown):
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled Transcription"
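
The previous_part handed to each prompt is everything from the last markdown header of the prior chunk onward. A made-up chunk shows the reverse scan used in _process_chunks:

    markdown = "intro text\n\n## Setup\nstep one\nstep two"
    lines = markdown.split('\n')

    header_index = None
    for idx in range(len(lines) - 1, -1, -1):  # same reverse scan as _process_chunks
        if lines[idx].startswith('#'):
            header_index = idx
            break

    previous_part = ('\n'.join(lines[header_index:]) if header_index is not None
                     else (lines[-1] if lines else ""))
    print(previous_part)  # -> "## Setup" plus the two steps that follow it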