import re

from common.extensions import minio_client

from .transcription_processor import TranscriptionBaseProcessor


class SRTProcessor(TranscriptionBaseProcessor):
    """Transcription processor that reduces an SRT subtitle file to plain text."""

    # Cue timing line, e.g. "00:00:01,000 --> 00:00:04,000". Slightly more
    # tolerant than the strict SRT form: also accepts '.' as the millisecond
    # separator, a variable hour width and optional spacing around the arrow.
    # Used with .match(), so trailing data (e.g. position hints) is allowed.
    _TIMECODE_RE = re.compile(
        r'\d+:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d+:\d{2}:\d{2}[,.]\d{1,3}'
    )
    _WHITESPACE_RE = re.compile(r'\s+')

    def _get_transcription(self):
        """Download the document's SRT file and return its spoken text.

        Returns:
            str: The subtitle text with cue numbers and timecodes removed,
            joined into a single whitespace-normalized string.

        Raises:
            UnicodeDecodeError: If the downloaded data is not valid UTF-8.
        """
        # NOTE(review): download_document_file is assumed to return bytes
        # (the result is decoded below) - confirm against minio_client.
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        # 'utf-8-sig' decodes exactly like 'utf-8' but also drops a leading
        # BOM, which would otherwise glue itself to the first cue number.
        srt_content = file_data.decode('utf-8-sig')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        """Strip SRT structure (cue numbers, timecodes, blank lines) from text.

        A digit-only line is treated as a cue number only when the line right
        after it is a timing line; other digit-only lines are kept because
        they are subtitle text (e.g. a spoken "42").

        Args:
            srt_content (str): Decoded contents of an SRT file.

        Returns:
            str: Remaining subtitle text joined with single spaces; an empty
            string if no text lines are present.
        """
        # Callers may pass text decoded elsewhere, so drop a stray BOM here
        # too. splitlines() also copes with CRLF and bare-CR line endings.
        lines = [
            line.strip() for line in srt_content.lstrip('\ufeff').splitlines()
        ]
        is_timecode = self._TIMECODE_RE.match

        text_lines = []
        for index, line in enumerate(lines):
            if not line or is_timecode(line):
                continue
            next_line = lines[index + 1] if index + 1 < len(lines) else ''
            if line.isdigit() and is_timecode(next_line):
                # Cue sequence number directly preceding its timing line.
                continue
            text_lines.append(line)

        # Collapse any whitespace runs left inside the subtitle text itself.
        return self._WHITESPACE_RE.sub(' ', ' '.join(text_lines)).strip()