eveAI/eveai_workers/Processors/srt_processor.py

from common.extensions import minio_client
from .transcription_processor import TranscriptionProcessor
import re


class SRTProcessor(TranscriptionProcessor):
    """Transcription processor for SubRip (``.srt``) subtitle files.

    The subtitle file is fetched from object storage, decoded, and reduced
    to a single line of plain text containing only the spoken content.
    """

    # Cue timing line, e.g. "00:01:02,345 --> 00:01:04,000". Compiled once
    # instead of being re-specified for every line. "." is accepted as the
    # millisecond separator and spacing around the arrow is flexible, since
    # both variants occur in files produced by some subtitle tools.
    _TIMECODE_RE = re.compile(
        r'\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}'
    )

    def _get_transcription(self) -> str:
        """Download the SRT document and return its cleaned transcript.

        The file is located through ``self.tenant.id`` and the bucket and
        object names stored on ``self.document_version``.

        Returns:
            The subtitle text with counters and timecodes removed.

        Raises:
            UnicodeDecodeError: If the downloaded bytes are not valid UTF-8.
        """
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        # "utf-8-sig" decodes plain UTF-8 identically but also drops a
        # leading byte-order mark, which many subtitle editors write.
        srt_content = file_data.decode('utf-8-sig')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content: str) -> str:
        """Strip SRT structure from *srt_content* and return the plain text.

        Blank lines, timecode lines and cue counters are removed. A
        digit-only line is treated as a cue counter only when the line
        directly after it is a timecode, so numeric dialogue such as
        "1984" is kept.

        Args:
            srt_content: Decoded contents of an SRT file.

        Returns:
            The remaining text joined with single spaces, with all runs of
            whitespace collapsed and no leading or trailing whitespace.
        """
        # Defensive: callers may pass text that still carries a BOM, which
        # str.strip() does not remove and which would hide the first counter.
        # splitlines() copes with LF, CRLF and bare-CR line endings alike.
        lines = [raw.strip() for raw in srt_content.lstrip('\ufeff').splitlines()]
        is_timecode = self._TIMECODE_RE.match

        kept = []
        for index, line in enumerate(lines):
            if not line or is_timecode(line):
                continue
            if line.isdigit():
                following = lines[index + 1] if index + 1 < len(lines) else ''
                if is_timecode(following):
                    # Cue counter that introduces the next timecode.
                    continue
            kept.append(line)

        # Collapse whitespace left inside the individual lines.
        return re.sub(r'\s+', ' ', ' '.join(kept)).strip()