Files
eveAI/eveai_workers/Processors/srt_processor.py
Josako 6cf660e622 - Adding a Tenant Type
- Allow filtering on Tenant Types & searching for parts of Tenant names
- Implement health checks
- Start Prometheus monitoring (needs to be finalized)
- Refine audio_processor and srt_processor to reduce duplicate code and to support larger files
- Introduce repopack so that LLMs can reason about the code
2024-09-13 15:43:40 +02:00

35 lines
1.2 KiB
Python

from common.extensions import minio_client
from .transcription_processor import TranscriptionProcessor
import re
class SRTProcessor(TranscriptionProcessor):
    """Transcription processor for SubRip (``.srt``) subtitle documents.

    The subtitle file is downloaded through ``minio_client`` and reduced to
    plain running text: cue indices and timecode lines are dropped and the
    remaining subtitle text is joined into one whitespace-normalised string.
    """

    # Cue timing line, e.g. ``00:01:02,345 --> 00:01:05,000``. Matched as a
    # prefix, so anything after the end time on the same line is ignored.
    # ``.`` is accepted as the millisecond separator and the spacing around
    # the arrow is flexible, since not every exporter follows the canonical
    # ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` layout exactly.
    _SRT_TIMECODE_RE = re.compile(
        r'\d+:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d+:\d{2}:\d{2}[,.]\d{1,3}'
    )

    def _get_transcription(self):
        """Download the SRT file of the current document version as text.

        Returns:
            str: The subtitle text with cue indices and timecodes removed
            (see ``_clean_srt``).

        Raises:
            UnicodeDecodeError: If the downloaded data is not valid UTF-8.
        """
        # NOTE(review): assumes download_document_file returns the raw file
        # content as bytes -- confirm against minio_client.
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            self.document_version.file_name
        )
        # 'utf-8-sig' decodes plain UTF-8 unchanged but also strips a leading
        # byte-order mark. With plain 'utf-8' the BOM stays glued to the
        # first cue index ('\ufeff1'), which then is not recognised as a
        # number and leaks into the transcription.
        srt_content = file_data.decode('utf-8-sig')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        """Strip SRT structure from ``srt_content`` and return the bare text.

        Args:
            srt_content (str): Full content of an SRT file.

        Returns:
            str: All subtitle text lines joined by single spaces, with runs
            of whitespace collapsed and no leading/trailing whitespace.
            An input without any text lines yields ``''``.
        """
        timecode = self._SRT_TIMECODE_RE

        # splitlines() copes with LF, CRLF and CR-only line endings; blank
        # lines only separate cues and carry no content.
        lines = [
            stripped
            for stripped in (raw.strip() for raw in srt_content.splitlines())
            if stripped
        ]

        text_lines = []
        for position, line in enumerate(lines):
            if timecode.match(line):
                continue
            # A digit-only line is a cue index only when the next non-empty
            # line is a timecode. Any other digit-only line is spoken text
            # (e.g. "42") and must be kept.
            following = lines[position + 1] if position + 1 < len(lines) else ''
            if line.isdigit() and timecode.match(following):
                continue
            text_lines.append(line)

        # Join the text lines and collapse any remaining whitespace runs.
        return re.sub(r'\s+', ' ', ' '.join(text_lines)).strip()