- Introduction of dynamic Retrievers & Specialists
- Introduction of dynamic Processors
- Introduction of a caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
This commit is contained in:
32
eveai_workers/processors/srt_processor.py
Normal file
32
eveai_workers/processors/srt_processor.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from common.extensions import minio_client
|
||||
from .transcription_processor import TranscriptionBaseProcessor
|
||||
import re
|
||||
|
||||
|
||||
class SRTProcessor(TranscriptionBaseProcessor):
    """Transcription processor for subtitle files in SRT format.

    Downloads the document through ``minio_client`` and reduces the subtitle
    file to its text lines: cue numbers and timing lines are removed and the
    remaining lines are joined into one whitespace-normalized string.
    """

    # Timing line of a cue, e.g. ``00:01:02,345 --> 00:01:04,000``.
    # ``.`` is accepted next to ``,`` as the millisecond separator and the
    # spacing around the arrow is flexible, because not every subtitle writer
    # emits the canonical form. Anything after the second timestamp (such as
    # position hints) is ignored since only the start of the line is matched.
    _TIMECODE_RE = re.compile(
        r'\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}'
    )

    def _get_transcription(self):
        """Download the SRT document and return its cleaned text.

        Returns:
            str: The subtitle text without cue numbers and timing lines.

        Raises:
            UnicodeDecodeError: If the downloaded data is not valid UTF-8.
        """
        # NOTE(review): download_document_file is assumed to return bytes,
        # since the result is decoded below - confirm against minio_client.
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM.
        # With plain 'utf-8' the BOM sticks to the first cue number
        # ('\ufeff1'), which then fails isdigit() and leaks into the text.
        srt_content = file_data.decode('utf-8-sig')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        """Strip SRT structure from ``srt_content`` and return the plain text.

        Blank lines and timing lines are dropped. A line made up only of
        digits is dropped only when the line right after it is a timing line,
        i.e. when it really is a cue number; numeric dialogue such as "1984"
        is kept.

        Args:
            srt_content (str): Raw content of an SRT file.

        Returns:
            str: All remaining text lines joined by single spaces, with runs
            of whitespace collapsed and no leading/trailing whitespace.
        """
        # Drop a BOM that survived decoding elsewhere, then split on any line
        # ending (LF, CRLF or a lone CR) and trim every line once up front.
        lines = [
            line.strip()
            for line in srt_content.lstrip('\ufeff').splitlines()
        ]
        is_timecode = self._TIMECODE_RE.match

        cleaned_lines = []
        for position, line in enumerate(lines):
            if not line or is_timecode(line):
                continue
            if line.isdigit():
                # Only a number directly followed by a timing line is a cue
                # index; any other numeric line is part of the dialogue.
                following = lines[position + 1] if position + 1 < len(lines) else ''
                if is_timecode(following):
                    continue
            cleaned_lines.append(line)

        # Join the text lines and collapse any remaining whitespace runs.
        return re.sub(r'\s+', ' ', ' '.join(cleaned_lines)).strip()
|
||||
|
||||
Reference in New Issue
Block a user