- Introduction of API functionality (to be continued). Deduplication of document and URL uploads between views and API. - Improvements to document processing: introduction of processor classes to streamline document inputs. - Removed pure YouTube functionality, as YouTube retrieval of documents continuously changes, but added upload of srt, mp3, ogg and mp4 files.
188 lines
7.9 KiB
Python
188 lines
7.9 KiB
Python
import io
|
|
import os
|
|
from pydub import AudioSegment
|
|
import tempfile
|
|
from langchain_core.output_parsers import StrOutputParser
|
|
from langchain_core.prompts import ChatPromptTemplate
|
|
from langchain_core.runnables import RunnablePassthrough
|
|
from common.extensions import minio_client
|
|
from common.utils.model_utils import create_language_template
|
|
from .processor import Processor
|
|
import subprocess
|
|
|
|
|
|
class AudioProcessor(Processor):
    """Convert a stored audio file into a markdown document.

    The pipeline implemented by ``process`` is:

    1. download the original file from MinIO,
    2. compress it to a 64 kbit/s MP3 with ffmpeg,
    3. transcribe the MP3 in 10-minute chunks through the configured
       transcription client,
    4. have the LLM turn the transcript into markdown and derive a title.

    ``_log`` and ``_save_markdown`` are not defined in this class; they are
    presumably inherited from ``Processor`` (verify against the base class).
    """

    def __init__(self, tenant, model_variables, document_version):
        """Initialise the processor.

        Args:
            tenant: Tenant owning the document; its ``id`` is used in the
                MinIO calls.
            model_variables: Mapping that must provide
                ``transcription_client`` and ``transcription_model`` here,
                and ``llm`` and ``transcript_template`` when markdown is
                generated.
            document_version: Document version to process; its ``id``,
                ``doc_id``, ``language``, ``file_name`` and ``file_type``
                attributes are read by this class.
        """
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        # Bare executable name: resolved through PATH when ffmpeg is invoked.
        self.ffmpeg_path = 'ffmpeg'

    def process(self):
        """Run the full audio pipeline for ``self.document_version``.

        Returns:
            A ``(markdown, title)`` tuple.

        Raises:
            Exception: any failure is logged at error level and re-raised
                unchanged.
        """
        self._log("Starting Audio processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )

            compressed_audio = self._compress_audio(file_data)
            transcription = self._transcribe_audio(compressed_audio)
            markdown, title = self._generate_markdown_from_transcription(transcription)

            self._save_markdown(markdown)
            self._log("Finished processing Audio")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Audio: {str(e)}", level='error')
            raise

    def _compress_audio(self, audio_data):
        """Re-encode ``audio_data`` as a 64 kbit/s MP3 and store it in MinIO.

        Args:
            audio_data: Raw bytes of the original file.

        Returns:
            The compressed MP3 as bytes.

        Raises:
            Exception: when ffmpeg exits with a non-zero status; the original
                ``CalledProcessError`` is attached as ``__cause__``.
        """
        self._log("Compressing audio")

        input_path = None
        # Use a unique filename for the output to avoid conflicts
        output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        try:
            # The input file is written and CLOSED before ffmpeg runs, so all
            # bytes are on disk and no open handle blocks ffmpeg or the later
            # unlink on platforms that lock open files.
            with tempfile.NamedTemporaryFile(
                    delete=False,
                    suffix=f'.{self.document_version.file_type}') as temp_input:
                input_path = temp_input.name
                temp_input.write(audio_data)

            try:
                subprocess.run(
                    [self.ffmpeg_path, '-y', '-i', input_path, '-b:a', '64k', '-f', 'mp3', output_path],
                    capture_output=True,
                    text=True,
                    # ffmpeg may echo metadata in arbitrary encodings; never
                    # let decoding of its output mask the real result.
                    errors='replace',
                    check=True
                )
            except subprocess.CalledProcessError as e:
                error_message = f"Compression failed: {e.stderr}"
                self._log(error_message, level='error')
                raise Exception(error_message) from e

            with open(output_path, 'rb') as f:
                compressed_data = f.read()

            # Save compressed audio to MinIO
            compressed_filename = f"{self.document_version.id}_compressed.mp3"
            minio_client.upload_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                compressed_filename,
                compressed_data
            )
            self._log(f"Saved compressed audio to MinIO: {compressed_filename}")

            return compressed_data
        finally:
            # Clean up temporary files, whichever step failed.
            for path in (input_path, output_path):
                if path and os.path.exists(path):
                    os.unlink(path)

    def _transcribe_audio(self, audio_data):
        """Transcribe MP3 bytes chunk by chunk and store the text in MinIO.

        The audio is split into 10-minute segments. A chunk whose
        transcription fails is logged and skipped (best effort), so the
        result may be partial. When nothing could be transcribed, the
        placeholder ``"No transcription available."`` is used instead.

        Args:
            audio_data: MP3-encoded audio bytes.

        Returns:
            The chunk transcriptions joined by single spaces.
        """
        self._log("Starting audio transcription")
        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")

        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
        # Ceiling division: an exact multiple of the segment length must not
        # be reported as having one extra chunk.
        total_chunks = -(-len(audio) // segment_length)
        transcriptions = []

        for i, chunk in enumerate(audio[::segment_length]):
            self._log(f'Processing chunk {i + 1} of {total_chunks}')
            text = self._transcribe_chunk(chunk)
            if text:
                transcriptions.append(text)

        full_transcription = " ".join(transcriptions)

        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."

        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

    def _transcribe_chunk(self, chunk):
        """Transcribe a single audio chunk.

        Args:
            chunk: Audio segment to export as MP3 and send to the
                transcription API.

        Returns:
            The transcribed text, or ``None`` when the API returned nothing
            or the transcription step failed (the error is logged).
        """
        # Only reserve a unique path here; the handle is closed right away so
        # the export below can open the file itself.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_path = temp_audio.name

        try:
            # export() returns the open output file handle; close it so the
            # data is flushed and no descriptor is leaked.
            chunk.export(temp_path, format="mp3").close()

            try:
                file_size = os.path.getsize(temp_path)
                self._log(f"Temporary audio file size: {file_size} bytes")

                with open(temp_path, 'rb') as audio_file:
                    file_start = audio_file.read(100)
                    self._log(f"First 100 bytes of audio file: {file_start}")
                    audio_file.seek(0)  # Reset file pointer to the beginning

                    self._log("Calling transcription API")
                    transcription = self.transcription_client.audio.transcriptions.create(
                        file=audio_file,
                        model=self.transcription_model,
                        language=self.document_version.language,
                        response_format='verbose_json',
                    )
                    self._log("Transcription API call completed")

                if not transcription:
                    self._log("Warning: Received empty transcription", level='warning')
                    return None
                return self._transcription_text(transcription)
            except Exception as e:
                # Deliberate best effort: one failing chunk must not discard
                # the chunks that were transcribed successfully.
                self._log(f"Error during transcription: {str(e)}", level='error')
                return None
        finally:
            # Runs on every path, including a failed export.
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    def _transcription_text(self, transcription):
        """Return the text carried by a transcription API result.

        Accepts a plain string, an object exposing a ``text`` attribute, or
        any other value, which is converted with ``str()``.
        """
        if isinstance(transcription, str):
            self._log(f"Transcription result (string): {transcription[:100]}...")
            return transcription
        if hasattr(transcription, 'text'):
            self._log(
                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
            return transcription.text
        self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
        return str(transcription)

    def _generate_markdown_from_transcription(self, transcription):
        """Ask the LLM to turn a transcript into markdown.

        Args:
            transcription: Full transcript text; passed to the prompt as the
                ``transcript`` variable.

        Returns:
            A ``(markdown, title)`` tuple; the title is taken from the first
            level-1 heading of the generated markdown.
        """
        self._log("Generating markdown from transcription")
        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {'transcript': transcription}
        markdown = chain.invoke(input_transcript)

        # Extract title from the markdown
        title = self._extract_title_from_markdown(markdown)

        return markdown, title

    def _extract_title_from_markdown(self, markdown):
        """Return the text of the first non-empty level-1 heading.

        Args:
            markdown: Markdown text; ``None`` or an empty string is accepted.

        Returns:
            The heading text without the leading ``# ``, or
            ``"Untitled Audio Transcription"`` when no usable heading exists.
        """
        for line in (markdown or "").split('\n'):
            if line.startswith('# '):
                title = line[2:].strip()
                # A bare "# " carries no title; keep looking.
                if title:
                    return title
        return "Untitled Audio Transcription"
|