import io
import os
import subprocess
import tempfile

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from pydub import AudioSegment

from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor


class AudioProcessor(Processor):
    """Turn an uploaded audio document into markdown.

    Pipeline (see ``process``): download the file from MinIO, compress it to
    a 64 kbit/s MP3 with ffmpeg, transcribe it in fixed-length segments via
    the configured transcription client, and ask the configured LLM to
    convert the transcript into markdown.

    ``model_variables`` must provide the keys ``transcription_client`` and
    ``transcription_model`` (read in ``__init__``) as well as ``llm`` and
    ``transcript_template`` (read when the markdown is generated).
    """

    # Length of one transcription segment: 10 minutes, in milliseconds.
    SEGMENT_LENGTH_MS = 10 * 60 * 1000

    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        # Resolved through PATH by subprocess; override on the instance if
        # ffmpeg lives elsewhere.
        self.ffmpeg_path = 'ffmpeg'

    def process(self):
        """Run the full audio pipeline.

        Returns:
            A ``(markdown, title)`` tuple.

        Raises:
            Exception: any failure in a pipeline step is logged at error
                level and re-raised unchanged.
        """
        self._log("Starting Audio processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )

            compressed_audio = self._compress_audio(file_data)
            transcription = self._transcribe_audio(compressed_audio)
            markdown, title = self._generate_markdown_from_transcription(transcription)

            self._save_markdown(markdown)
            self._log("Finished processing Audio")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Audio: {str(e)}", level='error')
            raise

    def _remove_temp_file(self, path):
        """Best-effort removal of a temporary file.

        A failure to delete is logged as a warning instead of raised, so that
        cleanup running in a ``finally`` clause never hides the exception
        that is already propagating.
        """
        if not path:
            return
        try:
            os.unlink(path)
        except FileNotFoundError:
            # Nothing to clean up (e.g. ffmpeg never produced its output).
            pass
        except OSError as e:
            self._log(f"Could not remove temporary file {path}: {e}", level='warning')

    def _compress_audio(self, audio_data):
        """Re-encode ``audio_data`` as a 64 kbit/s MP3 and store it in MinIO.

        Args:
            audio_data: raw bytes of the source audio file.

        Returns:
            The compressed MP3 as bytes.

        Raises:
            Exception: when ffmpeg exits with a non-zero status; the message
                contains ffmpeg's stderr and the original
                ``CalledProcessError`` is attached as ``__cause__``.
        """
        self._log("Compressing audio")

        # Use a unique filename for the output to avoid conflicts
        output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
        output_path = os.path.join(tempfile.gettempdir(), output_filename)
        input_path = None

        try:
            with tempfile.NamedTemporaryFile(
                    delete=False,
                    suffix=f'.{self.document_version.file_type}') as temp_input:
                input_path = temp_input.name
                temp_input.write(audio_data)
            # The input handle is closed (and therefore flushed) before ffmpeg
            # opens the file by name; reopening a still-open named temporary
            # file is not portable across platforms.

            subprocess.run(
                [self.ffmpeg_path, '-y', '-i', input_path, '-b:a', '64k', '-f', 'mp3', output_path],
                capture_output=True,
                text=True,
                check=True
            )

            with open(output_path, 'rb') as f:
                compressed_data = f.read()
        except subprocess.CalledProcessError as e:
            error_message = f"Compression failed: {e.stderr}"
            self._log(error_message, level='error')
            raise Exception(error_message) from e
        finally:
            # Clean up temporary files
            self._remove_temp_file(input_path)
            self._remove_temp_file(output_path)

        # Save compressed audio to MinIO
        compressed_filename = f"{self.document_version.id}_compressed.mp3"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            compressed_filename,
            compressed_data
        )
        self._log(f"Saved compressed audio to MinIO: {compressed_filename}")

        return compressed_data

    def _transcription_to_text(self, transcription):
        """Normalise a transcription API result to plain text.

        Accepts a plain string, an object exposing a ``text`` attribute, or
        anything else (which is converted with ``str``). The first 100
        characters of the result are logged.
        """
        if isinstance(transcription, str):
            self._log(f"Transcription result (string): {transcription[:100]}...")
            return transcription
        if hasattr(transcription, 'text'):
            self._log(
                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
            return transcription.text
        self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
        return str(transcription)

    def _transcribe_chunk(self, chunk, chunk_number):
        """Export one audio segment to a temporary MP3 and transcribe it.

        Args:
            chunk: the pydub segment to transcribe.
            chunk_number: 1-based position of the segment, used in log output.

        Returns:
            The transcribed text, or ``None`` when the API returned nothing
            or the transcription failed. Transcription failures are logged
            and swallowed so that the remaining segments are still processed;
            a failure while exporting the segment is propagated.
        """
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            chunk_path = temp_audio.name
        # The placeholder is closed before pydub writes to the same path.

        try:
            exported = chunk.export(chunk_path, format="mp3")
            # pydub hands back the (still open) output file handle.
            if hasattr(exported, 'close'):
                exported.close()

            try:
                file_size = os.path.getsize(chunk_path)
                self._log(f"Temporary audio file size: {file_size} bytes")

                with open(chunk_path, 'rb') as audio_file:
                    self._log("Calling transcription API")
                    transcription = self.transcription_client.audio.transcriptions.create(
                        file=audio_file,
                        model=self.transcription_model,
                        language=self.document_version.language,
                        response_format='verbose_json',
                    )
                self._log("Transcription API call completed")

                if not transcription:
                    self._log("Warning: Received empty transcription", level='warning')
                    return None
                return self._transcription_to_text(transcription)
            except Exception as e:
                # Deliberately best-effort: one bad segment must not discard
                # the text of all the others.
                self._log(
                    f"Error during transcription of chunk {chunk_number}: {str(e)}",
                    level='error')
                return None
        finally:
            self._remove_temp_file(chunk_path)

    def _transcribe_audio(self, audio_data):
        """Transcribe MP3 bytes segment by segment and store the text in MinIO.

        The audio is cut into segments of ``SEGMENT_LENGTH_MS`` milliseconds;
        each segment is transcribed independently and the texts are joined
        with single spaces.

        Args:
            audio_data: MP3-encoded audio as bytes.

        Returns:
            The full transcription, or the placeholder
            ``"No transcription available."`` when no segment yielded text.
        """
        self._log("Starting audio transcription")
        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")

        segment_length = self.SEGMENT_LENGTH_MS
        # Ceiling division: an audio length that is an exact multiple of the
        # segment length must not be reported as having one extra chunk.
        total_chunks = (len(audio) + segment_length - 1) // segment_length

        transcriptions = []
        chunks_without_text = []

        for chunk_number, chunk in enumerate(audio[::segment_length], start=1):
            self._log(f'Processing chunk {chunk_number} of {total_chunks}')
            text = self._transcribe_chunk(chunk, chunk_number)
            if text:
                transcriptions.append(text)
            else:
                chunks_without_text.append(chunk_number)

        full_transcription = " ".join(transcriptions)

        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."
        elif chunks_without_text:
            self._log(
                f"Warning: Transcription is incomplete, no text for chunk(s): "
                f"{', '.join(str(n) for n in chunks_without_text)}",
                level='warning')

        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

    def _generate_markdown_from_transcription(self, transcription):
        """Ask the configured LLM to turn a transcript into markdown.

        The prompt is built from ``model_variables['transcript_template']``
        after it has been passed through ``create_language_template`` with
        the document language; the transcript is supplied to the chain under
        the ``transcript`` key.

        Returns:
            A ``(markdown, title)`` tuple, where ``title`` is extracted from
            the generated markdown.
        """
        self._log("Generating markdown from transcription")

        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {'transcript': transcription}
        markdown = chain.invoke(input_transcript)

        # Extract title from the markdown
        title = self._extract_title_from_markdown(markdown)

        return markdown, title

    def _extract_title_from_markdown(self, markdown):
        """Return the text of the first non-empty level-1 heading.

        Only lines that start with ``"# "`` are considered. Headings whose
        text is blank are skipped, and ``None`` or empty input is tolerated.

        Returns:
            The heading text, or ``"Untitled Audio Transcription"`` when no
            usable heading exists.
        """
        for line in (markdown or "").split('\n'):
            if line.startswith('# '):
                title = line[2:].strip()
                if title:
                    return title
        return "Untitled Audio Transcription"