import io
import os
import subprocess
import tempfile

from pydub import AudioSegment

from common.extensions import minio_client

from .transcription_processor import TranscriptionProcessor


class AudioProcessor(TranscriptionProcessor):
    """Transcribe an audio document version.

    The source file is downloaded from MinIO, re-encoded to a 64 kbit/s MP3
    with ffmpeg, split into 10-minute segments and sent segment by segment
    to the configured transcription client. Both the compressed audio and
    the joined transcription text are uploaded back to MinIO.
    """

    def __init__(self, tenant, model_variables, document_version):
        """Store the transcription client/model taken from *model_variables*.

        Args:
            tenant: Tenant object; its ``id`` is used for MinIO calls.
            model_variables: Mapping that must contain the keys
                ``'transcription_client'`` and ``'transcription_model'``.
            document_version: Document version being processed.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        # Resolved through PATH; override on the instance to use another binary.
        self.ffmpeg_path = 'ffmpeg'

    def _get_transcription(self):
        """Download the source audio, compress it and return its transcription.

        NOTE(review): presumably the hook invoked by TranscriptionProcessor;
        the base class is not visible here -- confirm.
        """
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            self.document_version.file_name
        )

        compressed_audio = self._compress_audio(file_data)
        return self._transcribe_audio(compressed_audio)

    def _remove_temp_file(self, path):
        """Delete *path* on a best-effort basis.

        A missing file is ignored and any other OS error is only logged, so
        that cleanup running in a ``finally`` clause never masks the
        exception that is actually propagating.
        """
        if not path:
            return
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            self._log(f"Could not remove temporary file {path}: {e}", level='warning')

    def _compress_audio(self, audio_data):
        """Re-encode *audio_data* to a 64 kbit/s MP3 and store it in MinIO.

        Args:
            audio_data: Raw bytes of the source audio file.

        Returns:
            The compressed MP3 as bytes.

        Raises:
            Exception: If ffmpeg exits with a non-zero status. The message
                contains ffmpeg's stderr and the original
                ``CalledProcessError`` is attached as ``__cause__``.
        """
        self._log("Compressing audio")

        # Use a unique filename for the output to avoid conflicts
        output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
        output_path = os.path.join(tempfile.gettempdir(), output_filename)
        input_path = None

        try:
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=f'.{self.document_version.file_type}'
            ) as temp_input:
                input_path = temp_input.name
                temp_input.write(audio_data)
            # The input file is closed (and therefore flushed) before ffmpeg
            # opens it; an open handle would block the read on Windows.

            subprocess.run(
                [self.ffmpeg_path, '-y', '-i', input_path, '-b:a', '64k', '-f', 'mp3', output_path],
                capture_output=True,
                text=True,
                check=True
            )

            with open(output_path, 'rb') as f:
                compressed_data = f.read()

            # Save compressed audio to MinIO
            compressed_filename = f"{self.document_version.id}_compressed.mp3"
            minio_client.upload_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                compressed_filename,
                compressed_data
            )
            self._log(f"Saved compressed audio to MinIO: {compressed_filename}")

            return compressed_data

        except subprocess.CalledProcessError as e:
            error_message = f"Compression failed: {e.stderr}"
            self._log(error_message, level='error')
            raise Exception(error_message) from e

        finally:
            # Clean up temporary files, including when writing the input failed.
            self._remove_temp_file(input_path)
            self._remove_temp_file(output_path)

    def _transcribe_audio(self, audio_data):
        """Transcribe MP3 *audio_data* in 10-minute segments.

        Segments whose transcription fails are logged and skipped. The joined
        text (or a placeholder when nothing was produced) is uploaded to
        MinIO as ``<document_version.id>_transcription.txt``.

        Args:
            audio_data: MP3-encoded audio bytes.

        Returns:
            The transcription text, or ``"No transcription available."`` when
            no segment produced any text.
        """
        self._log("Starting audio transcription")
        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")

        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
        # Ceiling division: stepping through the audio yields one chunk per
        # started segment, so an exact multiple must not count an extra one.
        total_chunks = (len(audio) + segment_length - 1) // segment_length
        transcriptions = []

        for i, chunk in enumerate(audio[::segment_length]):
            self._log(f'Processing chunk {i + 1} of {total_chunks}')
            chunk_text = self._transcribe_chunk(chunk)
            if chunk_text:
                transcriptions.append(chunk_text)

        full_transcription = " ".join(filter(None, transcriptions))

        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."

        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

    def _transcribe_chunk(self, chunk):
        """Export one audio segment to a temp MP3 and transcribe it.

        Export errors propagate to the caller; errors raised while reading
        the file or calling the transcription API are logged and swallowed
        so the remaining segments are still processed.

        Returns:
            The transcribed text, or ``None`` when the API returned nothing
            or the call failed.
        """
        # Only reserve a unique path here; the handle is closed right away so
        # the exporter and the later reader can open the file on any platform.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_path = temp_audio.name

        try:
            chunk.export(temp_path, format="mp3")

            try:
                file_size = os.path.getsize(temp_path)
                self._log(f"Temporary audio file size: {file_size} bytes")

                with open(temp_path, 'rb') as audio_file:
                    file_start = audio_file.read(100)
                    self._log(f"First 100 bytes of audio file: {file_start}")
                    audio_file.seek(0)  # Reset file pointer to the beginning

                    self._log("Calling transcription API")
                    transcription = self.transcription_client.audio.transcriptions.create(
                        file=audio_file,
                        model=self.transcription_model,
                        language=self.document_version.language,
                        response_format='verbose_json',
                    )
                    self._log("Transcription API call completed")

                return self._extract_transcription_text(transcription)

            except Exception as e:
                # Deliberate best effort: one failing segment must not abort
                # the transcription of the whole document.
                self._log(f"Error during transcription: {str(e)}", level='error')
                return None

        finally:
            self._remove_temp_file(temp_path)

    def _extract_transcription_text(self, transcription):
        """Return the text carried by an API result, logging a short preview.

        Handles a plain string, an object exposing ``text``, or any other
        truthy value (stringified). A falsy result yields ``None``.
        """
        if not transcription:
            self._log("Warning: Received empty transcription", level='warning')
            return None

        # Handle the transcription result based on its type
        if isinstance(transcription, str):
            self._log(f"Transcription result (string): {transcription[:100]}...")
            return transcription

        if hasattr(transcription, 'text'):
            self._log(
                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
            return transcription.text

        self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
        return str(transcription)