eveAI/eveai_workers/Processors/audio_processor.py

import io
import os
from pydub import AudioSegment
import tempfile
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
import subprocess


class AudioProcessor(Processor):
    """Turn an uploaded audio document into a markdown transcript.

    The pipeline implemented by ``process`` is:

    1. download the original audio file from MinIO,
    2. compress it to MP3 with ffmpeg and upload the compressed copy,
    3. transcribe the compressed audio in fixed-length segments and upload
       the joined transcription,
    4. have the configured LLM turn the transcription into markdown and
       derive a title from the first level-1 heading.

    ``_log`` and ``_save_markdown`` are not defined in this class; they are
    presumably inherited from ``Processor``.
    """

    # Audio bitrate handed to ffmpeg's ``-b:a`` option when compressing.
    COMPRESSION_BITRATE = '64k'

    # Length of each slice sent to the transcription API, in milliseconds
    # (10 minutes).
    # NOTE(review): presumably chosen to keep every upload below the
    # transcription service's size limit -- confirm before changing.
    SEGMENT_LENGTH_MS = 10 * 60 * 1000

    def __init__(self, tenant, model_variables, document_version):
        """Store the transcription settings found in ``model_variables``.

        Args:
            tenant: Tenant object; its ``id`` is used for MinIO paths.
            model_variables: Mapping that must contain
                ``'transcription_client'`` and ``'transcription_model'``
                (read here) as well as ``'llm'`` and
                ``'transcript_template'`` (read when generating markdown).
            document_version: Document version being processed; this class
                reads ``id``, ``doc_id``, ``language``, ``file_name`` and
                ``file_type`` from it.

        Raises:
            KeyError: If a transcription key is missing from
                ``model_variables``.
        """
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        # Resolved through PATH by subprocess; override on the instance to
        # point at a specific binary.
        self.ffmpeg_path = 'ffmpeg'

    def process(self):
        """Run the full audio pipeline for the current document version.

        Returns:
            tuple: ``(markdown, title)`` as produced by
            ``_generate_markdown_from_transcription``.

        Raises:
            Exception: Any error from a pipeline step is logged at error
                level and re-raised unchanged.
        """
        self._log("Starting Audio processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )

            compressed_audio = self._compress_audio(file_data)
            transcription = self._transcribe_audio(compressed_audio)
            markdown, title = self._generate_markdown_from_transcription(transcription)

            self._save_markdown(markdown)
            self._log("Finished processing Audio")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Audio: {e}", level='error')
            raise

    def _compress_audio(self, audio_data):
        """Re-encode ``audio_data`` as MP3 and store the result in MinIO.

        Args:
            audio_data: Raw bytes of the original audio file.

        Returns:
            bytes: The compressed MP3 data.

        Raises:
            Exception: If ffmpeg exits with a non-zero status; the message
                carries ffmpeg's stderr and the original
                ``CalledProcessError`` is chained as the cause.
        """
        self._log("Compressing audio")

        # A private scratch directory guarantees unique file names and removes
        # both files on exit, whether or not compression succeeds.
        with tempfile.TemporaryDirectory() as work_dir:
            input_path = os.path.join(work_dir, f'input.{self.document_version.file_type}')
            output_path = os.path.join(work_dir, 'compressed.mp3')

            # Write and close the input before ffmpeg opens it, so the data
            # is fully on disk and the file is not held open by this process.
            with open(input_path, 'wb') as source_file:
                source_file.write(audio_data)

            try:
                subprocess.run(
                    [self.ffmpeg_path, '-y', '-i', input_path,
                     '-b:a', self.COMPRESSION_BITRATE, '-f', 'mp3', output_path],
                    capture_output=True,
                    text=True,
                    check=True
                )
            except subprocess.CalledProcessError as e:
                error_message = f"Compression failed: {e.stderr}"
                self._log(error_message, level='error')
                raise Exception(error_message) from e

            with open(output_path, 'rb') as compressed_file:
                compressed_data = compressed_file.read()

        # Save compressed audio to MinIO
        compressed_filename = f"{self.document_version.id}_compressed.mp3"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            compressed_filename,
            compressed_data
        )
        self._log(f"Saved compressed audio to MinIO: {compressed_filename}")

        return compressed_data

    def _transcribe_audio(self, audio_data):
        """Transcribe MP3 ``audio_data`` segment by segment.

        The audio is cut into slices of ``SEGMENT_LENGTH_MS``; each slice is
        exported to a temporary MP3 file and sent to the transcription
        client. A slice whose transcription fails is logged and skipped, so
        the result may be partial. The joined text is uploaded to MinIO.

        Args:
            audio_data: MP3-encoded audio bytes.

        Returns:
            str: The slice transcriptions joined by single spaces, or
            ``"No transcription available."`` when nothing was produced.
        """
        self._log("Starting audio transcription")
        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")

        segment_length = self.SEGMENT_LENGTH_MS
        # Ceiling division: an audio length that is an exact multiple of the
        # segment length must not be reported as having one extra chunk.
        total_chunks = (len(audio) + segment_length - 1) // segment_length
        transcriptions = []

        for chunk_number, chunk in enumerate(audio[::segment_length], start=1):
            self._log(f'Processing chunk {chunk_number} of {total_chunks}')

            # Only reserve a unique path here; the handle is closed right away
            # because pydub writes to the path itself.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
                chunk_path = temp_audio.name

            try:
                # export() returns the open output file object; close it so
                # the descriptor is released before the file is re-opened.
                chunk.export(chunk_path, format="mp3").close()
                text = self._transcribe_chunk(chunk_path)
                if text:
                    transcriptions.append(text)
            finally:
                # Runs even when export fails, so the temp file never leaks.
                os.unlink(chunk_path)

        full_transcription = " ".join(transcriptions)

        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."

        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

    def _transcribe_chunk(self, chunk_path):
        """Send one exported audio slice to the transcription client.

        Args:
            chunk_path: Path of the MP3 file holding the slice.

        Returns:
            The transcribed text, or ``None`` when the API returned a falsy
            result or any error occurred (errors are logged, not raised).
        """
        # Best effort by design: one failing slice must not abort the whole
        # document, so every error here is logged and swallowed.
        try:
            file_size = os.path.getsize(chunk_path)
            self._log(f"Temporary audio file size: {file_size} bytes")

            with open(chunk_path, 'rb') as audio_file:
                file_start = audio_file.read(100)
                self._log(f"First 100 bytes of audio file: {file_start}")
                audio_file.seek(0)  # Reset file pointer to the beginning

                self._log("Calling transcription API")
                transcription = self.transcription_client.audio.transcriptions.create(
                    file=audio_file,
                    model=self.transcription_model,
                    language=self.document_version.language,
                    response_format='verbose_json',
                )
                self._log("Transcription API call completed")

            if not transcription:
                self._log("Warning: Received empty transcription", level='warning')
                return None

            return self._transcription_to_text(transcription)
        except Exception as e:
            self._log(f"Error during transcription: {str(e)}", level='error')
            return None

    def _transcription_to_text(self, transcription):
        """Extract plain text from a transcription result of unknown shape.

        Handles a plain string, an object exposing a ``text`` attribute, and
        falls back to ``str()`` for anything else. The first 100 characters
        are logged in every case.
        """
        if isinstance(transcription, str):
            self._log(f"Transcription result (string): {transcription[:100]}...")
            return transcription

        if hasattr(transcription, 'text'):
            self._log(
                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
            return transcription.text

        self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
        return str(transcription)

    def _generate_markdown_from_transcription(self, transcription):
        """Ask the configured LLM to rewrite the transcription as markdown.

        Reads ``'llm'`` and ``'transcript_template'`` from
        ``self.model_variables``; the template is passed through
        ``create_language_template`` with the document language and invoked
        with the transcription under the ``'transcript'`` key.

        Args:
            transcription: Full transcription text.

        Returns:
            tuple: ``(markdown, title)`` where ``title`` comes from
            ``_extract_title_from_markdown``.
        """
        self._log("Generating markdown from transcription")
        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {'transcript': transcription}
        markdown = chain.invoke(input_transcript)

        # Extract title from the markdown
        title = self._extract_title_from_markdown(markdown)

        return markdown, title

    def _extract_title_from_markdown(self, markdown):
        """Return the text of the first non-empty level-1 heading.

        Only lines that start with ``'# '`` are considered. A heading with
        no text after the marker is skipped, so the title is never empty.

        Args:
            markdown: Markdown document as a string.

        Returns:
            str: The heading text, or ``"Untitled Audio Transcription"``
            when no usable heading exists.
        """
        for line in markdown.split('\n'):
            if line.startswith('# '):
                title = line[2:].strip()
                if title:
                    return title
        return "Untitled Audio Transcription"