- Improved audio processing to limit CPU and memory usage (see the first sketch below)

- Removed Portkey from the stack and defined explicit monitoring using LangChain-native code (see the second sketch below)
- Optimized Business Event logging
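To keep CPU and memory usage bounded, the compression path now splits long recordings into fixed-length chunks, re-encodes each chunk through ffmpeg running under `nice -n 19`, and sleeps briefly between chunks. A minimal sketch of that pattern, assuming ffmpeg and pydub are available; the function names, chunk length, and delay below are illustrative, not the exact values used in this commit:

```python
import io
import subprocess
import time

from pydub import AudioSegment


def compress_chunk_low_priority(chunk: AudioSegment) -> bytes:
    """Re-encode one chunk with ffmpeg running at the lowest CPU priority."""
    wav_buffer = io.BytesIO()
    chunk.export(wav_buffer, format="wav")
    # 'nice -n 19' asks the scheduler to give ffmpeg the lowest CPU priority,
    # so compression never starves other workloads on the host.
    process = subprocess.run(
        ["nice", "-n", "19", "ffmpeg", "-i", "pipe:0",
         "-ar", "16000", "-ac", "1", "-b:a", "32k", "-f", "mp3", "pipe:1"],
        input=wav_buffer.getvalue(),
        capture_output=True,
        check=True,
    )
    return process.stdout


def compress_in_chunks(path: str, chunk_seconds: int = 300,
                       delay_seconds: float = 0.1) -> AudioSegment:
    """Compress a long recording chunk by chunk to bound memory and CPU usage."""
    audio = AudioSegment.from_file(path)
    compressed = AudioSegment.empty()
    for start_ms in range(0, len(audio), chunk_seconds * 1000):
        chunk = audio[start_ms:start_ms + chunk_seconds * 1000]
        compressed += AudioSegment.from_mp3(io.BytesIO(compress_chunk_low_priority(chunk)))
        time.sleep(delay_seconds)  # yield the CPU between chunks
    return compressed
```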
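Monitoring that previously went through the Portkey proxy is now expressed directly in application code. A minimal sketch of what a LangChain-native callback handler can look like; the handler name and logged fields are assumptions for illustration, not the handler shipped in this commit:

```python
import time

from langchain_core.callbacks import BaseCallbackHandler


class BusinessEventCallbackHandler(BaseCallbackHandler):
    """Illustrative handler that times LLM calls instead of relying on a proxy."""

    def __init__(self):
        self._started_at = None

    def on_llm_start(self, serialized, prompts, **kwargs):
        self._started_at = time.monotonic()

    def on_llm_end(self, response, **kwargs):
        if self._started_at is None:
            return
        elapsed = time.monotonic() - self._started_at
        # Replace print with the project's Business Event logging.
        print(f"LLM call finished in {elapsed:.2f}s")


# Usage (assuming a LangChain chat model is configured elsewhere):
# llm = ChatOpenAI(callbacks=[BusinessEventCallbackHandler()])
```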
Josako
2024-10-02 14:11:46 +02:00
parent 883175b8f5
commit b700cfac64
13 changed files with 450 additions and 228 deletions

View File

@@ -1,6 +1,8 @@
import io
import os
import time
import subprocess
import psutil
from pydub import AudioSegment
import tempfile
from common.extensions import minio_client
@@ -16,6 +18,11 @@ class AudioProcessor(TranscriptionProcessor):
self.transcription_client = model_variables['transcription_client']
self.transcription_model = model_variables['transcription_model']
self.ffmpeg_path = 'ffmpeg'
self.max_compression_duration = model_variables['max_compression_duration']
self.max_transcription_duration = model_variables['max_transcription_duration']
self.compression_cpu_limit = model_variables.get('compression_cpu_limit', 50) # CPU usage limit in percentage
self.compression_process_delay = model_variables.get('compression_process_delay', 0.1) # Delay between processing chunks in seconds
self.file_type = document_version.file_type
def _get_transcription(self):
file_data = minio_client.download_document_file(
@@ -26,68 +33,121 @@ class AudioProcessor(TranscriptionProcessor):
self.document_version.file_name
)
with current_event.create_span("Audio Processing"):
with current_event.create_span("Audio Compression"):
compressed_audio = self._compress_audio(file_data)
with current_event.create_span("Transcription Generation"):
with current_event.create_span("Audio Transcription"):
transcription = self._transcribe_audio(compressed_audio)
return transcription
def _compress_audio(self, audio_data):
self._log("Compressing audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_input:
temp_input.write(audio_data)
temp_input.flush()
# Use a unique filename for the output to avoid conflicts
output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
output_path = os.path.join(tempfile.gettempdir(), output_filename)
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_file:
temp_file.write(audio_data)
temp_file_path = temp_file.name
try:
result = subprocess.run(
[self.ffmpeg_path, '-y', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', output_path],
capture_output=True,
text=True,
check=True
try:
self._log("Creating AudioSegment from file")
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
self._log("Finished creating AudioSegment from file")
total_duration = len(audio_info)
self._log(f"Audio duration: {total_duration / 1000} seconds")
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
total_chunks = (total_duration + segment_length - 1) // segment_length
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log(f"Compressing segment {i + 1} of {total_chunks}")
start_time = i * segment_length
end_time = min((i + 1) * segment_length, total_duration)
chunk = AudioSegment.from_file(
temp_file_path,
format=self.document_version.file_type,
start_second=start_time / 1000,
duration=(end_time - start_time) / 1000
)
with open(output_path, 'rb') as f:
compressed_data = f.read()
compressed_chunk = self._compress_segment(chunk)
compressed_segments += compressed_chunk
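# Brief pause between chunks keeps sustained CPU usage below the configured limit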
time.sleep(self.compression_process_delay)
# Save compressed audio to MinIO
compressed_filename = f"{self.document_version.id}_compressed.mp3"
with io.BytesIO() as compressed_buffer:
compressed_segments.export(compressed_buffer, format="mp3")
compressed_buffer.seek(0)
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
compressed_filename,
compressed_data
compressed_buffer.read()
)
self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
return compressed_data
return compressed_segments
except subprocess.CalledProcessError as e:
error_message = f"Compression failed: {e.stderr}"
self._log(error_message, level='error')
raise Exception(error_message)
except Exception as e:
self._log(f"Error during audio processing: {str(e)}", level='error')
raise
finally:
os.unlink(temp_file_path) # Ensure the temporary file is deleted
finally:
# Clean up temporary files
os.unlink(temp_input.name)
if os.path.exists(output_path):
os.unlink(output_path)
def _compress_segment(self, audio_segment):
with io.BytesIO() as segment_buffer:
audio_segment.export(segment_buffer, format="wav")
segment_buffer.seek(0)
with io.BytesIO() as output_buffer:
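# Run ffmpeg at the lowest CPU priority ('nice -n 19') so compression does not starve other processes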
command = [
'nice', '-n', '19',
'ffmpeg',
'-i', 'pipe:0',
'-ar', '16000',
'-ac', '1',
'-b:a', '32k',
'-filter:a', 'loudnorm',
'-f', 'mp3',
'pipe:1'
]
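# psutil.Popen behaves like subprocess.Popen while also exposing CPU/memory inspection methods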
process = psutil.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate(input=segment_buffer.read())
if process.returncode != 0:
self._log(f"FFmpeg error: {stderr.decode()}", level='error')
raise Exception("FFmpeg compression failed")
output_buffer.write(stdout)
output_buffer.seek(0)
compressed_segment = AudioSegment.from_mp3(output_buffer)
return compressed_segment
def _transcribe_audio(self, audio_data):
self._log("Starting audio transcription")
audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
# audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
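# _compress_audio now returns an AudioSegment, so no re-decoding from mp3 bytes is needed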
audio = audio_data
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
segment_length = self.max_transcription_duration * 1000  # Convert seconds to milliseconds
transcriptions = []
total_chunks = len(audio) // segment_length + 1
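# Slicing an AudioSegment with a step yields fixed-length chunks of segment_length milliseconds each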
for i, chunk in enumerate(audio[::segment_length]):
self._log(f'Processing chunk {i + 1} of {len(audio) // segment_length + 1}')
self._log(f'Processing chunk {i + 1} of {total_chunks}')
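# The final chunk is usually shorter, so pass its real duration to the transcription call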
segment_duration = 0
if i == total_chunks - 1:
segment_duration = (len(audio) % segment_length) // 1000
else:
segment_duration = self.max_transcription_duration
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
chunk.export(temp_audio.name, format="mp3")
@@ -103,11 +163,12 @@ class AudioProcessor(TranscriptionProcessor):
audio_file.seek(0) # Reset file pointer to the beginning
self._log("Calling transcription API")
transcription = self.transcription_client.audio.transcriptions.create(
transcription = self.model_variables.transcribe(
file=audio_file,
model=self.transcription_model,
language=self.document_version.language,
response_format='verbose_json',
duration=segment_duration,
)
self._log("Transcription API call completed")

View File

@@ -171,10 +171,12 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
model_variables['max_chunk_size'])
# Enrich chunks
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
with current_event.create_span("Enrich Chunks"):
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
# Create embeddings
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
with current_event.create_span("Create Embeddings"):
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
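# Wrapping enrichment and embedding in separate spans gives per-stage timings in the Business Event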
# Update document version and save embeddings
try:
@@ -194,7 +196,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_event.log("Starting Enriching Chunks Processing")
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
@@ -227,7 +228,6 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
current_event.log("Finished Enriching Chunks Processing")
return enriched_chunks
@@ -261,7 +261,6 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
current_event.log("Starting Embedding Chunks Processing")
current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
embedding_model = model_variables['embedding_model']