- Improvements to document uploads (accept files other than HTML files when entering a URL)

- Introduction of API functionality (to be continued). Deduplication of document and URL uploads between views and API. - Improvements to document processing - introduction of processor classes to streamline document inputs - Removed pure YouTube functionality, as YouTube retrieval of documents continuously changes, but added upload of SRT, MP3, OGG and MP4 files
2024-09-02 12:37:44 +02:00
parent a158655247
commit 914c265afe
21 changed files with 1425 additions and 852 deletions
--- a/eveai_workers/Processors/audio_processor.py
+++ b/eveai_workers/Processors/audio_processor.py
@@ -0,0 +1,187 @@
+import io
+import os
+from pydub import AudioSegment
+import tempfile
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from common.extensions import minio_client
+from common.utils.model_utils import create_language_template
+from .processor import Processor
+import subprocess
+
+
+class AudioProcessor(Processor):
+    def __init__(self, tenant, model_variables, document_version):
+        """Initialise the processor and pull the transcription settings from ``model_variables``.
+
+        All three arguments are forwarded unchanged to the parent ``Processor``
+        initialiser. ``model_variables`` must provide the
+        ``'transcription_client'`` and ``'transcription_model'`` entries.
+        """
+        super().__init__(tenant, model_variables, document_version)
+        # Bare executable name; used as the first argv element when compressing audio.
+        self.ffmpeg_path = 'ffmpeg'
+        self.transcription_client = model_variables['transcription_client']
+        self.transcription_model = model_variables['transcription_model']
+
+
+    def process(self):
+        """Run the complete audio pipeline for this document version.
+
+        The stored file is downloaded from MinIO, compressed, transcribed and
+        converted to markdown, which is then handed to ``_save_markdown``.
+
+        Returns:
+            A ``(markdown, title)`` tuple.
+
+        Raises:
+            Exception: any failure in one of the steps is logged at error level
+                and re-raised unchanged.
+        """
+        self._log("Starting Audio processing")
+        try:
+            tenant_id = self.tenant.id
+            version = self.document_version
+            raw_audio = minio_client.download_document_file(
+                tenant_id,
+                version.doc_id,
+                version.language,
+                version.id,
+                version.file_name,
+            )
+
+            # Each stage feeds the next: bytes -> compressed bytes -> text -> markdown.
+            compressed = self._compress_audio(raw_audio)
+            transcript = self._transcribe_audio(compressed)
+            markdown, title = self._generate_markdown_from_transcription(transcript)
+
+            self._save_markdown(markdown)
+            self._log("Finished processing Audio")
+        except Exception as e:
+            self._log(f"Error processing Audio: {str(e)}", level='error')
+            raise
+        return markdown, title
+
+    def _compress_audio(self, audio_data):
+        """Re-encode the audio as 64 kbit/s MP3 with ffmpeg and store the result in MinIO.
+
+        Args:
+            audio_data: Raw bytes of the source file. The input file handed to
+                ffmpeg gets ``document_version.file_type`` as its extension.
+
+        Returns:
+            The compressed MP3 as bytes.
+
+        Raises:
+            Exception: if ffmpeg exits with a non-zero status. The message
+                contains ffmpeg's stderr and the original
+                ``CalledProcessError`` is chained as the cause.
+        """
+        self._log("Compressing audio")
+
+        # A private scratch directory holds both files and is removed as a whole
+        # on exit, even when ffmpeg or the read-back fails. This also avoids
+        # hand-built file names in the shared temp directory.
+        with tempfile.TemporaryDirectory() as work_dir:
+            input_path = os.path.join(work_dir, f'input.{self.document_version.file_type}')
+            output_path = os.path.join(work_dir, 'compressed.mp3')
+
+            # Write and close the input before ffmpeg opens it by path, so the
+            # data is fully on disk and no second handle is held on the file.
+            with open(input_path, 'wb') as temp_input:
+                temp_input.write(audio_data)
+
+            try:
+                subprocess.run(
+                    [self.ffmpeg_path, '-y', '-i', input_path, '-b:a', '64k', '-f', 'mp3', output_path],
+                    capture_output=True,
+                    text=True,
+                    # ffmpeg may echo metadata that is not valid in the locale
+                    # encoding; never let decoding its output mask the real error.
+                    errors='replace',
+                    check=True
+                )
+            except subprocess.CalledProcessError as e:
+                error_message = f"Compression failed: {e.stderr}"
+                self._log(error_message, level='error')
+                raise Exception(error_message) from e
+
+            with open(output_path, 'rb') as f:
+                compressed_data = f.read()
+
+        # Save compressed audio to MinIO
+        compressed_filename = f"{self.document_version.id}_compressed.mp3"
+        minio_client.upload_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            compressed_filename,
+            compressed_data
+        )
+        self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
+
+        return compressed_data
+
+    def _transcribe_audio(self, audio_data):
+        """Transcribe MP3 audio in 10-minute segments and store the text in MinIO.
+
+        Each segment is exported to a temporary MP3 file and sent to the
+        transcription client. A segment whose transcription call fails is logged
+        and skipped, so the result may be partial. When nothing could be
+        transcribed the text ``"No transcription available."`` is used instead.
+
+        Args:
+            audio_data: MP3-encoded audio as bytes.
+
+        Returns:
+            The transcriptions of all segments joined by single spaces.
+        """
+        self._log("Starting audio transcription")
+        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
+
+        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
+        # Ceiling division: a duration that is an exact multiple of the segment
+        # length must not be reported as having one extra chunk.
+        total_chunks = -(-len(audio) // segment_length)
+        transcriptions = []
+
+        for i, chunk in enumerate(audio[::segment_length]):
+            self._log(f'Processing chunk {i + 1} of {total_chunks}')
+
+            # Only reserve a unique path here and close the handle immediately;
+            # the file is written and re-read by path below.
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
+                temp_path = temp_audio.name
+
+            try:
+                # export() returns the open output file; close it so the handle
+                # is not leaked. An export failure still propagates to the
+                # caller, but the temporary file is now removed in that case too.
+                chunk.export(temp_path, format="mp3").close()
+
+                try:
+                    file_size = os.path.getsize(temp_path)
+                    self._log(f"Temporary audio file size: {file_size} bytes")
+
+                    with open(temp_path, 'rb') as audio_file:
+                        file_start = audio_file.read(100)
+                        self._log(f"First 100 bytes of audio file: {file_start}")
+                        audio_file.seek(0)  # Reset file pointer to the beginning
+
+                        self._log("Calling transcription API")
+                        transcription = self.transcription_client.audio.transcriptions.create(
+                            file=audio_file,
+                            model=self.transcription_model,
+                            language=self.document_version.language,
+                            response_format='verbose_json',
+                        )
+                        self._log("Transcription API call completed")
+
+                    if transcription:
+                        # Handle the transcription result based on its type
+                        if isinstance(transcription, str):
+                            self._log(f"Transcription result (string): {transcription[:100]}...")
+                            transcriptions.append(transcription)
+                        elif hasattr(transcription, 'text'):
+                            self._log(
+                                f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
+                            transcriptions.append(transcription.text)
+                        else:
+                            self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
+                            transcriptions.append(str(transcription))
+                    else:
+                        self._log("Warning: Received empty transcription", level='warning')
+
+                except Exception as e:
+                    # Best effort: a failed chunk is reported and skipped rather
+                    # than aborting the whole document.
+                    self._log(f"Error during transcription: {str(e)}", level='error')
+            finally:
+                os.unlink(temp_path)
+
+        full_transcription = " ".join(filter(None, transcriptions))
+
+        if not full_transcription:
+            self._log("Warning: No transcription was generated", level='warning')
+            full_transcription = "No transcription available."
+
+        # Save transcription to MinIO
+        transcription_filename = f"{self.document_version.id}_transcription.txt"
+        minio_client.upload_document_file(
+            self.tenant.id,
+            self.document_version.doc_id,
+            self.document_version.language,
+            self.document_version.id,
+            transcription_filename,
+            full_transcription.encode('utf-8')
+        )
+        self._log(f"Saved transcription to MinIO: {transcription_filename}")
+
+        return full_transcription
+
+    def _generate_markdown_from_transcription(self, transcription):
+        """Turn a transcription into markdown by running it through the LLM chain.
+
+        The prompt is built from the ``'transcript_template'`` entry of
+        ``model_variables``, adapted to the document language, and executed with
+        the ``'llm'`` entry.
+
+        Args:
+            transcription: Transcribed text, passed to the prompt as ``transcript``.
+
+        Returns:
+            A ``(markdown, title)`` tuple; the title is taken from the markdown.
+        """
+        self._log("Generating markdown from transcription")
+        llm = self.model_variables['llm']
+        localized_template = create_language_template(
+            self.model_variables['transcript_template'],
+            self.document_version.language,
+        )
+
+        # passthrough -> prompt -> llm -> plain-string output
+        chain = (
+            RunnablePassthrough()
+            | ChatPromptTemplate.from_template(localized_template)
+            | llm
+            | StrOutputParser()
+        )
+        markdown = chain.invoke({'transcript': transcription})
+
+        return markdown, self._extract_title_from_markdown(markdown)
+
+    def _extract_title_from_markdown(self, markdown):
+        """Return the text of the first line starting with ``'# '``.
+
+        Surrounding whitespace is stripped from the heading text. When no such
+        line exists, ``"Untitled Audio Transcription"`` is returned.
+        """
+        # Lazily yield heading texts so scanning stops at the first match.
+        headings = (
+            candidate[2:].strip()
+            for candidate in markdown.split('\n')
+            if candidate.startswith('# ')
+        )
+        return next(headings, "Untitled Audio Transcription")