- Improvements to enable deployment in the cloud, mainly switching document file access from the local filesystem to MinIO

- Improvements to RAG logging, plus some debugging in that area
2024-08-01 17:35:54 +02:00
parent 88ca04136d
commit 64cf8df3a9
19 changed files with 617 additions and 206 deletions
--- a/eveai_workers/init.py
+++ b/eveai_workers/init.py
@@ -3,7 +3,7 @@ import logging.config
 from flask import Flask

 from common.utils.celery_utils import make_celery, init_celery
-from common.extensions import db
+from common.extensions import db, minio_client
 from config.logging_config import LOGGING


@@ -33,6 +33,7 @@ def create_app(config_file=None):

 def register_extensions(app):
    db.init_app(app)
+    minio_client.init_app(app)


 app, celery = create_app()
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -1,3 +1,4 @@
+import io
 import os
 from datetime import datetime as dt, timezone as tz
 import subprocess
@@ -21,7 +22,7 @@ import PyPDF2
 from pydub import AudioSegment
 import tempfile

-from common.extensions import db
+from common.extensions import db, minio_client
 from common.models.document import DocumentVersion, Embedding
 from common.models.user import Tenant
 from common.utils.celery_utils import current_celery
@@ -32,11 +33,6 @@ from common.utils.os_utils import safe_remove, sync_folder

@current_celery.task(name='create_embeddings', queue='embeddings')
 def create_embeddings(tenant_id, document_version_id):
-    # Setup Remote Debugging only if PYCHARM_DEBUG=True
-    if current_app.config['PYCHARM_DEBUG']:
-        import pydevd_pycharm
-        pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
-
    current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')

    try:
@@ -50,6 +46,7 @@ def create_embeddings(tenant_id, document_version_id):

        # Select variables to work with depending on tenant and model
        model_variables = select_model_variables(tenant)
+        current_app.logger.debug(f'Model variables: {model_variables}')

        # Retrieve document version to process
        document_version = DocumentVersion.query.get(document_version_id)
@@ -107,33 +104,20 @@ def create_embeddings(tenant_id, document_version_id):


 def process_pdf(tenant, model_variables, document_version):
-    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
-                             document_version.file_location)
-    file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
-                             document_version.file_location,
-                             document_version.file_name)
-    if os.path.exists(file_path):
-        pdf_text = ''
-        # Function to extract text from PDF and return as string
-        with open(file_path, 'rb') as file:
-            reader = PyPDF2.PdfReader(file)
-            for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                pdf_text += page.extract_text()
-    else:
-        current_app.logger.error(f'The physical file for document version {document_version.id} '
-                                 f'for tenant {tenant.id} '
-                                 f'at {file_path} does not exist')
-        create_embeddings.update_state(state=states.FAILURE)
-        raise
+    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                       document_version.id, document_version.file_name)
+
+    pdf_text = ''
+    pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
+    for page in pdf_reader.pages:
+        pdf_text += page.extract_text()

    markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
    markdown_file_name = f'{document_version.id}.md'
-    output_file = os.path.join(base_path, markdown_file_name)
-    with open(output_file, 'w') as f:
-        f.write(markdown)
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
+                         markdown_file_name, markdown.encode())

-    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                         model_variables['max_chunk_size'])

@@ -175,43 +159,29 @@ def delete_embeddings_for_document_version(document_version):


 def process_html(tenant, model_variables, document_version):
+    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                       document_version.id, document_version.file_name)
+    html_content = file_data.decode('utf-8')
+
    # The tags to be considered can be dependent on the tenant
    html_tags = model_variables['html_tags']
    html_end_tags = model_variables['html_end_tags']
    html_included_elements = model_variables['html_included_elements']
    html_excluded_elements = model_variables['html_excluded_elements']

-    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
-                             document_version.file_location)
-
-    file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
-                             document_version.file_location,
-                             document_version.file_name)
-
-    if os.path.exists(file_path):
-        with open(file_path, 'rb') as f:
-            html_content = f.read()
-    else:
-        current_app.logger.error(f'The physical file for document version {document_version.id} '
-                                 f'for tenant {tenant.id} '
-                                 f'at {file_path} does not exist')
-        create_embeddings.update_state(state=states.FAILURE)
-        raise
-
    extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
                                       excluded_elements=html_excluded_elements)
+
    extracted_file_name = f'{document_version.id}-extracted.html'
-    output_file = os.path.join(base_path, extracted_file_name)
-    with open(output_file, 'w') as f:
-        f.write(extracted_html)
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
+                         extracted_file_name, extracted_html.encode())

    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
    markdown_file_name = f'{document_version.id}.md'
-    output_file = os.path.join(base_path, markdown_file_name)
-    with open(output_file, 'w') as f:
-        f.write(markdown)
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
+                         markdown_file_name, markdown.encode())

-    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                         model_variables['max_chunk_size'])

@@ -222,7 +192,7 @@ def process_html(tenant, model_variables, document_version):
    else:
        document_version.system_context = (f'Title: {title}\n')

-    enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+    enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

    try:
@@ -241,16 +211,17 @@ def process_html(tenant, model_variables, document_version):
                            f'on document version {document_version.id} :-)')


-def enrich_chunks(tenant, document_version, chunks):
+def enrich_chunks(tenant, document_version, title, chunks):
    current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
    chunk_total_context = (f'Filename: {document_version.file_name}\n'
-                           f'User Context:{document_version.user_context}\n'
+                           f'User Context:\n{document_version.user_context}\n\n'
                           f'{document_version.system_context}\n\n')
    enriched_chunks = []
    initial_chunk = (f'Filename: {document_version.file_name}\n'
                     f'User Context:\n{document_version.user_context}\n\n'
+                     f'Title: {title}\n'
                     f'{chunks[0]}')

    enriched_chunks.append(initial_chunk)
@@ -311,7 +282,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
    text_to_summarize = doc_creator.create_documents(chunk)

    try:
-        summary = chain.run(text_to_summarize)
+        summary = chain.invoke({"text": text_to_summarize})
        current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
                                 f'on document version {document_version.id}.')
        return summary
@@ -391,23 +362,26 @@ def process_youtube(tenant, model_variables, document_version):
    markdown_file_name = f'{document_version.id}.md'

    # Remove existing files (in case of a re-processing of the file
-    safe_remove(os.path.join(base_path, download_file_name))
-    safe_remove(os.path.join(base_path, compressed_file_name))
-    safe_remove(os.path.join(base_path, transcription_file_name))
-    safe_remove(os.path.join(base_path, markdown_file_name))
-    sync_folder(base_path)
+    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id, download_file_name)
+    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id, compressed_file_name)
+    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id, transcription_file_name)
+    minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id, markdown_file_name)

-    of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
+    of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
+                                                      download_file_name)
    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
-    compress_audio(base_path, download_file_name, compressed_file_name, tenant)
-    transcribe_audio(base_path, compressed_file_name, transcription_file_name,
-                     document_version.language, tenant, model_variables)
-    annotate_transcription(base_path, transcription_file_name, markdown_file_name,
-                           document_version.language, tenant, model_variables)
+    compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
+    transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
+    annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)

-    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                model_variables['max_chunk_size'])
+
    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

@@ -427,83 +401,72 @@ def process_youtube(tenant, model_variables, document_version):
                            f'on Youtube document version {document_version.id} :-)')


-def download_youtube(url, file_location, file_name, tenant):
+def download_youtube(url, tenant_id, document_version, file_name):
    try:
-        current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
+        current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
        yt = YouTube(url)
        stream = yt.streams.get_audio_only()
-        output_file = stream.download(output_path=file_location, filename=file_name)
-        current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
-        return output_file, yt.title, yt.description, yt.author
+
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            stream.download(output_path=temp_file.name)
+            with open(temp_file.name, 'rb') as f:
+                file_data = f.read()
+
+        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
+                             file_name, file_data)
+
+        current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
+        return file_name, yt.title, yt.description, yt.author
    except Exception as e:
-        current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
-                                 f'tenant: {tenant.id} with error: {e}')
+        current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
        raise


-def compress_audio(file_location, input_file, output_file, tenant):
+def compress_audio(tenant_id, document_version, input_file, output_file):
    try:
-        current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
+        current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')

-        # Run the compression script
-        result = subprocess.run(
-            ['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
-            capture_output=True,
-            text=True
-        )
+        input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                            document_version.id, input_file)

-        if result.returncode != 0:
-            raise Exception(f"Compression failed: {result.stderr}")
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
+            temp_input.write(input_data)
+            temp_input.flush()

-        output_file_path = os.path.join(file_location, output_file)
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
+                result = subprocess.run(
+                    ['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
+                    capture_output=True,
+                    text=True
+                )

-        # Additional check for file stability
-        previous_size = -1
-        stable_count = 0
-        max_attempts = 12  # 1 minute total wait time
+                if result.returncode != 0:
+                    raise Exception(f"Compression failed: {result.stderr}")

-        for _ in range(max_attempts):
-            if os.path.exists(output_file_path):
-                current_size = os.path.getsize(output_file_path)
-                if current_size == previous_size:
-                    stable_count += 1
-                    if stable_count >= 3:  # File size hasn't changed for 3 checks
-                        break
-                else:
-                    stable_count = 0
-                previous_size = current_size
-            gevent.sleep(5)
+                with open(temp_output.name, 'rb') as f:
+                    compressed_data = f.read()

-        if stable_count < 3:
-            raise Exception("File size did not stabilize within the expected time")
+        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
+                             output_file, compressed_data)

-        current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
-        return output_file_path
+        current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
    except Exception as e:
-        current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
+        current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
        raise


-def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
+def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
    try:
-        current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
+        current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
        client = model_variables['transcription_client']
        model = model_variables['transcription_model']
-        input_file_path = os.path.join(file_location, input_file)
-        output_file_path = os.path.join(file_location, output_file)

-        # Wait for the input file to exist
-        count = 0
-        while not os.path.exists(input_file_path) and count < 10:
-            gevent.sleep(1)
-            current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
-            count += 1
+        # Download the audio file from MinIO
+        audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                            document_version.id, input_file)

-        if not os.path.exists(input_file_path):
-            raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
-
-        # Load the audio file
-        audio = AudioSegment.from_file(input_file_path)
+        # Load the audio data into pydub
+        audio = AudioSegment.from_mp3(io.BytesIO(audio_data))

        # Define segment length (e.g., 10 minutes)
        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
@@ -512,14 +475,16 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m

        # Split audio into segments and transcribe each
        for i, chunk in enumerate(audio[::segment_length]):
-            current_app.logger.debug(f'Transcribing chunk {i} of {len(audio) // segment_length} ')
+            current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
+
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
                chunk.export(temp_audio.name, format="mp3")
+
                with open(temp_audio.name, 'rb') as audio_segment:
                    transcription = client.audio.transcriptions.create(
                        file=audio_segment,
                        model=model,
-                        language=language,
+                        language=document_version.language,
                        response_format='verbose_json',
                    )

@@ -530,20 +495,25 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
        # Combine all transcriptions
        full_transcription = " ".join(transcriptions)

-        # Write the full transcription to the output file
-        with open(output_file_path, 'w') as f:
-            f.write(full_transcription)
+        # Upload the full transcription to MinIO
+        minio_client.upload_document_file(
+            tenant_id,
+            document_version.doc_id,
+            document_version.language,
+            document_version.id,
+            output_file,
+            full_transcription.encode('utf-8')
+        )

-        current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
+        current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
    except Exception as e:
-        current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
-                                 f'with error: {e}')
+        current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
        raise


-def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
+def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
    try:
-        current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
+        current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')

        char_splitter = CharacterTextSplitter(separator='.',
                                              chunk_size=model_variables['annotation_chunk_length'],
@@ -552,18 +522,21 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
-            # ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

        llm = model_variables['llm']
        template = model_variables['transcript_template']
-        language_template = create_language_template(template, language)
+        language_template = create_language_template(template, document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
-        with open(os.path.join(file_location, input_file), 'r') as f:
-            transcript = f.read()
+
+        # Download the transcription file from MinIO
+        transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
+                                                              document_version.language, document_version.id,
+                                                              input_file)
+        transcript = transcript_data.decode('utf-8')

        chain = setup | transcript_prompt | llm | output_parser

@@ -598,38 +571,53 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
            markdown_chunks.pop()
            all_markdown_chunks += markdown_chunks

-
        all_markdown_chunks += [last_markdown_chunk]

        annotated_transcript = '\n'.join(all_markdown_chunks)

-        with open(os.path.join(file_location, output_file), 'w') as f:
-            f.write(annotated_transcript)
+        # Upload the annotated transcript to MinIO
+        minio_client.upload_document_file(
+            tenant.id,
+            document_version.doc_id,
+            document_version.language,
+            document_version.id,
+            output_file,
+            annotated_transcript.encode('utf-8')
+        )

-        current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
+        current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
    except Exception as e:
-        current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
-                                 f'with error: {e}')
+        current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
        raise


-def create_potential_chunks_for_markdown(base_path, input_file, tenant):
-    current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
-    markdown = ''
-    with open(os.path.join(base_path, input_file), 'r') as f:
-        markdown = f.read()
+def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
+    try:
+        current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')

-    headers_to_split_on = [
-        ("#", "Header 1"),
-        ("##", "Header 2"),
-        # ("###", "Header 3"),
-    ]
+        # Download the markdown file from MinIO
+        markdown_data = minio_client.download_document_file(tenant_id,
+                                                            document_version.doc_id,
+                                                            document_version.language,
+                                                            document_version.id,
+                                                            input_file
+                                                            )
+        markdown = markdown_data.decode('utf-8')

-    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
-    md_header_splits = markdown_splitter.split_text(markdown)
-    potential_chunks = [doc.page_content for doc in md_header_splits]
+        headers_to_split_on = [
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+        ]

-    return potential_chunks
+        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
+        md_header_splits = markdown_splitter.split_text(markdown)
+        potential_chunks = [doc.page_content for doc in md_header_splits]
+
+        current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
+        return potential_chunks
+    except Exception as e:
+        current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
+        raise


 def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):