- Improve annotation algorithm for YouTube (and others)

- Patch pytube
- Improve OS-level deletion and writing of files (see the sketch after the commit metadata below)
- Start working on Claude
- Improve template management
This commit is contained in:
Josako
2024-07-16 14:21:49 +02:00
parent db44fd3b66
commit 908a2eaf7e
39 changed files with 6427 additions and 324 deletions
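Two of the helpers referenced by this change, safe_remove and sync_folder from common.utils.os_utils, are imported in the diff below but implemented in another of the 39 changed files, so their code is not visible here. A minimal sketch of what such helpers might look like, assuming safe_remove is a delete that tolerates a missing file and sync_folder flushes directory metadata so removals are durable before re-processing starts (only the two names and call shapes come from the diff; the bodies are assumptions):

import os

def safe_remove(path):
    # Assumed behaviour: delete the file if it exists, ignore it otherwise.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

def sync_folder(path):
    # Assumed behaviour: fsync the directory so deletions and creations hit disk
    # before the pipeline starts writing new artifacts into it.
    dir_fd = os.open(path, os.O_RDONLY)
    try:
        os.fsync(dir_fd)
    finally:
        os.close(dir_fd)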


@@ -1,5 +1,7 @@
import os
from datetime import datetime as dt, timezone as tz
import subprocess
import gevent
from bs4 import BeautifulSoup
@@ -16,6 +18,8 @@ from langchain_core.runnables import RunnablePassthrough
from sqlalchemy.exc import SQLAlchemyError
from pytube import YouTube
import PyPDF2
from pydub import AudioSegment
import tempfile
from common.extensions import db
from common.models.document import DocumentVersion, Embedding
@@ -23,6 +27,7 @@ from common.models.user import Tenant
from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
@current_celery.task(name='create_embeddings', queue='embeddings')
@@ -193,7 +198,7 @@ def process_html(tenant, model_variables, document_version):
create_embeddings.update_state(state=states.FAILURE)
raise
extracted_html, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
excluded_elements=html_excluded_elements)
extracted_file_name = f'{document_version.id}-extracted.html'
output_file = os.path.join(base_path, extracted_file_name)
@@ -345,7 +350,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
return new_embeddings
def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
@@ -354,18 +359,22 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
else:
elements_to_parse = [soup] # parse the entire document if no included_elements specified
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
current_app.embed_tuning_logger.debug(f'Number of included elements: {len(included_elements) if included_elements else 0}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0] if elements_to_parse else None}')
# Iterate through the found included elements
for element in elements_to_parse:
# Find all specified tags within each included element
for sub_element in element.find_all(tags):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
if excluded_elements and sub_element.find_parent(excluded_elements):
continue # Skip this sub_element if it's within any of the excluded_elements
sub_content = html.unescape(sub_element.get_text(strip=False))
extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
title = soup.find('title').get_text(strip=True)
@@ -381,11 +390,20 @@ def process_youtube(tenant, model_variables, document_version):
transcription_file_name = f'{document_version.id}.txt'
markdown_file_name = f'{document_version.id}.md'
# Remove any existing files (in case the document is being re-processed)
safe_remove(os.path.join(base_path, download_file_name))
safe_remove(os.path.join(base_path, compressed_file_name))
safe_remove(os.path.join(base_path, transcription_file_name))
safe_remove(os.path.join(base_path, markdown_file_name))
sync_folder(base_path)
of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
compress_audio(base_path, download_file_name, compressed_file_name, tenant)
transcribe_audio(base_path, compressed_file_name, transcription_file_name, document_version.language, tenant, model_variables)
annotate_transcription(base_path, transcription_file_name, markdown_file_name, tenant, model_variables)
transcribe_audio(base_path, compressed_file_name, transcription_file_name,
document_version.language, tenant, model_variables)
annotate_transcription(base_path, transcription_file_name, markdown_file_name,
document_version.language, tenant, model_variables)
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -426,15 +444,41 @@ def download_youtube(url, file_location, file_name, tenant):
def compress_audio(file_location, input_file, output_file, tenant):
try:
current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
result = os.popen(f'scripts/compress.sh -d {file_location} -i {input_file} -o {output_file}')
# Run the compression script
result = subprocess.run(
['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
capture_output=True,
text=True
)
if result.returncode != 0:
raise Exception(f"Compression failed: {result.stderr}")
output_file_path = os.path.join(file_location, output_file)
count = 0
while not os.path.exists(output_file_path) and count < 10:
gevent.sleep(1)
current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {count}')
count += 1
# Additional check for file stability
previous_size = -1
stable_count = 0
max_attempts = 12 # 1 minute total wait time
for _ in range(max_attempts):
if os.path.exists(output_file_path):
current_size = os.path.getsize(output_file_path)
if current_size == previous_size:
stable_count += 1
if stable_count >= 3: # File size hasn't changed for 3 checks
break
else:
stable_count = 0
previous_size = current_size
gevent.sleep(5)
if stable_count < 3:
raise Exception("File size did not stabilize within the expected time")
current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
return result
return output_file_path
except Exception as e:
current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
raise
@@ -448,22 +492,47 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
input_file_path = os.path.join(file_location, input_file)
output_file_path = os.path.join(file_location, output_file)
# Wait for the input file to exist
count = 0
while not os.path.exists(input_file_path) and count < 10:
gevent.sleep(1)
current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
count += 1
with open(input_file_path, 'rb') as audio_file:
transcription = client.audio.transcriptions.create(
file=audio_file,
model=model,
language=language,
response_format='verbose_json',
)
if not os.path.exists(input_file_path):
raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
with open(output_file_path, 'w') as transcript_file:
transcript_file.write(transcription.text)
# Load the audio file
audio = AudioSegment.from_file(input_file_path)
# Define segment length (e.g., 10 minutes)
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
transcriptions = []
# Split audio into segments and transcribe each
for i, chunk in enumerate(audio[::segment_length]):
current_app.logger.debug(f'Transcribing chunk {i + 1} of {-(-len(audio) // segment_length)}')  # 1-based count, ceil division
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
chunk.export(temp_audio.name, format="mp3")
with open(temp_audio.name, 'rb') as audio_segment:
transcription = client.audio.transcriptions.create(
file=audio_segment,
model=model,
language=language,
response_format='verbose_json',
)
transcriptions.append(transcription.text)
os.unlink(temp_audio.name) # Delete the temporary file
# Combine all transcriptions
full_transcription = " ".join(transcriptions)
# Write the full transcription to the output file
with open(output_file_path, 'w') as f:
f.write(full_transcription)
current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
except Exception as e:
@@ -472,23 +541,67 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
raise
def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
try:
current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
llm = model_variables['llm']
char_splitter = CharacterTextSplitter(separator='.',
chunk_size=model_variables['annotation_chunk_length'],
chunk_overlap=0)
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
# ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
llm = model_variables['llm']
template = model_variables['transcript_template']
transcript_prompt = ChatPromptTemplate.from_template(template)
language_template = create_language_template(template, language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
transcript = ''
with open(os.path.join(file_location, input_file), 'r') as f:
transcript = f.read()
chain = setup | transcript_prompt | llm | output_parser
input_transcript = {"transcript": transcript}
annotated_transcript = chain.invoke(input_transcript)
chunks = char_splitter.split_text(transcript)
all_markdown_chunks = []
last_markdown_chunk = ''
for i, chunk in enumerate(chunks):
current_app.logger.debug(f'Annotating chunk {i + 1} of {len(chunks)} for tenant {tenant.id}')
full_input = last_markdown_chunk + '\n' + chunk
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
f'------------------\n'
f'{full_input}\n'
f'------------------\n')
input_transcript = {'transcript': full_input}
markdown = chain.invoke(input_transcript)
# GPT-4o sometimes wraps the answer in a fenced block (```markdown <text> ```); strip the fence lines
if markdown.startswith("```markdown"):
markdown = "\n".join(markdown.strip().split("\n")[1:-1])
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
f'------------------\n'
f'{markdown}\n'
f'------------------\n')
md_header_splits = markdown_splitter.split_text(markdown)
markdown_chunks = [doc.page_content for doc in md_header_splits]
# claude-3.5-sonnet sometimes returns introductory text before the first header; drop it
if not markdown_chunks[0].startswith('#'):
markdown_chunks.pop(0)
last_markdown_chunk = markdown_chunks[-1]
last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
markdown_chunks.pop()
all_markdown_chunks += markdown_chunks
all_markdown_chunks += [last_markdown_chunk]
annotated_transcript = '\n'.join(all_markdown_chunks)
with open(os.path.join(file_location, output_file), 'w') as f:
f.write(annotated_transcript)
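create_language_template, imported from common.utils.model_utils and used above to turn the transcript template into a language-aware prompt, is defined in a file that is not part of this excerpt. A minimal sketch of what it might do, assuming it simply appends a language instruction to the template (the name and the (template, language) call shape come from the diff; the instruction text is an assumption):

def create_language_template(template: str, language: str) -> str:
    # Assumed behaviour: extend the prompt so the model annotates the
    # transcript in the language of the source audio.
    return f'{template}\n\nWrite all headers and annotations in {language}.'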
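The commit message mentions starting work on Claude, and the loop above already special-cases claude-3.5-sonnet output, but the model selection itself happens in select_model_variables, outside this excerpt. A hypothetical sketch of how model_variables['llm'] could be switched between OpenAI and Anthropic chat models with LangChain (the model names, selection rule, and helper name are assumptions, not taken from this commit):

from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic

def build_llm(model_name: str, temperature: float = 0.0):
    # Hypothetical helper: choose the chat model class from the configured name.
    if model_name.startswith('claude'):
        return ChatAnthropic(model=model_name, temperature=temperature)
    return ChatOpenAI(model=model_name, temperature=temperature)

# e.g. model_variables['llm'] = build_llm('claude-3-5-sonnet-20240620')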