Start log tracing to log business events. Events are stored in both the database and the logging backend.

Josako
2024-09-25 15:39:25 +02:00
parent a740c96630
commit ee1b0f1cfa
8 changed files with 370 additions and 321 deletions


@@ -24,6 +24,9 @@ from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
from common.utils.business_event import BusinessEvent
from common.utils.business_event_context import current_event
# Healthcheck task
@current_celery.task(name='ping', queue='embeddings')
@@ -33,76 +36,80 @@ def ping():
@current_celery.task(name='create_embeddings', queue='embeddings')
def create_embeddings(tenant_id, document_version_id):
    # BusinessEvent creates a context, which is why we need to use it with a with block
    with BusinessEvent('Create Embeddings', tenant_id, document_version_id=document_version_id):
        current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}')
        current_event.log("Starting Embedding Creation Task")

        try:
            # Retrieve Tenant for which we are processing
            tenant = Tenant.query.get(tenant_id)
            if tenant is None:
                raise Exception(f'Tenant {tenant_id} not found')

            # Ensure we are working in the correct database schema
            Database(tenant_id).switch_schema()

            # Select variables to work with depending on tenant and model
            model_variables = select_model_variables(tenant)
            current_app.logger.debug(f'Model variables: {model_variables}')

            # Retrieve document version to process
            document_version = DocumentVersion.query.get(document_version_id)
            if document_version is None:
                raise Exception(f'Document version {document_version_id} not found')
        except Exception as e:
            current_app.logger.error(f'Create Embeddings request received '
                                     f'for non existing document version {document_version_id} '
                                     f'for tenant {tenant_id}, '
                                     f'error: {e}')
            raise

        try:
            db.session.add(document_version)

            # start processing
            document_version.processing = True
            document_version.processing_started_at = dt.now(tz.utc)
            document_version.processing_finished_at = None
            document_version.processing_error = None

            db.session.commit()
        except SQLAlchemyError as e:
            current_app.logger.error(f'Unable to save Embedding status information '
                                     f'in document version {document_version_id} '
                                     f'for tenant {tenant_id}')
            raise

        delete_embeddings_for_document_version(document_version)

        try:
            match document_version.file_type:
                case 'pdf':
                    process_pdf(tenant, model_variables, document_version)
                case 'html':
                    process_html(tenant, model_variables, document_version)
                case 'srt':
                    process_srt(tenant, model_variables, document_version)
                case 'mp4' | 'mp3' | 'ogg':
                    process_audio(tenant, model_variables, document_version)
                case _:
                    raise Exception(f'No functionality defined for file type {document_version.file_type} '
                                    f'for tenant {tenant_id} '
                                    f'while creating embeddings for document version {document_version_id}')
            current_event.log("Finished Embedding Creation Task")
        except Exception as e:
            current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
                                     f'on document version {document_version_id} '
                                     f'error: {e}')
            document_version.processing = False
            document_version.processing_finished_at = dt.now(tz.utc)
            document_version.processing_error = str(e)[:255]
            db.session.commit()
            create_embeddings.update_state(state=states.FAILURE)
            raise
def delete_embeddings_for_document_version(document_version):
@@ -118,38 +125,48 @@ def delete_embeddings_for_document_version(document_version):
def process_pdf(tenant, model_variables, document_version):
    current_event.log("Starting PDF Processing")
    processor = PDFProcessor(tenant, model_variables, document_version)
    markdown, title = processor.process()

    # Process markdown and embed
    embed_markdown(tenant, model_variables, document_version, markdown, title)
    current_event.log("Finished PDF Processing")


def process_html(tenant, model_variables, document_version):
    with current_event.create_span("HTML Processing"):
        processor = HTMLProcessor(tenant, model_variables, document_version)
        markdown, title = processor.process()

    # Process markdown and embed
    with current_event.create_span("Embedding"):
        embed_markdown(tenant, model_variables, document_version, markdown, title)


def process_audio(tenant, model_variables, document_version):
    current_event.log("Starting Audio Processing")
    processor = AudioProcessor(tenant, model_variables, document_version)
    markdown, title = processor.process()

    # Process markdown and embed
    embed_markdown(tenant, model_variables, document_version, markdown, title)
    current_event.log("Finished Audio Processing")


def process_srt(tenant, model_variables, document_version):
    current_event.log("Starting SRT Processing")
    processor = SRTProcessor(tenant, model_variables, document_version)
    markdown, title = processor.process()

    # Process markdown and embed
    embed_markdown(tenant, model_variables, document_version, markdown, title)
    current_event.log("Finished SRT Processing")


def embed_markdown(tenant, model_variables, document_version, markdown, title):
    current_event.log("Starting Embedding Markdown Processing")

    # Create potential chunks
    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
@@ -178,9 +195,11 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
                            f'on document version {document_version.id} :-)')
    current_event.log("Finished Embedding Markdown Processing")


def enrich_chunks(tenant, model_variables, document_version, title, chunks):
    current_event.log("Starting Enriching Chunks Processing")
    current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
                             f'on document version {document_version.id}')
@@ -213,11 +232,13 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
    current_app.logger.debug(f'Finished enriching chunks for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    current_event.log("Finished Enriching Chunks Processing")

    return enriched_chunks


def summarize_chunk(tenant, model_variables, document_version, chunk):
    current_event.log("Starting Summarizing Chunk Processing")
    current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    llm = model_variables['llm']
@@ -235,6 +256,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
        summary = chain.invoke({"text": chunk})
        current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
                                 f'on document version {document_version.id}.')
        current_event.log("Finished Summarizing Chunk Processing")
        return summary
    except LangChainException as e:
        current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant.id} '
@@ -244,6 +266,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
def embed_chunks(tenant, model_variables, document_version, chunks):
    current_event.log("Starting Embedding Chunks Processing")
    current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    embedding_model = model_variables['embedding_model']
@@ -268,6 +291,8 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
        new_embedding.embedding = embedding
        new_embeddings.append(new_embedding)

    current_app.logger.debug(f'Finished embedding chunks for tenant {tenant.id} ')
    current_event.log("Finished Embedding Chunks Processing")
    return new_embeddings
@@ -281,244 +306,6 @@ def log_parsing_info(tenant, tags, included_elements, excluded_elements, exclude
    current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
# def process_youtube(tenant, model_variables, document_version):
#     download_file_name = f'{document_version.id}.mp4'
#     compressed_file_name = f'{document_version.id}.mp3'
#     transcription_file_name = f'{document_version.id}.txt'
#     markdown_file_name = f'{document_version.id}.md'
#
#     # Remove existing files (in case of a re-processing of the file)
#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
#                                       document_version.id, download_file_name)
#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
#                                       document_version.id, compressed_file_name)
#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
#                                       document_version.id, transcription_file_name)
#     minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
#                                       document_version.id, markdown_file_name)
#
#     of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
#                                                       download_file_name)
#     document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
#     compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
#     transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
#     annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
#
#     potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
#     actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
#                                                 model_variables['max_chunk_size'])
#
#     enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
#     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
#
#     try:
#         db.session.add(document_version)
#         document_version.processing_finished_at = dt.now(tz.utc)
#         document_version.processing = False
#         db.session.add_all(embeddings)
#         db.session.commit()
#     except SQLAlchemyError as e:
#         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
#                                  f'on Youtube document version {document_version.id}'
#                                  f'error: {e}')
#         raise
#
#     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
#                             f'on Youtube document version {document_version.id} :-)')
#
#
# def download_youtube(url, tenant_id, document_version, file_name):
#     try:
#         current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
#         yt = YouTube(url)
#         stream = yt.streams.get_audio_only()
#
#         with tempfile.NamedTemporaryFile(delete=False) as temp_file:
#             stream.download(output_path=temp_file.name)
#             with open(temp_file.name, 'rb') as f:
#                 file_data = f.read()
#
#         minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
#                                           document_version.id,
#                                           file_name, file_data)
#
#         current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
#         return file_name, yt.title, yt.description, yt.author
#     except Exception as e:
#         current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
#         raise
#
#
# def compress_audio(tenant_id, document_version, input_file, output_file):
#     try:
#         current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
#
#         input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
#                                                          document_version.id, input_file)
#
#         with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
#             temp_input.write(input_data)
#             temp_input.flush()
#
#             with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
#                 result = subprocess.run(
#                     ['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
#                     capture_output=True,
#                     text=True
#                 )
#
#                 if result.returncode != 0:
#                     raise Exception(f"Compression failed: {result.stderr}")
#
#                 with open(temp_output.name, 'rb') as f:
#                     compressed_data = f.read()
#
#         minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
#                                           document_version.id,
#                                           output_file, compressed_data)
#
#         current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
#     except Exception as e:
#         current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
#         raise
#
#
# def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
#     try:
#         current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
#         client = model_variables['transcription_client']
#         model = model_variables['transcription_model']
#
#         # Download the audio file from MinIO
#         audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
#                                                          document_version.id, input_file)
#
#         # Load the audio data into pydub
#         audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
#
#         # Define segment length (e.g., 10 minutes)
#         segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
#
#         transcriptions = []
#
#         # Split audio into segments and transcribe each
#         for i, chunk in enumerate(audio[::segment_length]):
#             current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
#
#             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
#                 chunk.export(temp_audio.name, format="mp3")
#
#                 with open(temp_audio.name, 'rb') as audio_segment:
#                     transcription = client.audio.transcriptions.create(
#                         file=audio_segment,
#                         model=model,
#                         language=document_version.language,
#                         response_format='verbose_json',
#                     )
#
#                 transcriptions.append(transcription.text)
#
#             os.unlink(temp_audio.name)  # Delete the temporary file
#
#         # Combine all transcriptions
#         full_transcription = " ".join(transcriptions)
#
#         # Upload the full transcription to MinIO
#         minio_client.upload_document_file(
#             tenant_id,
#             document_version.doc_id,
#             document_version.language,
#             document_version.id,
#             output_file,
#             full_transcription.encode('utf-8')
#         )
#
#         current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
#     except Exception as e:
#         current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
#         raise
#
#
# def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
#     try:
#         current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
#
#         char_splitter = CharacterTextSplitter(separator='.',
#                                               chunk_size=model_variables['annotation_chunk_length'],
#                                               chunk_overlap=0)
#
#         headers_to_split_on = [
#             ("#", "Header 1"),
#             ("##", "Header 2"),
#         ]
#         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
#
#         llm = model_variables['llm']
#         template = model_variables['transcript_template']
#         language_template = create_language_template(template, document_version.language)
#         transcript_prompt = ChatPromptTemplate.from_template(language_template)
#         setup = RunnablePassthrough()
#         output_parser = StrOutputParser()
#
#         # Download the transcription file from MinIO
#         transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
#                                                               document_version.language, document_version.id,
#                                                               input_file)
#         transcript = transcript_data.decode('utf-8')
#
#         chain = setup | transcript_prompt | llm | output_parser
#
#         chunks = char_splitter.split_text(transcript)
#         all_markdown_chunks = []
#         last_markdown_chunk = ''
#         for chunk in chunks:
#             current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
#             full_input = last_markdown_chunk + '\n' + chunk
#             if tenant.embed_tuning:
#                 current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
#                                                       f'------------------\n'
#                                                       f'{full_input}\n'
#                                                       f'------------------\n')
#             input_transcript = {'transcript': full_input}
#             markdown = chain.invoke(input_transcript)
#             # GPT-4o returns some kind of content description: ```markdown <text> ```
#             if markdown.startswith("```markdown"):
#                 markdown = "\n".join(markdown.strip().split("\n")[1:-1])
#             if tenant.embed_tuning:
#                 current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
#                                                       f'------------------\n'
#                                                       f'{markdown}\n'
#                                                       f'------------------\n')
#             md_header_splits = markdown_splitter.split_text(markdown)
#             markdown_chunks = [doc.page_content for doc in md_header_splits]
#             # claude-3.5-sonnet returns introductory text
#             if not markdown_chunks[0].startswith('#'):
#                 markdown_chunks.pop(0)
#             last_markdown_chunk = markdown_chunks[-1]
#             last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
#             markdown_chunks.pop()
#             all_markdown_chunks += markdown_chunks
#
#         all_markdown_chunks += [last_markdown_chunk]
#
#         annotated_transcript = '\n'.join(all_markdown_chunks)
#
#         # Upload the annotated transcript to MinIO
#         minio_client.upload_document_file(
#             tenant.id,
#             document_version.doc_id,
#             document_version.language,
#             document_version.id,
#             output_file,
#             annotated_transcript.encode('utf-8')
#         )
#
#         current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
#     except Exception as e:
#         current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
#         raise
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
    try:
        current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')