Business event tracing completed for both the eveai_workers and eveai_chat_workers tasks
This commit is contained in:
@@ -7,6 +7,7 @@ from common.extensions import minio_client
|
||||
import subprocess
|
||||
|
||||
from .transcription_processor import TranscriptionProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class AudioProcessor(TranscriptionProcessor):
|
||||
@@ -24,8 +25,13 @@ class AudioProcessor(TranscriptionProcessor):
|
||||
self.document_version.id,
|
||||
self.document_version.file_name
|
||||
)
|
||||
compressed_audio = self._compress_audio(file_data)
|
||||
return self._transcribe_audio(compressed_audio)
|
||||
|
||||
with current_event.create_span("Audio Processing"):
|
||||
compressed_audio = self._compress_audio(file_data)
|
||||
with current_event.create_span("Transcription Generation"):
|
||||
transcription = self._transcribe_audio(compressed_audio)
|
||||
|
||||
return transcription
|
||||
|
||||
def _compress_audio(self, audio_data):
|
||||
self._log("Compressing audio")
|
||||
|
||||
@@ -31,8 +31,10 @@ class HTMLProcessor(Processor):
|
||||
)
|
||||
html_content = file_data.decode('utf-8')
|
||||
|
||||
extracted_html, title = self._parse_html(html_content)
|
||||
markdown = self._generate_markdown_from_html(extracted_html)
|
||||
with current_event.create_span("HTML Content Extraction"):
|
||||
extracted_html, title = self._parse_html(html_content)
|
||||
with current_event.create_span("Markdown Generation"):
|
||||
markdown = self._generate_markdown_from_html(extracted_html)
|
||||
|
||||
self._save_markdown(markdown)
|
||||
self._log("Finished processing HTML")
|
||||
|
||||
@@ -10,6 +10,7 @@ from langchain_core.runnables import RunnablePassthrough
|
||||
from common.extensions import minio_client
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .processor import Processor
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class PDFProcessor(Processor):
|
||||
@@ -32,13 +33,14 @@ class PDFProcessor(Processor):
|
||||
self.document_version.file_name
|
||||
)
|
||||
|
||||
extracted_content = self._extract_content(file_data)
|
||||
structured_content, title = self._structure_content(extracted_content)
|
||||
with current_event.create_span("PDF Extraction"):
|
||||
extracted_content = self._extract_content(file_data)
|
||||
structured_content, title = self._structure_content(extracted_content)
|
||||
|
||||
llm_chunks = self._split_content_for_llm(structured_content)
|
||||
markdown = self._process_chunks_with_llm(llm_chunks)
|
||||
|
||||
self._save_markdown(markdown)
|
||||
with current_event.create_span("Markdown Generation"):
|
||||
llm_chunks = self._split_content_for_llm(structured_content)
|
||||
markdown = self._process_chunks_with_llm(llm_chunks)
|
||||
self._save_markdown(markdown)
|
||||
self._log("Finished processing PDF")
|
||||
return markdown, title
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
# transcription_processor.py
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .processor import Processor
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .processor import Processor
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class TranscriptionProcessor(Processor):
|
||||
def __init__(self, tenant, model_variables, document_version):
|
||||
@@ -16,12 +18,14 @@ class TranscriptionProcessor(Processor):
|
||||
def process(self):
|
||||
self._log("Starting Transcription processing")
|
||||
try:
|
||||
transcription = self._get_transcription()
|
||||
chunks = self._chunk_transcription(transcription)
|
||||
markdown_chunks = self._process_chunks(chunks)
|
||||
full_markdown = self._combine_markdown_chunks(markdown_chunks)
|
||||
self._save_markdown(full_markdown)
|
||||
self._log("Finished processing Transcription")
|
||||
with current_event.create_span("Transcription Generation"):
|
||||
transcription = self._get_transcription()
|
||||
with current_event.create_span("Markdown Generation"):
|
||||
chunks = self._chunk_transcription(transcription)
|
||||
markdown_chunks = self._process_chunks(chunks)
|
||||
full_markdown = self._combine_markdown_chunks(markdown_chunks)
|
||||
self._save_markdown(full_markdown)
|
||||
self._log("Finished processing Transcription")
|
||||
return full_markdown, self._extract_title_from_markdown(full_markdown)
|
||||
except Exception as e:
|
||||
self._log(f"Error processing Transcription: {str(e)}", level='error')
|
||||
|
||||
@@ -39,8 +39,6 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
# BusinessEvent creates a context, which is why we need to use it with a with block
|
||||
with BusinessEvent('Create Embeddings', tenant_id, document_version_id=document_version_id):
|
||||
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}')
|
||||
current_event.log("Starting Embedding Creation Task")
|
||||
|
||||
try:
|
||||
# Retrieve Tenant for which we are processing
|
||||
tenant = Tenant.query.get(tenant_id)
|
||||
@@ -125,13 +123,13 @@ def delete_embeddings_for_document_version(document_version):
|
||||
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
|
||||
current_event.log("Starting PDF Processing")
|
||||
processor = PDFProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
with current_event.create_span("PDF Processing"):
|
||||
processor = PDFProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
|
||||
# Process markdown and embed
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
current_event.log("Finished PDF Processing")
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
|
||||
@@ -144,29 +142,27 @@ def process_html(tenant, model_variables, document_version):
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
|
||||
def process_audio(tenant, model_variables, document_version):
|
||||
current_event.log("Starting Audio Processing")
|
||||
processor = AudioProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
with current_event.create_span("Audio Processing"):
|
||||
processor = AudioProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
|
||||
# Process markdown and embed
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
current_event.log("Finished Audio Processing")
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def process_srt(tenant, model_variables, document_version):
|
||||
current_event.log("Starting SRT Processing")
|
||||
processor = SRTProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
with current_event.create_span("SRT Processing"):
|
||||
processor = SRTProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process()
|
||||
|
||||
# Process markdown and embed
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
current_event.log("Finished SRT Processing")
|
||||
with current_event.create_span("Embedding"):
|
||||
embed_markdown(tenant, model_variables, document_version, markdown, title)
|
||||
|
||||
|
||||
def embed_markdown(tenant, model_variables, document_version, markdown, title):
|
||||
current_event.log("Starting Embedding Markdown Processing")
|
||||
# Create potential chunks
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
|
||||
|
||||
@@ -195,7 +191,6 @@ def embed_markdown(tenant, model_variables, document_version, markdown, title):
|
||||
|
||||
current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
|
||||
f'on document version {document_version.id} :-)')
|
||||
current_event.log("Finished Embedding Markdown Processing")
|
||||
|
||||
|
||||
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
@@ -238,7 +233,7 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
|
||||
|
||||
def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
current_event.log("Starting Summarizing Chunk Processing")
|
||||
current_event.log("Starting Summarizing Chunk")
|
||||
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
llm = model_variables['llm']
|
||||
@@ -256,7 +251,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
summary = chain.invoke({"text": chunk})
|
||||
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}.')
|
||||
current_event.log("Finished summarizing chunk for tenant ")
|
||||
current_event.log("Finished Summarizing Chunk")
|
||||
return summary
|
||||
except LangChainException as e:
|
||||
current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant.id} '
|
||||
|
||||
Reference in New Issue
Block a user