- Improvements to enable deployment in the cloud, mainly changing file access to Minio

- Improvements on RAG logging, and some debugging in that area
This commit is contained in:
Josako
2024-08-01 17:35:54 +02:00
parent 88ca04136d
commit 64cf8df3a9
19 changed files with 617 additions and 206 deletions

View File

@@ -3,7 +3,7 @@ import logging.config
from flask import Flask
from common.utils.celery_utils import make_celery, init_celery
from common.extensions import db
from common.extensions import db, minio_client
from config.logging_config import LOGGING
@@ -33,6 +33,7 @@ def create_app(config_file=None):
def register_extensions(app):
db.init_app(app)
minio_client.init_app(app)
app, celery = create_app()

View File

@@ -1,3 +1,4 @@
import io
import os
from datetime import datetime as dt, timezone as tz
import subprocess
@@ -21,7 +22,7 @@ import PyPDF2
from pydub import AudioSegment
import tempfile
from common.extensions import db
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding
from common.models.user import Tenant
from common.utils.celery_utils import current_celery
@@ -32,11 +33,6 @@ from common.utils.os_utils import safe_remove, sync_folder
@current_celery.task(name='create_embeddings', queue='embeddings')
def create_embeddings(tenant_id, document_version_id):
# Setup Remote Debugging only if PYCHARM_DEBUG=True
if current_app.config['PYCHARM_DEBUG']:
import pydevd_pycharm
pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
try:
@@ -50,6 +46,7 @@ def create_embeddings(tenant_id, document_version_id):
# Select variables to work with depending on tenant and model
model_variables = select_model_variables(tenant)
current_app.logger.debug(f'Model variables: {model_variables}')
# Retrieve document version to process
document_version = DocumentVersion.query.get(document_version_id)
@@ -107,33 +104,20 @@ def create_embeddings(tenant_id, document_version_id):
def process_pdf(tenant, model_variables, document_version):
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location)
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location,
document_version.file_name)
if os.path.exists(file_path):
pdf_text = ''
# Function to extract text from PDF and return as string
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
pdf_text += page.extract_text()
else:
current_app.logger.error(f'The physical file for document version {document_version.id} '
f'for tenant {tenant.id} '
f'at {file_path} does not exist')
create_embeddings.update_state(state=states.FAILURE)
raise
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
pdf_text = ''
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
for page in pdf_reader.pages:
pdf_text += page.extract_text()
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
markdown_file_name = f'{document_version.id}.md'
output_file = os.path.join(base_path, markdown_file_name)
with open(output_file, 'w') as f:
f.write(markdown)
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
@@ -175,43 +159,29 @@ def delete_embeddings_for_document_version(document_version):
def process_html(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
html_content = file_data.decode('utf-8')
# The tags to be considered can be dependent on the tenant
html_tags = model_variables['html_tags']
html_end_tags = model_variables['html_end_tags']
html_included_elements = model_variables['html_included_elements']
html_excluded_elements = model_variables['html_excluded_elements']
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location)
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location,
document_version.file_name)
if os.path.exists(file_path):
with open(file_path, 'rb') as f:
html_content = f.read()
else:
current_app.logger.error(f'The physical file for document version {document_version.id} '
f'for tenant {tenant.id} '
f'at {file_path} does not exist')
create_embeddings.update_state(state=states.FAILURE)
raise
extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
excluded_elements=html_excluded_elements)
extracted_file_name = f'{document_version.id}-extracted.html'
output_file = os.path.join(base_path, extracted_file_name)
with open(output_file, 'w') as f:
f.write(extracted_html)
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
extracted_file_name, extracted_html.encode())
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
markdown_file_name = f'{document_version.id}.md'
output_file = os.path.join(base_path, markdown_file_name)
with open(output_file, 'w') as f:
f.write(markdown)
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
@@ -222,7 +192,7 @@ def process_html(tenant, model_variables, document_version):
else:
document_version.system_context = (f'Title: {title}\n')
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try:
@@ -241,16 +211,17 @@ def process_html(tenant, model_variables, document_version):
f'on document version {document_version.id} :-)')
def enrich_chunks(tenant, document_version, chunks):
def enrich_chunks(tenant, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
chunk_total_context = (f'Filename: {document_version.file_name}\n'
f'User Context:{document_version.user_context}\n'
f'User Context:\n{document_version.user_context}\n\n'
f'{document_version.system_context}\n\n')
enriched_chunks = []
initial_chunk = (f'Filename: {document_version.file_name}\n'
f'User Context:\n{document_version.user_context}\n\n'
f'Title: {title}\n'
f'{chunks[0]}')
enriched_chunks.append(initial_chunk)
@@ -311,7 +282,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
text_to_summarize = doc_creator.create_documents(chunk)
try:
summary = chain.run(text_to_summarize)
summary = chain.invoke({"text": text_to_summarize})
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}.')
return summary
@@ -391,23 +362,26 @@ def process_youtube(tenant, model_variables, document_version):
markdown_file_name = f'{document_version.id}.md'
# Remove existing files (in case of a re-processing of the file
safe_remove(os.path.join(base_path, download_file_name))
safe_remove(os.path.join(base_path, compressed_file_name))
safe_remove(os.path.join(base_path, transcription_file_name))
safe_remove(os.path.join(base_path, markdown_file_name))
sync_folder(base_path)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, download_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, compressed_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, transcription_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, markdown_file_name)
of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
download_file_name)
document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
compress_audio(base_path, download_file_name, compressed_file_name, tenant)
transcribe_audio(base_path, compressed_file_name, transcription_file_name,
document_version.language, tenant, model_variables)
annotate_transcription(base_path, transcription_file_name, markdown_file_name,
document_version.language, tenant, model_variables)
compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
@@ -427,83 +401,72 @@ def process_youtube(tenant, model_variables, document_version):
f'on Youtube document version {document_version.id} :-)')
def download_youtube(url, file_location, file_name, tenant):
def download_youtube(url, tenant_id, document_version, file_name):
try:
current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
yt = YouTube(url)
stream = yt.streams.get_audio_only()
output_file = stream.download(output_path=file_location, filename=file_name)
current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
return output_file, yt.title, yt.description, yt.author
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
stream.download(output_path=temp_file.name)
with open(temp_file.name, 'rb') as f:
file_data = f.read()
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
file_name, file_data)
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
return file_name, yt.title, yt.description, yt.author
except Exception as e:
current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
f'tenant: {tenant.id} with error: {e}')
current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
raise
def compress_audio(file_location, input_file, output_file, tenant):
def compress_audio(tenant_id, document_version, input_file, output_file):
try:
current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
# Run the compression script
result = subprocess.run(
['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
capture_output=True,
text=True
)
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
if result.returncode != 0:
raise Exception(f"Compression failed: {result.stderr}")
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
temp_input.write(input_data)
temp_input.flush()
output_file_path = os.path.join(file_location, output_file)
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
result = subprocess.run(
['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
capture_output=True,
text=True
)
# Additional check for file stability
previous_size = -1
stable_count = 0
max_attempts = 12 # 1 minute total wait time
if result.returncode != 0:
raise Exception(f"Compression failed: {result.stderr}")
for _ in range(max_attempts):
if os.path.exists(output_file_path):
current_size = os.path.getsize(output_file_path)
if current_size == previous_size:
stable_count += 1
if stable_count >= 3: # File size hasn't changed for 3 checks
break
else:
stable_count = 0
previous_size = current_size
gevent.sleep(5)
with open(temp_output.name, 'rb') as f:
compressed_data = f.read()
if stable_count < 3:
raise Exception("File size did not stabilize within the expected time")
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
output_file, compressed_data)
current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
return output_file_path
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
except Exception as e:
current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
raise
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
try:
current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
client = model_variables['transcription_client']
model = model_variables['transcription_model']
input_file_path = os.path.join(file_location, input_file)
output_file_path = os.path.join(file_location, output_file)
# Wait for the input file to exist
count = 0
while not os.path.exists(input_file_path) and count < 10:
gevent.sleep(1)
current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
count += 1
# Download the audio file from MinIO
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
if not os.path.exists(input_file_path):
raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
# Load the audio file
audio = AudioSegment.from_file(input_file_path)
# Load the audio data into pydub
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
# Define segment length (e.g., 10 minutes)
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
@@ -512,14 +475,16 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
# Split audio into segments and transcribe each
for i, chunk in enumerate(audio[::segment_length]):
current_app.logger.debug(f'Transcribing chunk {i} of {len(audio) // segment_length} ')
current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
chunk.export(temp_audio.name, format="mp3")
with open(temp_audio.name, 'rb') as audio_segment:
transcription = client.audio.transcriptions.create(
file=audio_segment,
model=model,
language=language,
language=document_version.language,
response_format='verbose_json',
)
@@ -530,20 +495,25 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
# Combine all transcriptions
full_transcription = " ".join(transcriptions)
# Write the full transcription to the output file
with open(output_file_path, 'w') as f:
f.write(full_transcription)
# Upload the full transcription to MinIO
minio_client.upload_document_file(
tenant_id,
document_version.doc_id,
document_version.language,
document_version.id,
output_file,
full_transcription.encode('utf-8')
)
current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
except Exception as e:
current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
f'with error: {e}')
current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
raise
def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
try:
current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
char_splitter = CharacterTextSplitter(separator='.',
chunk_size=model_variables['annotation_chunk_length'],
@@ -552,18 +522,21 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
# ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
llm = model_variables['llm']
template = model_variables['transcript_template']
language_template = create_language_template(template, language)
language_template = create_language_template(template, document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
with open(os.path.join(file_location, input_file), 'r') as f:
transcript = f.read()
# Download the transcription file from MinIO
transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
document_version.language, document_version.id,
input_file)
transcript = transcript_data.decode('utf-8')
chain = setup | transcript_prompt | llm | output_parser
@@ -598,38 +571,53 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
markdown_chunks.pop()
all_markdown_chunks += markdown_chunks
all_markdown_chunks += [last_markdown_chunk]
annotated_transcript = '\n'.join(all_markdown_chunks)
with open(os.path.join(file_location, output_file), 'w') as f:
f.write(annotated_transcript)
# Upload the annotated transcript to MinIO
minio_client.upload_document_file(
tenant.id,
document_version.doc_id,
document_version.language,
document_version.id,
output_file,
annotated_transcript.encode('utf-8')
)
current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
except Exception as e:
current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
f'with error: {e}')
current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
raise
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
markdown = ''
with open(os.path.join(base_path, input_file), 'r') as f:
markdown = f.read()
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
# ("###", "Header 3"),
]
# Download the markdown file from MinIO
markdown_data = minio_client.download_document_file(tenant_id,
document_version.doc_id,
document_version.language,
document_version.id,
input_file
)
markdown = markdown_data.decode('utf-8')
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
]
return potential_chunks
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
return potential_chunks
except Exception as e:
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
raise
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):