- Improvements to enable deployment in the cloud, mainly by switching file access to MinIO
- Improvements to RAG logging, plus some debugging in that area
This commit is contained in:
@@ -3,7 +3,7 @@ import logging.config
|
||||
from flask import Flask
|
||||
|
||||
from common.utils.celery_utils import make_celery, init_celery
|
||||
from common.extensions import db
|
||||
from common.extensions import db, minio_client
|
||||
from config.logging_config import LOGGING
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ def create_app(config_file=None):
|
||||
|
||||
def register_extensions(app):
|
||||
db.init_app(app)
|
||||
minio_client.init_app(app)
|
||||
|
||||
|
||||
app, celery = create_app()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import io
|
||||
import os
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
import subprocess
|
||||
@@ -21,7 +22,7 @@ import PyPDF2
|
||||
from pydub import AudioSegment
|
||||
import tempfile
|
||||
|
||||
from common.extensions import db
|
||||
from common.extensions import db, minio_client
|
||||
from common.models.document import DocumentVersion, Embedding
|
||||
from common.models.user import Tenant
|
||||
from common.utils.celery_utils import current_celery
|
||||
@@ -32,11 +33,6 @@ from common.utils.os_utils import safe_remove, sync_folder
|
||||
|
||||
@current_celery.task(name='create_embeddings', queue='embeddings')
|
||||
def create_embeddings(tenant_id, document_version_id):
|
||||
# Setup Remote Debugging only if PYCHARM_DEBUG=True
|
||||
if current_app.config['PYCHARM_DEBUG']:
|
||||
import pydevd_pycharm
|
||||
pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
|
||||
|
||||
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
|
||||
|
||||
try:
|
||||
@@ -50,6 +46,7 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
|
||||
# Select variables to work with depending on tenant and model
|
||||
model_variables = select_model_variables(tenant)
|
||||
current_app.logger.debug(f'Model variables: {model_variables}')
|
||||
|
||||
# Retrieve document version to process
|
||||
document_version = DocumentVersion.query.get(document_version_id)
|
||||
@@ -107,33 +104,20 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
|
||||
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location)
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location,
|
||||
document_version.file_name)
|
||||
if os.path.exists(file_path):
|
||||
pdf_text = ''
|
||||
# Function to extract text from PDF and return as string
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
for page_num in range(len(reader.pages)):
|
||||
page = reader.pages[page_num]
|
||||
pdf_text += page.extract_text()
|
||||
else:
|
||||
current_app.logger.error(f'The physical file for document version {document_version.id} '
|
||||
f'for tenant {tenant.id} '
|
||||
f'at {file_path} does not exist')
|
||||
create_embeddings.update_state(state=states.FAILURE)
|
||||
raise
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
|
||||
pdf_text = ''
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
|
||||
for page in pdf_reader.pages:
|
||||
pdf_text += page.extract_text()
|
||||
|
||||
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
output_file = os.path.join(base_path, markdown_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(markdown)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
@@ -175,43 +159,29 @@ def delete_embeddings_for_document_version(document_version):
|
||||
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
html_content = file_data.decode('utf-8')
|
||||
|
||||
# The tags to be considered can be dependent on the tenant
|
||||
html_tags = model_variables['html_tags']
|
||||
html_end_tags = model_variables['html_end_tags']
|
||||
html_included_elements = model_variables['html_included_elements']
|
||||
html_excluded_elements = model_variables['html_excluded_elements']
|
||||
|
||||
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location)
|
||||
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location,
|
||||
document_version.file_name)
|
||||
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'rb') as f:
|
||||
html_content = f.read()
|
||||
else:
|
||||
current_app.logger.error(f'The physical file for document version {document_version.id} '
|
||||
f'for tenant {tenant.id} '
|
||||
f'at {file_path} does not exist')
|
||||
create_embeddings.update_state(state=states.FAILURE)
|
||||
raise
|
||||
|
||||
extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
|
||||
excluded_elements=html_excluded_elements)
|
||||
|
||||
extracted_file_name = f'{document_version.id}-extracted.html'
|
||||
output_file = os.path.join(base_path, extracted_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(extracted_html)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
extracted_file_name, extracted_html.encode())
|
||||
|
||||
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
output_file = os.path.join(base_path, markdown_file_name)
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(markdown)
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
@@ -222,7 +192,7 @@ def process_html(tenant, model_variables, document_version):
|
||||
else:
|
||||
document_version.system_context = (f'Title: {title}\n')
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
try:
|
||||
@@ -241,16 +211,17 @@ def process_html(tenant, model_variables, document_version):
|
||||
f'on document version {document_version.id} :-)')
|
||||
|
||||
|
||||
def enrich_chunks(tenant, document_version, chunks):
|
||||
def enrich_chunks(tenant, document_version, title, chunks):
|
||||
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
|
||||
chunk_total_context = (f'Filename: {document_version.file_name}\n'
|
||||
f'User Context:{document_version.user_context}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'{document_version.system_context}\n\n')
|
||||
enriched_chunks = []
|
||||
initial_chunk = (f'Filename: {document_version.file_name}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'Title: {title}\n'
|
||||
f'{chunks[0]}')
|
||||
|
||||
enriched_chunks.append(initial_chunk)
|
||||
@@ -311,7 +282,7 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
text_to_summarize = doc_creator.create_documents(chunk)
|
||||
|
||||
try:
|
||||
summary = chain.run(text_to_summarize)
|
||||
summary = chain.invoke({"text": text_to_summarize})
|
||||
current_app.logger.debug(f'Finished summarizing chunk for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}.')
|
||||
return summary
|
||||
@@ -391,23 +362,26 @@ def process_youtube(tenant, model_variables, document_version):
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
|
||||
# Remove existing files (in case the file is being re-processed)
|
||||
safe_remove(os.path.join(base_path, download_file_name))
|
||||
safe_remove(os.path.join(base_path, compressed_file_name))
|
||||
safe_remove(os.path.join(base_path, transcription_file_name))
|
||||
safe_remove(os.path.join(base_path, markdown_file_name))
|
||||
sync_folder(base_path)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, download_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, compressed_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, transcription_file_name)
|
||||
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, markdown_file_name)
|
||||
|
||||
of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
|
||||
of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
|
||||
download_file_name)
|
||||
document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
|
||||
compress_audio(base_path, download_file_name, compressed_file_name, tenant)
|
||||
transcribe_audio(base_path, compressed_file_name, transcription_file_name,
|
||||
document_version.language, tenant, model_variables)
|
||||
annotate_transcription(base_path, transcription_file_name, markdown_file_name,
|
||||
document_version.language, tenant, model_variables)
|
||||
compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
|
||||
transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
|
||||
annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
@@ -427,83 +401,72 @@ def process_youtube(tenant, model_variables, document_version):
|
||||
f'on Youtube document version {document_version.id} :-)')
|
||||
|
||||
|
||||
def download_youtube(url, file_location, file_name, tenant):
|
||||
def download_youtube(url, tenant_id, document_version, file_name):
|
||||
try:
|
||||
current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
|
||||
yt = YouTube(url)
|
||||
stream = yt.streams.get_audio_only()
|
||||
output_file = stream.download(output_path=file_location, filename=file_name)
|
||||
current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
|
||||
return output_file, yt.title, yt.description, yt.author
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||
stream.download(output_path=temp_file.name)
|
||||
with open(temp_file.name, 'rb') as f:
|
||||
file_data = f.read()
|
||||
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
file_name, file_data)
|
||||
|
||||
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
|
||||
return file_name, yt.title, yt.description, yt.author
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
|
||||
f'tenant: {tenant.id} with error: {e}')
|
||||
current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def compress_audio(file_location, input_file, output_file, tenant):
|
||||
def compress_audio(tenant_id, document_version, input_file, output_file):
|
||||
try:
|
||||
current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
|
||||
|
||||
# Run the compression script
|
||||
result = subprocess.run(
|
||||
['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise Exception(f"Compression failed: {result.stderr}")
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
|
||||
temp_input.write(input_data)
|
||||
temp_input.flush()
|
||||
|
||||
output_file_path = os.path.join(file_location, output_file)
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
|
||||
result = subprocess.run(
|
||||
['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
|
||||
# Additional check for file stability
|
||||
previous_size = -1
|
||||
stable_count = 0
|
||||
max_attempts = 12 # 1 minute total wait time
|
||||
if result.returncode != 0:
|
||||
raise Exception(f"Compression failed: {result.stderr}")
|
||||
|
||||
for _ in range(max_attempts):
|
||||
if os.path.exists(output_file_path):
|
||||
current_size = os.path.getsize(output_file_path)
|
||||
if current_size == previous_size:
|
||||
stable_count += 1
|
||||
if stable_count >= 3: # File size hasn't changed for 3 checks
|
||||
break
|
||||
else:
|
||||
stable_count = 0
|
||||
previous_size = current_size
|
||||
gevent.sleep(5)
|
||||
with open(temp_output.name, 'rb') as f:
|
||||
compressed_data = f.read()
|
||||
|
||||
if stable_count < 3:
|
||||
raise Exception("File size did not stabilize within the expected time")
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
output_file, compressed_data)
|
||||
|
||||
current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
|
||||
return output_file_path
|
||||
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
|
||||
current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
|
||||
def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
|
||||
try:
|
||||
current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
|
||||
client = model_variables['transcription_client']
|
||||
model = model_variables['transcription_model']
|
||||
input_file_path = os.path.join(file_location, input_file)
|
||||
output_file_path = os.path.join(file_location, output_file)
|
||||
|
||||
# Wait for the input file to exist
|
||||
count = 0
|
||||
while not os.path.exists(input_file_path) and count < 10:
|
||||
gevent.sleep(1)
|
||||
current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
|
||||
count += 1
|
||||
# Download the audio file from MinIO
|
||||
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
|
||||
if not os.path.exists(input_file_path):
|
||||
raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
|
||||
|
||||
# Load the audio file
|
||||
audio = AudioSegment.from_file(input_file_path)
|
||||
# Load the audio data into pydub
|
||||
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
|
||||
|
||||
# Define segment length (e.g., 10 minutes)
|
||||
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
|
||||
@@ -512,14 +475,16 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
|
||||
|
||||
# Split audio into segments and transcribe each
|
||||
for i, chunk in enumerate(audio[::segment_length]):
|
||||
current_app.logger.debug(f'Transcribing chunk {i} of {len(audio) // segment_length} ')
|
||||
current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
|
||||
chunk.export(temp_audio.name, format="mp3")
|
||||
|
||||
with open(temp_audio.name, 'rb') as audio_segment:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
file=audio_segment,
|
||||
model=model,
|
||||
language=language,
|
||||
language=document_version.language,
|
||||
response_format='verbose_json',
|
||||
)
|
||||
|
||||
@@ -530,20 +495,25 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
|
||||
# Combine all transcriptions
|
||||
full_transcription = " ".join(transcriptions)
|
||||
|
||||
# Write the full transcription to the output file
|
||||
with open(output_file_path, 'w') as f:
|
||||
f.write(full_transcription)
|
||||
# Upload the full transcription to MinIO
|
||||
minio_client.upload_document_file(
|
||||
tenant_id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
output_file,
|
||||
full_transcription.encode('utf-8')
|
||||
)
|
||||
|
||||
current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
|
||||
current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
|
||||
f'with error: {e}')
|
||||
current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
|
||||
def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
|
||||
try:
|
||||
current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
|
||||
current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
|
||||
|
||||
char_splitter = CharacterTextSplitter(separator='.',
|
||||
chunk_size=model_variables['annotation_chunk_length'],
|
||||
@@ -552,18 +522,21 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
# ("###", "Header 3"),
|
||||
]
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
|
||||
|
||||
llm = model_variables['llm']
|
||||
template = model_variables['transcript_template']
|
||||
language_template = create_language_template(template, language)
|
||||
language_template = create_language_template(template, document_version.language)
|
||||
transcript_prompt = ChatPromptTemplate.from_template(language_template)
|
||||
setup = RunnablePassthrough()
|
||||
output_parser = StrOutputParser()
|
||||
with open(os.path.join(file_location, input_file), 'r') as f:
|
||||
transcript = f.read()
|
||||
|
||||
# Download the transcription file from MinIO
|
||||
transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
|
||||
document_version.language, document_version.id,
|
||||
input_file)
|
||||
transcript = transcript_data.decode('utf-8')
|
||||
|
||||
chain = setup | transcript_prompt | llm | output_parser
|
||||
|
||||
@@ -598,38 +571,53 @@ def annotate_transcription(file_location, input_file, output_file, language, ten
|
||||
markdown_chunks.pop()
|
||||
all_markdown_chunks += markdown_chunks
|
||||
|
||||
|
||||
all_markdown_chunks += [last_markdown_chunk]
|
||||
|
||||
annotated_transcript = '\n'.join(all_markdown_chunks)
|
||||
|
||||
with open(os.path.join(file_location, output_file), 'w') as f:
|
||||
f.write(annotated_transcript)
|
||||
# Upload the annotated transcript to MinIO
|
||||
minio_client.upload_document_file(
|
||||
tenant.id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
output_file,
|
||||
annotated_transcript.encode('utf-8')
|
||||
)
|
||||
|
||||
current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
|
||||
current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
|
||||
f'with error: {e}')
|
||||
current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
|
||||
current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
|
||||
markdown = ''
|
||||
with open(os.path.join(base_path, input_file), 'r') as f:
|
||||
markdown = f.read()
|
||||
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
|
||||
try:
|
||||
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
|
||||
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
# ("###", "Header 3"),
|
||||
]
|
||||
# Download the markdown file from MinIO
|
||||
markdown_data = minio_client.download_document_file(tenant_id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
input_file
|
||||
)
|
||||
markdown = markdown_data.decode('utf-8')
|
||||
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
|
||||
md_header_splits = markdown_splitter.split_text(markdown)
|
||||
potential_chunks = [doc.page_content for doc in md_header_splits]
|
||||
headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
]
|
||||
|
||||
return potential_chunks
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
|
||||
md_header_splits = markdown_splitter.split_text(markdown)
|
||||
potential_chunks = [doc.page_content for doc in md_header_splits]
|
||||
|
||||
current_app.logger.debug(f'Created {len(potential_chunks)} potential chunks for tenant {tenant_id}')
|
||||
return potential_chunks
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
|
||||
raise
|
||||
|
||||
|
||||
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
|
||||
|
||||
Reference in New Issue
Block a user