Remove DocumentLanguage, as both System Context and User Context are to be defined at the DocumentVersion level.

Fine-tuning of the embedding workers.
This commit is contained in:
Josako
2024-06-06 15:26:49 +02:00
parent 1a25313673
commit 27b6de8734
21 changed files with 301 additions and 295 deletions

View File

@@ -130,13 +130,11 @@ def process_pdf(tenant, model_variables, document_version):
raise
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
doc_lang = document_version.document_language
doc_lang.system_context = f'Summary: {summary}\n'
document_version.system_context = f'Summary: {summary}\n'
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try:
db.session.add(doc_lang)
db.session.add(document_version)
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing = False
@@ -156,10 +154,10 @@ def process_pdf(tenant, model_variables, document_version):
def process_html(tenant, model_variables, document_version):
# The tags to be considered can be dependent on the tenant
html_tags = tenant.html_tags
end_tags = tenant.html_end_tags
included_elements = tenant.html_included_elements
excluded_elements = tenant.html_excluded_elements
html_tags = model_variables['html_tags']
html_end_tags = model_variables['html_end_tags']
html_included_elements = model_variables['html_included_elements']
html_excluded_elements = model_variables['html_excluded_elements']
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location,
@@ -175,23 +173,25 @@ def process_html(tenant, model_variables, document_version):
create_embeddings.update_state(state=states.FAILURE)
raise
extracted_data, title = parse_html(html_content, html_tags, included_elements=included_elements,
excluded_elements=excluded_elements)
potential_chunks = create_potential_chunks(extracted_data, end_tags)
extracted_data, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
excluded_elements=html_excluded_elements)
potential_chunks = create_potential_chunks(extracted_data, html_end_tags)
chunks = combine_chunks(potential_chunks,
model_variables['min_chunk_size'],
model_variables['max_chunk_size']
)
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
doc_lang = document_version.document_language
doc_lang.system_context = (f'Title: {title}\n'
f'Summary: {summary}\n')
if len(chunks) > 0:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
document_version.system_context = (f'Title: {title}\n'
f'Summary: {summary}\n')
else:
document_version.system_context = (f'Title: {title}\n')
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try:
db.session.add(doc_lang)
db.session.add(document_version)
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing = False
@@ -210,12 +210,14 @@ def process_html(tenant, model_variables, document_version):
def enrich_chunks(tenant, document_version, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
doc_lang = document_version.document_language
chunk_total_context = (f'Filename: {document_version.file_name}\n'
f'{doc_lang.system_context}\n'
f'User Context:\n{doc_lang.user_context}')
f'User Context:{document_version.user_context}\n'
f'{document_version.system_context}\n\n')
enriched_chunks = []
initial_chunk = f'Filename: {document_version.file_name}\n User Context:\n{doc_lang.user_context}\n{chunks[0]}'
initial_chunk = (f'Filename: {document_version.file_name}\n'
f'User Context:\n{document_version.user_context}\n\n'
f'{chunks[0]}')
enriched_chunks.append(initial_chunk)
for chunk in chunks[1:]:
enriched_chunk = f'{chunk_total_context}\n{chunk}'
@@ -313,6 +315,12 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
else:
elements_to_parse = [soup] # parse the entire document if no included_elements specified
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'{elements_to_parse}')
# Iterate through the found included elements
for element in elements_to_parse:
# Find all specified tags within each included element