Removing DocumentLanguage, as both System Context and User Context are to be defined at the DocumentVersion level.
Fine-tuning of the embedding workers.
This commit is contained in:
@@ -130,13 +130,11 @@ def process_pdf(tenant, model_variables, document_version):
|
||||
raise
|
||||
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
doc_lang = document_version.document_language
|
||||
doc_lang.system_context = f'Summary: {summary}\n'
|
||||
document_version.system_context = f'Summary: {summary}\n'
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
try:
|
||||
db.session.add(doc_lang)
|
||||
db.session.add(document_version)
|
||||
document_version.processing_finished_at = dt.now(tz.utc)
|
||||
document_version.processing = False
|
||||
@@ -156,10 +154,10 @@ def process_pdf(tenant, model_variables, document_version):
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
|
||||
# The tags to be considered can be dependent on the tenant
|
||||
html_tags = tenant.html_tags
|
||||
end_tags = tenant.html_end_tags
|
||||
included_elements = tenant.html_included_elements
|
||||
excluded_elements = tenant.html_excluded_elements
|
||||
html_tags = model_variables['html_tags']
|
||||
html_end_tags = model_variables['html_end_tags']
|
||||
html_included_elements = model_variables['html_included_elements']
|
||||
html_excluded_elements = model_variables['html_excluded_elements']
|
||||
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
document_version.file_location,
|
||||
@@ -175,23 +173,25 @@ def process_html(tenant, model_variables, document_version):
|
||||
create_embeddings.update_state(state=states.FAILURE)
|
||||
raise
|
||||
|
||||
extracted_data, title = parse_html(html_content, html_tags, included_elements=included_elements,
|
||||
excluded_elements=excluded_elements)
|
||||
potential_chunks = create_potential_chunks(extracted_data, end_tags)
|
||||
extracted_data, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
|
||||
excluded_elements=html_excluded_elements)
|
||||
potential_chunks = create_potential_chunks(extracted_data, html_end_tags)
|
||||
chunks = combine_chunks(potential_chunks,
|
||||
model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size']
|
||||
)
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
doc_lang = document_version.document_language
|
||||
doc_lang.system_context = (f'Title: {title}\n'
|
||||
f'Summary: {summary}\n')
|
||||
|
||||
if len(chunks) > 0:
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
document_version.system_context = (f'Title: {title}\n'
|
||||
f'Summary: {summary}\n')
|
||||
else:
|
||||
document_version.system_context = (f'Title: {title}\n')
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
try:
|
||||
db.session.add(doc_lang)
|
||||
db.session.add(document_version)
|
||||
document_version.processing_finished_at = dt.now(tz.utc)
|
||||
document_version.processing = False
|
||||
@@ -210,12 +210,14 @@ def process_html(tenant, model_variables, document_version):
|
||||
def enrich_chunks(tenant, document_version, chunks):
|
||||
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
doc_lang = document_version.document_language
|
||||
chunk_total_context = (f'Filename: {document_version.file_name}\n'
|
||||
f'{doc_lang.system_context}\n'
|
||||
f'User Context:\n{doc_lang.user_context}')
|
||||
f'User Context:{document_version.user_context}\n'
|
||||
f'{document_version.system_context}\n\n')
|
||||
enriched_chunks = []
|
||||
initial_chunk = f'Filename: {document_version.file_name}\n User Context:\n{doc_lang.user_context}\n{chunks[0]}'
|
||||
initial_chunk = (f'Filename: {document_version.file_name}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'{chunks[0]}')
|
||||
|
||||
enriched_chunks.append(initial_chunk)
|
||||
for chunk in chunks[1:]:
|
||||
enriched_chunk = f'{chunk_total_context}\n{chunk}'
|
||||
@@ -313,6 +315,12 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
|
||||
else:
|
||||
elements_to_parse = [soup] # parse the entire document if no included_elements specified
|
||||
|
||||
current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
|
||||
current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
|
||||
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
|
||||
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
|
||||
current_app.embed_tuning_logger.debug(f'{elements_to_parse}')
|
||||
|
||||
# Iterate through the found included elements
|
||||
for element in elements_to_parse:
|
||||
# Find all specified tags within each included element
|
||||
|
||||
Reference in New Issue
Block a user