- Allow for more complex and longer PDFs to be uploaded to Evie. First implementation of a processor for specific file types.
- Allow URLs to point to content other than just HTML. A URL can also refer to, e.g., PDF files.
This commit is contained in:
@@ -29,6 +29,7 @@ from common.utils.celery_utils import current_celery
|
||||
from common.utils.database import Database
|
||||
from common.utils.model_utils import select_model_variables, create_language_template
|
||||
from common.utils.os_utils import safe_remove, sync_folder
|
||||
from eveai_workers.Processors.PDF_Processor import PDFProcessor
|
||||
|
||||
|
||||
@current_celery.task(name='create_embeddings', queue='embeddings')
|
||||
@@ -103,34 +104,67 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
raise
|
||||
|
||||
|
||||
# def process_pdf(tenant, model_variables, document_version):
|
||||
# file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
# document_version.id, document_version.file_name)
|
||||
#
|
||||
# pdf_text = ''
|
||||
# pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
|
||||
# for page in pdf_reader.pages:
|
||||
# pdf_text += page.extract_text()
|
||||
#
|
||||
# markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
|
||||
# markdown_file_name = f'{document_version.id}.md'
|
||||
# minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
# document_version.id,
|
||||
# markdown_file_name, markdown.encode())
|
||||
#
|
||||
# potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
# chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
# model_variables['max_chunk_size'])
|
||||
#
|
||||
# if len(chunks) > 1:
|
||||
# summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
# document_version.system_context = f'Summary: {summary}\n'
|
||||
# else:
|
||||
# document_version.system_context = ''
|
||||
#
|
||||
# enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
# embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
#
|
||||
# try:
|
||||
# db.session.add(document_version)
|
||||
# document_version.processing_finished_at = dt.now(tz.utc)
|
||||
# document_version.processing = False
|
||||
# db.session.add_all(embeddings)
|
||||
# db.session.commit()
|
||||
# except SQLAlchemyError as e:
|
||||
# current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
|
||||
# f'on HTML, document version {document_version.id}'
|
||||
# f'error: {e}')
|
||||
# raise
|
||||
#
|
||||
# current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
|
||||
# f'on document version {document_version.id} :-)')
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
processor = PDFProcessor(tenant, model_variables, document_version)
|
||||
markdown, title = processor.process_pdf()
|
||||
|
||||
pdf_text = ''
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
|
||||
for page in pdf_reader.pages:
|
||||
pdf_text += page.extract_text()
|
||||
# Create potential chunks for embedding
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
|
||||
|
||||
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
# Combine chunks for embedding
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
model_variables['max_chunk_size'])
|
||||
|
||||
if len(chunks) > 1:
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
document_version.system_context = f'Summary: {summary}\n'
|
||||
else:
|
||||
document_version.system_context = ''
|
||||
# Enrich chunks
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
|
||||
|
||||
enriched_chunks = enrich_chunks(tenant, document_version, chunks)
|
||||
# Create embeddings
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
|
||||
# Update document version and save embeddings
|
||||
try:
|
||||
db.session.add(document_version)
|
||||
document_version.processing_finished_at = dt.now(tz.utc)
|
||||
@@ -139,7 +173,7 @@ def process_pdf(tenant, model_variables, document_version):
|
||||
db.session.commit()
|
||||
except SQLAlchemyError as e:
|
||||
current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
|
||||
f'on HTML, document version {document_version.id}'
|
||||
f'on PDF, document version {document_version.id}'
|
||||
f'error: {e}')
|
||||
raise
|
||||
|
||||
@@ -238,26 +272,6 @@ def enrich_chunks(tenant, document_version, title, chunks):
|
||||
return enriched_chunks
|
||||
|
||||
|
||||
# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
|
||||
# current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
|
||||
# f'on document version {document_version.id}')
|
||||
# llm = model_variables['llm']
|
||||
# template = model_variables['html_parse_template']
|
||||
# parse_prompt = ChatPromptTemplate.from_template(template)
|
||||
# setup = RunnablePassthrough()
|
||||
# output_parser = StrOutputParser()
|
||||
#
|
||||
# chain = setup | parse_prompt | llm | output_parser
|
||||
# input_html = {"html": html_content}
|
||||
#
|
||||
# markdown = chain.invoke(input_html)
|
||||
#
|
||||
# current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
|
||||
# f'on document version {document_version.id}')
|
||||
#
|
||||
# return markdown
|
||||
|
||||
|
||||
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
|
||||
current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
@@ -765,4 +779,4 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
|
||||
actual_chunks.append(current_chunk)
|
||||
|
||||
return actual_chunks
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user