- Allow for more complex and longer PDFs to be uploaded to Evie. First implementation of a processor for specific file types.

- Allow URLs to point to content other than just HTML. A URL can also refer to, e.g., PDF files.
2024-08-27 07:05:56 +02:00
parent 2ca006d82c
commit 122d1a18df
9 changed files with 458 additions and 86 deletions
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -29,6 +29,7 @@ from common.utils.celery_utils import current_celery
 from common.utils.database import Database
 from common.utils.model_utils import select_model_variables, create_language_template
 from common.utils.os_utils import safe_remove, sync_folder
+from eveai_workers.Processors.PDF_Processor import PDFProcessor


@current_celery.task(name='create_embeddings', queue='embeddings')
@@ -103,34 +104,67 @@ def create_embeddings(tenant_id, document_version_id):
        raise


+# def process_pdf(tenant, model_variables, document_version):
+#     file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                                     document_version.id, document_version.file_name)
+#
+#     pdf_text = ''
+#     pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
+#     for page in pdf_reader.pages:
+#         pdf_text += page.extract_text()
+#
+#     markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
+#     markdown_file_name = f'{document_version.id}.md'
+#     minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+#                                       document_version.id,
+#                                       markdown_file_name, markdown.encode())
+#
+#     potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
+#     chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+#                                          model_variables['max_chunk_size'])
+#
+#     if len(chunks) > 1:
+#         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
+#         document_version.system_context = f'Summary: {summary}\n'
+#     else:
+#         document_version.system_context = ''
+#
+#     enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+#     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
+#
+#     try:
+#         db.session.add(document_version)
+#         document_version.processing_finished_at = dt.now(tz.utc)
+#         document_version.processing = False
+#         db.session.add_all(embeddings)
+#         db.session.commit()
+#     except SQLAlchemyError as e:
+#         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
+#                                  f'on HTML, document version {document_version.id}'
+#                                  f'error: {e}')
+#         raise
+#
+#     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
+#                             f'on document version {document_version.id} :-)')
+
 def process_pdf(tenant, model_variables, document_version):
-    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                                    document_version.id, document_version.file_name)
+    processor = PDFProcessor(tenant, model_variables, document_version)
+    markdown, title = processor.process_pdf()

-    pdf_text = ''
-    pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
-    for page in pdf_reader.pages:
-        pdf_text += page.extract_text()
+    # Create potential chunks for embedding
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")

-    markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
-    markdown_file_name = f'{document_version.id}.md'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                      document_version.id,
-                                      markdown_file_name, markdown.encode())
-
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
+    # Combine chunks for embedding
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                         model_variables['max_chunk_size'])

-    if len(chunks) > 1:
-        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-        document_version.system_context = f'Summary: {summary}\n'
-    else:
-        document_version.system_context = ''
+    # Enrich chunks
+    enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)

-    enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+    # Create embeddings
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

+    # Update document version and save embeddings
    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
@@ -139,7 +173,7 @@ def process_pdf(tenant, model_variables, document_version):
        db.session.commit()
    except SQLAlchemyError as e:
        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                 f'on HTML, document version {document_version.id}'
+                                 f'on PDF, document version {document_version.id}'
                                 f'error: {e}')
        raise

@@ -238,26 +272,6 @@ def enrich_chunks(tenant, document_version, title, chunks):
    return enriched_chunks


-# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
-#     current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
-#                              f'on document version {document_version.id}')
-#     llm = model_variables['llm']
-#     template = model_variables['html_parse_template']
-#     parse_prompt = ChatPromptTemplate.from_template(template)
-#     setup = RunnablePassthrough()
-#     output_parser = StrOutputParser()
-#
-#     chain = setup | parse_prompt | llm | output_parser
-#     input_html = {"html": html_content}
-#
-#     markdown = chain.invoke(input_html)
-#
-#     current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
-#                              f'on document version {document_version.id}')
-#
-#     return markdown
-
-
 def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
    current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
                             f'on document version {document_version.id}')
@@ -765,4 +779,4 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
        actual_chunks.append(current_chunk)

    return actual_chunks
-    pass
+