Added excluded element classes to HTML parsing to allow for more complex document parsing

Added chunking to conversion of HTML to markdown in case of large files
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -3,7 +3,6 @@ import os
 from datetime import datetime as dt, timezone as tz
 import subprocess

-
 import gevent
 from bs4 import BeautifulSoup
 import html
@@ -12,6 +11,7 @@ from flask import current_app
 # OpenAI imports
 from langchain.chains.summarize import load_summarize_chain
 from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
+from langchain_core.documents import Document
 from langchain_core.exceptions import LangChainException
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
@@ -105,7 +105,7 @@ def create_embeddings(tenant_id, document_version_id):

 def process_pdf(tenant, model_variables, document_version):
    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                       document_version.id, document_version.file_name)
+                                                    document_version.id, document_version.file_name)

    pdf_text = ''
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
@@ -114,8 +114,9 @@ def process_pdf(tenant, model_variables, document_version):

    markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
    markdown_file_name = f'{document_version.id}.md'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
-                         markdown_file_name, markdown.encode())
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id,
+                                      markdown_file_name, markdown.encode())

    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -160,7 +161,7 @@ def delete_embeddings_for_document_version(document_version):

 def process_html(tenant, model_variables, document_version):
    file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                       document_version.id, document_version.file_name)
+                                                    document_version.id, document_version.file_name)
    html_content = file_data.decode('utf-8')

    # The tags to be considered can be dependent on the tenant
@@ -173,13 +174,15 @@ def process_html(tenant, model_variables, document_version):
                                       excluded_elements=html_excluded_elements)

    extracted_file_name = f'{document_version.id}-extracted.html'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
-                         extracted_file_name, extracted_html.encode())
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id,
+                                      extracted_file_name, extracted_html.encode())

    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
    markdown_file_name = f'{document_version.id}.md'
-    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
-                         markdown_file_name, markdown.encode())
+    minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                      document_version.id,
+                                      markdown_file_name, markdown.encode())

    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -235,19 +238,94 @@ def enrich_chunks(tenant, document_version, title, chunks):
    return enriched_chunks


+# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
+#     current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
+#                              f'on document version {document_version.id}')
+#     llm = model_variables['llm']
+#     template = model_variables['html_parse_template']
+#     parse_prompt = ChatPromptTemplate.from_template(template)
+#     setup = RunnablePassthrough()
+#     output_parser = StrOutputParser()
+#
+#     chain = setup | parse_prompt | llm | output_parser
+#     input_html = {"html": html_content}
+#
+#     markdown = chain.invoke(input_html)
+#
+#     current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
+#                              f'on document version {document_version.id}')
+#
+#     return markdown
+
+
 def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
+    """Convert HTML to markdown by sending it to the LLM in size-limited chunks.
+
+    The HTML is split by ``split_content``; each chunk is passed through the
+    prompt | llm | output-parser chain and the resulting markdown pieces are
+    joined with blank lines.
+
+    :param tenant: tenant object; ``id`` and ``embed_tuning`` are read.
+    :param model_variables: mapping providing the ``'llm'`` and
+        ``'html_parse_template'`` entries.
+    :param document_version: document version object; only ``id`` is read (for logging).
+    :param html_content: HTML text to convert.
+    :return: the combined markdown text.
+    """
-    current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+    current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
                             f'on document version {document_version.id}')
+
    llm = model_variables['llm']
    template = model_variables['html_parse_template']
    parse_prompt = ChatPromptTemplate.from_template(template)
    setup = RunnablePassthrough()
    output_parser = StrOutputParser()
-
    chain = setup | parse_prompt | llm | output_parser
-    input_html = {"html": html_content}

-    markdown = chain.invoke(input_html)
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    def split_content(soup, max_size=20000):
+        """Group matching elements into HTML strings of roughly ``max_size`` characters.
+
+        Sizes are measured as the length of each element's serialized HTML.
+        A single element larger than ``max_size`` still ends up in a chunk of its own.
+        """
+        chunks = []
+        current_chunk = []
+        current_size = 0
+
+        # NOTE(review): find_all also returns matching tags nested inside other matching
+        # tags (e.g. a <p> inside a <div>), so nested markup may be serialized more than
+        # once; content outside the listed tags is not collected at all - confirm that the
+        # incoming HTML is flat and limited to these tags.
+        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
+            element_html = str(element)
+            element_size = len(element_html)
+
+            # Flush the running chunk before adding an element that would overflow it.
+            if current_size + element_size > max_size and current_chunk:
+                chunks.append(''.join(map(str, current_chunk)))
+                current_chunk = []
+                current_size = 0
+
+            current_chunk.append(element)
+            current_size += element_size
+
+            # Because of the flush above, this can only trigger when the heading
+            # element on its own is larger than max_size.
+            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
+                chunks.append(''.join(map(str, current_chunk)))
+                current_chunk = []
+                current_size = 0
+
+        # Emit whatever is left after the last element.
+        if current_chunk:
+            chunks.append(''.join(map(str, current_chunk)))
+
+        return chunks
+
+    chunks = split_content(soup)
+
+    markdown_chunks = []
+
+    for chunk in chunks:
+        current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
+                                 f'on document version {document_version.id}')
+        if tenant.embed_tuning:
+            current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
+                                                  f'------------------\n'
+                                                  f'{chunk}\n'
+                                                  f'------------------\n')
+        input_html = {"html": chunk}
+        markdown_chunk = chain.invoke(input_html)
+        markdown_chunks.append(markdown_chunk)
+        if tenant.embed_tuning:
+            current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
+                                                  f'-------------------------\n'
+                                                  f'{markdown_chunk}\n'
+                                                  f'-------------------------\n')
+        current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
+                                 f'on document version {document_version.id}')
+
+    # Combine all markdown chunks
+    markdown = "\n\n".join(markdown_chunks)
+
+    current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
+                             f'on document version {document_version.id}')

    return markdown

@@ -324,36 +402,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks):


 def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
+    """Extract the text of the requested tags from an HTML document.
+
+    :param tenant: tenant object; ``id`` and ``html_excluded_classes`` are read here.
+    :param html_content: HTML text to parse.
+    :param tags: tag name(s) whose text is extracted.
+    :param included_elements: if given, only content inside these elements is
+        considered; otherwise the whole document is searched.
+    :param excluded_elements: tags whose descendants are skipped.
+    :return: tuple ``(extracted_html, title)``; ``title`` is ``''`` when the
+        document has no ``<title>``.
+    """
+    current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
    soup = BeautifulSoup(html_content, 'html.parser')
    extracted_html = ''
+    excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)

    if included_elements:
        elements_to_parse = soup.find_all(included_elements)
    else:
-        elements_to_parse = [soup]  # parse the entire document if no included_elements specified
+        elements_to_parse = [soup]

+    log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
+
+    # NOTE(review): find_all also returns requested tags nested inside other requested
+    # tags, so their text is emitted once per enclosing match - confirm this is intended.
+    for element in elements_to_parse:
+        for sub_element in element.find_all(tags):
+            if should_exclude_element(sub_element, excluded_elements, excluded_classes):
+                continue
+            extracted_html += extract_element_content(sub_element)
+
+    title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
+
+    current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
+
+    return extracted_html, title
+
+
+def parse_excluded_classes(excluded_classes):
+    """Group ``element.class`` exclusion rules by element name.
+
+    Each rule is split on its first dot: the part before it is the element
+    name (``element_matches_exclusion`` treats ``*`` as "any element") and
+    the remainder is the class name.
+
+    :param excluded_classes: iterable of rule strings such as ``'div.sidebar'``
+        or ``'*.hidden'``; ``None`` or an empty iterable means "no rules".
+    :return: dict mapping element name to the set of excluded class names.
+    :raises ValueError: if a non-blank rule contains no dot.
+    """
+    parsed = {}
+    # Accept None so callers need not special-case a missing configuration.
+    for rule in excluded_classes or ():
+        rule = rule.strip()
+        if not rule:
+            continue  # ignore blank entries, e.g. left over from a trailing separator
+        element, separator, cls = rule.partition('.')
+        if not separator:
+            raise ValueError(f"Invalid excluded class rule {rule!r}: expected 'element.class'")
+        parsed.setdefault(element, set()).add(cls)
+    return parsed
+
+
+def should_exclude_element(element, excluded_elements, excluded_classes):
+    """Tell whether ``element`` must be left out of the extracted HTML.
+
+    An element is excluded when it sits inside one of ``excluded_elements``
+    or when the class-based rules in ``excluded_classes`` match it or one of
+    its ancestors.
+    """
+    nested_in_excluded_tag = excluded_elements and element.find_parent(excluded_elements)
+    if not nested_in_excluded_tag:
+        return is_element_excluded_by_class(element, excluded_classes)
+    return True
+
+
+def is_element_excluded_by_class(element, excluded_classes):
+    """Check ``element`` and its ancestors against the class exclusion rules.
+
+    Ancestors are tested first, then the element itself; any match excludes
+    the element.
+    """
+    if any(element_matches_exclusion(ancestor, excluded_classes) for ancestor in element.parents):
+        return True
+    return element_matches_exclusion(element, excluded_classes)
+
+
+def element_matches_exclusion(element, excluded_classes):
+    """Return True when one of the element's classes is excluded.
+
+    Rules stored under the ``'*'`` key apply to every element; rules stored
+    under the element's own tag name apply to that tag only.
+    """
+    for selector in ('*', element.name):
+        if selector not in excluded_classes:
+            continue
+        banned = excluded_classes[selector]
+        for css_class in element.get('class', []):
+            if css_class in banned:
+                return True
+    return False
+
+
+def extract_element_content(element):
+    """Render ``element`` as a flat ``<name>text</name>`` line.
+
+    The element's text fragments are joined with single spaces; nested markup
+    and attributes are dropped. The text is HTML-escaped so that literal
+    ``<``, ``>`` or ``&`` characters cannot be mistaken for markup when the
+    extracted HTML is parsed again.
+
+    :param element: object exposing ``name`` and ``stripped_strings``
+        (e.g. a BeautifulSoup tag).
+    :return: one line of HTML, terminated by a newline.
+    """
+    # stripped_strings already yields whitespace-trimmed strings, so no extra strip is needed.
+    text = ' '.join(element.stripped_strings)
+    return f'<{element.name}>{html.escape(text, quote=False)}</{element.name}>\n'
+
+
+def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
+    """Write the HTML parsing configuration to the embed-tuning log.
+
+    Nothing is logged unless ``tenant.embed_tuning`` is truthy.
+    """
    if tenant.embed_tuning:
        current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
        current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
-        current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
        current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
+        current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
        current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
-        current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
+        # The included-element search may match nothing; indexing an empty
+        # result would raise IndexError just for a debug message.
+        if elements_to_parse:
+            current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')

-    # Iterate through the found included elements
-    for element in elements_to_parse:
-        # Find all specified tags within each included element
-        for sub_element in element.find_all(tags):
-            if tenant.embed_tuning:
-                current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
-            if excluded_elements and sub_element.find_parent(excluded_elements):
-                continue  # Skip this sub_element if it's within any of the excluded_elements
-            extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
-
-    title = soup.find('title').get_text(strip=True)
-
-    return extracted_html, title
-

 def process_youtube(tenant, model_variables, document_version):
    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
@@ -414,8 +529,9 @@ def download_youtube(url, tenant_id, document_version, file_name):
            with open(temp_file.name, 'rb') as f:
                file_data = f.read()

-        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
-                             file_name, file_data)
+        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                          document_version.id,
+                                          file_name, file_data)

        current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
        return file_name, yt.title, yt.description, yt.author
@@ -429,7 +545,7 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
        current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')

        input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                            document_version.id, input_file)
+                                                         document_version.id, input_file)

        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
            temp_input.write(input_data)
@@ -448,8 +564,9 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
                with open(temp_output.name, 'rb') as f:
                    compressed_data = f.read()

-        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
-                             output_file, compressed_data)
+        minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                          document_version.id,
+                                          output_file, compressed_data)

        current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
    except Exception as e:
@@ -465,7 +582,7 @@ def transcribe_audio(tenant_id, document_version, input_file, output_file, model

        # Download the audio file from MinIO
        audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
-                                            document_version.id, input_file)
+                                                         document_version.id, input_file)

        # Load the audio data into pydub
        audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
@@ -649,6 +766,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):

    return actual_chunks
    pass
-
-
-