Added excluded element classes to HTML parsing to support parsing of more complex documents

Added chunking to the HTML-to-Markdown conversion to handle large files
Josako
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions
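
For context: the new exclusion rules are comma-separated 'tag.class' entries, with '*' as a wildcard tag (see parse_excluded_classes in the diff below). A hypothetical tenant configuration might look like this; the class names are illustrative, not from the commit:

    html_excluded_classes = ['nav.menu', 'div.sidebar', '*.advertisement']
    # parse_html will then skip any <nav class="menu">, any <div class="sidebar">,
    # any element carrying class="advertisement", and everything nested inside them.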

View File

@@ -35,10 +35,11 @@ class Tenant(db.Model):
html_end_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'li'])
html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)
min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)
# Embedding search variables
es_k = db.Column(db.Integer, nullable=True, default=5)
es_similarity_threshold = db.Column(db.Float, nullable=True, default=0.7)
@@ -80,6 +81,7 @@ class Tenant(db.Model):
'html_end_tags': self.html_end_tags,
'html_included_elements': self.html_included_elements,
'html_excluded_elements': self.html_excluded_elements,
'html_excluded_classes': self.html_excluded_classes,
'min_chunk_size': self.min_chunk_size,
'max_chunk_size': self.max_chunk_size,
'es_k': self.es_k,

View File

@@ -86,6 +86,7 @@ def select_model_variables(tenant):
model_variables['html_end_tags'] = tenant.html_end_tags
model_variables['html_included_elements'] = tenant.html_included_elements
model_variables['html_excluded_elements'] = tenant.html_excluded_elements
model_variables['html_excluded_classes'] = tenant.html_excluded_classes
# Set Chunk Size variables
model_variables['min_chunk_size'] = tenant.min_chunk_size

View File

@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
default='p, li')
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
# Embedding Search variables

View File

@@ -68,6 +68,8 @@ def tenant():
if form.html_included_elements.data else []
new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
if form.html_excluded_elements.data else []
new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \
if form.html_excluded_classes.data else []
current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
f'html_end_tags: {new_tenant.html_end_tags},'
@@ -123,6 +125,8 @@ def edit_tenant(tenant_id):
form.html_included_elements.data = ', '.join(tenant.html_included_elements)
if tenant.html_excluded_elements:
form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements)
if tenant.html_excluded_classes:
form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes)
if form.validate_on_submit():
# Populate the tenant with form data
@@ -134,6 +138,8 @@ def edit_tenant(tenant_id):
elem.strip()]
tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if
elem.strip()]
tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if
elem.strip()]
db.session.commit()
flash('Tenant updated successfully.', 'success')
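As an aside, the comma-separated classes field round-trips between the form and the ARRAY column roughly like this (illustrative values, mirroring the join/split logic above):

    stored = ['div.sidebar', '*.advertisement']        # ARRAY column value
    field_value = ', '.join(stored)                    # rendered into the StringField
    parsed = [cls.strip() for cls in field_value.split(',') if cls.strip()]
    assert parsed == stored                            # saved back unchanged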

View File

@@ -3,7 +3,6 @@ import os
from datetime import datetime as dt, timezone as tz
import subprocess
import gevent
from bs4 import BeautifulSoup
import html
@@ -12,6 +11,7 @@ from flask import current_app
# OpenAI imports
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
@@ -105,7 +105,7 @@ def create_embeddings(tenant_id, document_version_id):
def process_pdf(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
pdf_text = ''
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
@@ -114,8 +114,9 @@ def process_pdf(tenant, model_variables, document_version):
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
markdown_file_name = f'{document_version.id}.md'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- markdown_file_name, markdown.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -160,7 +161,7 @@ def delete_embeddings_for_document_version(document_version):
def process_html(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
html_content = file_data.decode('utf-8')
# The tags to be considered can be dependent on the tenant
@@ -173,13 +174,15 @@ def process_html(tenant, model_variables, document_version):
excluded_elements=html_excluded_elements)
extracted_file_name = f'{document_version.id}-extracted.html'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- extracted_file_name, extracted_html.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ extracted_file_name, extracted_html.encode())
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
markdown_file_name = f'{document_version.id}.md'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- markdown_file_name, markdown.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -235,19 +238,94 @@ def enrich_chunks(tenant, document_version, title, chunks):
return enriched_chunks
# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
# current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
# f'on document version {document_version.id}')
# llm = model_variables['llm']
# template = model_variables['html_parse_template']
# parse_prompt = ChatPromptTemplate.from_template(template)
# setup = RunnablePassthrough()
# output_parser = StrOutputParser()
#
# chain = setup | parse_prompt | llm | output_parser
# input_html = {"html": html_content}
#
# markdown = chain.invoke(input_html)
#
# current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
# f'on document version {document_version.id}')
#
# return markdown
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
- current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+ current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['html_parse_template']
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | parse_prompt | llm | output_parser
- input_html = {"html": html_content}
- markdown = chain.invoke(input_html)
soup = BeautifulSoup(html_content, 'lxml')
def split_content(soup, max_size=20000):
chunks = []
current_chunk = []
current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
element_html = str(element)
element_size = len(element_html)
if current_size + element_size > max_size and current_chunk:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
current_chunk.append(element)
current_size += element_size
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
if current_chunk:
chunks.append(''.join(map(str, current_chunk)))
return chunks
chunks = split_content(soup)
markdown_chunks = []
for chunk in chunks:
current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
f'------------------\n'
f'{chunk}\n'
f'------------------\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
f'-------------------------\n'
f'{markdown_chunk}\n'
f'-------------------------\n')
current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
# Combine all markdown chunks
markdown = "\n\n".join(markdown_chunks)
current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
return markdown
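A quick, self-contained sketch of the size-accounting idea in split_content above, using a tiny max_size so the flushes are visible (assumes bs4 and lxml are installed; the heading-flush branch is omitted here):

    from bs4 import BeautifulSoup

    html = '<h1>A</h1><p>' + 'x' * 30 + '</p><h2>B</h2><p>tail</p>'
    soup = BeautifulSoup(html, 'lxml')
    chunks, current, size = [], [], 0
    for el in soup.find_all(['h1', 'h2', 'p']):
        el_html = str(el)
        if size + len(el_html) > 40 and current:       # max_size=40 instead of 20000
            chunks.append(''.join(map(str, current)))
            current, size = [], 0
        current.append(el)
        size += len(el_html)
    if current:
        chunks.append(''.join(map(str, current)))
    print([len(c) for c in chunks])                    # [10, 37, 21]

Each LLM call then sees one bounded chunk instead of the whole document, which is the point of the change.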
@@ -324,36 +402,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)
if included_elements:
elements_to_parse = soup.find_all(included_elements)
else:
- elements_to_parse = [soup] # parse the entire document if no included_elements specified
+ elements_to_parse = [soup]
log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
for element in elements_to_parse:
for sub_element in element.find_all(tags):
if should_exclude_element(sub_element, excluded_elements, excluded_classes):
continue
extracted_html += extract_element_content(sub_element)
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
return extracted_html, title
def parse_excluded_classes(excluded_classes):
parsed = {}
for rule in excluded_classes:
element, cls = rule.split('.', 1)
parsed.setdefault(element, set()).add(cls)
return parsed
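For illustration, the helper above turns 'tag.class' rules into a per-tag lookup (note each rule must contain a dot, or the split raises ValueError):

    rules = ['div.sidebar', 'div.footer', '*.advertisement']
    parse_excluded_classes(rules)
    # -> {'div': {'sidebar', 'footer'}, '*': {'advertisement'}}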
def should_exclude_element(element, excluded_elements, excluded_classes):
if excluded_elements and element.find_parent(excluded_elements):
return True
return is_element_excluded_by_class(element, excluded_classes)
def is_element_excluded_by_class(element, excluded_classes):
for parent in element.parents:
if element_matches_exclusion(parent, excluded_classes):
return True
return element_matches_exclusion(element, excluded_classes)
def element_matches_exclusion(element, excluded_classes):
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
return True
return element.name in excluded_classes and \
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
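A small sketch of how the exclusion check plays out on hypothetical markup (assumes the helpers above are in scope):

    from bs4 import BeautifulSoup

    excluded = {'div': {'sidebar'}, '*': {'advertisement'}}
    soup = BeautifulSoup('<div class="sidebar"><p>skipped</p></div>'
                         '<p class="advertisement">also skipped</p>'
                         '<p>kept</p>', 'html.parser')
    for p in soup.find_all('p'):
        print(p.get_text(), should_exclude_element(p, None, excluded))
    # skipped True        (parent <div class="sidebar"> matches div.sidebar)
    # also skipped True   (the element itself matches *.advertisement)
    # kept False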
def extract_element_content(element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
- current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
+ current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
- # Iterate through the found included elements
- for element in elements_to_parse:
- # Find all specified tags within each included element
- for sub_element in element.find_all(tags):
- if tenant.embed_tuning:
- current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
- if excluded_elements and sub_element.find_parent(excluded_elements):
- continue # Skip this sub_element if it's within any of the excluded_elements
- extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
- title = soup.find('title').get_text(strip=True)
- return extracted_html, title
def process_youtube(tenant, model_variables, document_version):
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
@@ -414,8 +529,9 @@ def download_youtube(url, tenant_id, document_version, file_name):
with open(temp_file.name, 'rb') as f:
file_data = f.read()
- minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
- file_name, file_data)
+ minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+ document_version.id,
+ file_name, file_data)
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
return file_name, yt.title, yt.description, yt.author
@@ -429,7 +545,7 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
temp_input.write(input_data)
@@ -448,8 +564,9 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
with open(temp_output.name, 'rb') as f:
compressed_data = f.read()
- minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
- output_file, compressed_data)
+ minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+ document_version.id,
+ output_file, compressed_data)
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
except Exception as e:
@@ -465,7 +582,7 @@ def transcribe_audio(tenant_id, document_version, input_file, output_file, model
# Download the audio file from MinIO
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
# Load the audio data into pydub
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
@@ -649,6 +766,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
return actual_chunks
- pass

View File

@@ -3,7 +3,7 @@
Plugin Name: EveAI Chat Widget
Plugin URI: https://askeveai.com/
Description: Integrates the EveAI chat interface into your WordPress site.
- Version: 1.3.21
+ Version: 1.3.23
Author: Josako, Pieter Laroy
Author URI: https://askeveai.com/about/
*/

View File

@@ -161,24 +161,32 @@ class EveAIChatWidget extends HTMLElement {
this.socket.on('connect', (data) => {
console.log('Socket connected OK');
console.log('Connect event data:', data);
console.log('Connect event this:', this);
this.setStatusMessage('Connected to EveAI.');
this.updateConnectionStatus(true);
this.startHeartbeat();
- if (data.room) {
+ if (data && data.room) {
this.room = data.room;
console.log(`Joined room: ${this.room}`);
} else {
console.log('Room information not received on connect');
}
});
this.socket.on('authenticated', (data) => {
- console.log('Authenticated event received: ', data);
+ console.log('Authenticated event received');
+ console.log('Authentication event data:', data);
+ console.log('Authentication event this:', this);
this.setStatusMessage('Authenticated.');
- if (data.token) {
- this.jwtToken = data.token; // Store the JWT token received from the server
+ if (data && data.token) {
+ this.jwtToken = data.token;
}
- if (data.room) {
+ if (data && data.room) {
this.room = data.room;
console.log(`Confirmed room: ${this.room}`);
} else {
console.log('Room information not received on authentication');
}
});

View File

@@ -386,6 +386,7 @@ input[type="radio"] {
.btn-danger:hover {
background-color: darken(var(--bs-danger), 10%) !important; /* Darken the background on hover */
border-color: darken(var(--bs-danger), 10%) !important; /* Darken the border on hover */
color: var(--bs-white) !important; /* Ensure the text remains white and readable */
}
/* Success Alert Styling */

View File

@@ -74,3 +74,5 @@ Werkzeug~=3.0.3
itsdangerous~=2.2.0
cryptography~=43.0.0
graypy~=2.1.0
lxml~=5.3.0
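
lxml is presumably pinned here because the new chunking path parses with the lxml backend (BeautifulSoup(html_content, 'lxml') above); without it, bs4 fails at parse time:

    from bs4 import BeautifulSoup
    BeautifulSoup('<p>ok</p>', 'lxml')   # raises bs4.FeatureNotFound if lxml is missing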