Added excluded element classes to HTML parsing to support parsing of more complex documents

Added chunking to the HTML-to-Markdown conversion to handle large files
Josako
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions
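
For context: the new exclusion rules are comma-separated 'tag.class' entries, with '*' as a wildcard tag (see parse_excluded_classes in the diff below). A hypothetical tenant configuration might look like this; the class names are illustrative, not from the commit:

    html_excluded_classes = ['nav.menu', 'div.sidebar', '*.advertisement']
    # parse_html will then skip any <nav class="menu">, any <div class="sidebar">,
    # any element carrying class="advertisement", and everything nested inside them.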

View File

@@ -35,10 +35,11 @@ class Tenant(db.Model):
html_end_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'li'])
html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)
min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)
# Embedding search variables
es_k = db.Column(db.Integer, nullable=True, default=5)
es_similarity_threshold = db.Column(db.Float, nullable=True, default=0.7)
@@ -80,6 +81,7 @@ class Tenant(db.Model):
'html_end_tags': self.html_end_tags,
'html_included_elements': self.html_included_elements,
'html_excluded_elements': self.html_excluded_elements,
'html_excluded_classes': self.html_excluded_classes,
'min_chunk_size': self.min_chunk_size,
'max_chunk_size': self.max_chunk_size,
'es_k': self.es_k,

View File

@@ -86,6 +86,7 @@ def select_model_variables(tenant):
model_variables['html_end_tags'] = tenant.html_end_tags
model_variables['html_included_elements'] = tenant.html_included_elements
model_variables['html_excluded_elements'] = tenant.html_excluded_elements
model_variables['html_excluded_classes'] = tenant.html_excluded_classes
# Set Chunk Size variables
model_variables['min_chunk_size'] = tenant.min_chunk_size

View File

@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
default='p, li')
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
# Embedding Search variables

View File

@@ -68,6 +68,8 @@ def tenant():
if form.html_included_elements.data else []
new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
if form.html_excluded_elements.data else []
new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \
if form.html_excluded_classes.data else []
current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
f'html_end_tags: {new_tenant.html_end_tags},'
@@ -123,6 +125,8 @@ def edit_tenant(tenant_id):
form.html_included_elements.data = ', '.join(tenant.html_included_elements)
if tenant.html_excluded_elements:
form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements)
if tenant.html_excluded_classes:
form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes)
if form.validate_on_submit():
# Populate the tenant with form data
@@ -134,6 +138,8 @@ def edit_tenant(tenant_id):
elem.strip()]
tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if
elem.strip()]
tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if
elem.strip()]
db.session.commit()
flash('Tenant updated successfully.', 'success')
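As an aside, the comma-separated classes field round-trips between the form and the ARRAY column roughly like this (illustrative values, mirroring the join/split logic above):

    stored = ['div.sidebar', '*.advertisement']        # ARRAY column value
    field_value = ', '.join(stored)                    # rendered into the StringField
    parsed = [cls.strip() for cls in field_value.split(',') if cls.strip()]
    assert parsed == stored                            # saved back unchanged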

View File

@@ -3,7 +3,6 @@ import os
from datetime import datetime as dt, timezone as tz
import subprocess
import gevent
from bs4 import BeautifulSoup
import html
@@ -12,6 +11,7 @@ from flask import current_app
# OpenAI imports
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
@@ -105,7 +105,7 @@ def create_embeddings(tenant_id, document_version_id):
def process_pdf(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
pdf_text = ''
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
@@ -114,8 +114,9 @@ def process_pdf(tenant, model_variables, document_version):
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
markdown_file_name = f'{document_version.id}.md'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- markdown_file_name, markdown.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -160,7 +161,7 @@ def delete_embeddings_for_document_version(document_version):
def process_html(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
html_content = file_data.decode('utf-8')
# The tags to be considered can be dependent on the tenant
@@ -173,13 +174,15 @@ def process_html(tenant, model_variables, document_version):
excluded_elements=html_excluded_elements)
extracted_file_name = f'{document_version.id}-extracted.html'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- extracted_file_name, extracted_html.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ extracted_file_name, extracted_html.encode())
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
markdown_file_name = f'{document_version.id}.md'
- minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
- markdown_file_name, markdown.encode())
+ minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+ document_version.id,
+ markdown_file_name, markdown.encode())
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
@@ -235,19 +238,94 @@ def enrich_chunks(tenant, document_version, title, chunks):
return enriched_chunks
# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
# current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
# f'on document version {document_version.id}')
# llm = model_variables['llm']
# template = model_variables['html_parse_template']
# parse_prompt = ChatPromptTemplate.from_template(template)
# setup = RunnablePassthrough()
# output_parser = StrOutputParser()
#
# chain = setup | parse_prompt | llm | output_parser
# input_html = {"html": html_content}
#
# markdown = chain.invoke(input_html)
#
# current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
# f'on document version {document_version.id}')
#
# return markdown
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
- current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+ current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['html_parse_template']
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | parse_prompt | llm | output_parser
- input_html = {"html": html_content}
- markdown = chain.invoke(input_html)
soup = BeautifulSoup(html_content, 'lxml')
def split_content(soup, max_size=20000):
chunks = []
current_chunk = []
current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
element_html = str(element)
element_size = len(element_html)
if current_size + element_size > max_size and current_chunk:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
current_chunk.append(element)
current_size += element_size
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
if current_chunk:
chunks.append(''.join(map(str, current_chunk)))
return chunks
chunks = split_content(soup)
markdown_chunks = []
for chunk in chunks:
current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
f'------------------\n'
f'{chunk}\n'
f'------------------\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
f'-------------------------\n'
f'{markdown_chunk}\n'
f'-------------------------\n')
current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
# Combine all markdown chunks
markdown = "\n\n".join(markdown_chunks)
current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
return markdown
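A quick, self-contained sketch of the size-accounting idea in split_content above, using a tiny max_size so the flushes are visible (assumes bs4 and lxml are installed; the heading-flush branch is omitted here):

    from bs4 import BeautifulSoup

    html = '<h1>A</h1><p>' + 'x' * 30 + '</p><h2>B</h2><p>tail</p>'
    soup = BeautifulSoup(html, 'lxml')
    chunks, current, size = [], [], 0
    for el in soup.find_all(['h1', 'h2', 'p']):
        el_html = str(el)
        if size + len(el_html) > 40 and current:       # max_size=40 instead of 20000
            chunks.append(''.join(map(str, current)))
            current, size = [], 0
        current.append(el)
        size += len(el_html)
    if current:
        chunks.append(''.join(map(str, current)))
    print([len(c) for c in chunks])                    # [10, 37, 21]

Each LLM call then sees one bounded chunk instead of the whole document, which is the point of the change.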
@@ -324,36 +402,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)
if included_elements:
elements_to_parse = soup.find_all(included_elements)
else:
- elements_to_parse = [soup] # parse the entire document if no included_elements specified
+ elements_to_parse = [soup]
log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
for element in elements_to_parse:
for sub_element in element.find_all(tags):
if should_exclude_element(sub_element, excluded_elements, excluded_classes):
continue
extracted_html += extract_element_content(sub_element)
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
return extracted_html, title
def parse_excluded_classes(excluded_classes):
parsed = {}
for rule in excluded_classes:
element, cls = rule.split('.', 1)
parsed.setdefault(element, set()).add(cls)
return parsed
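For illustration, the helper above turns 'tag.class' rules into a per-tag lookup (note each rule must contain a dot, or the split raises ValueError):

    rules = ['div.sidebar', 'div.footer', '*.advertisement']
    parse_excluded_classes(rules)
    # -> {'div': {'sidebar', 'footer'}, '*': {'advertisement'}}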
def should_exclude_element(element, excluded_elements, excluded_classes):
if excluded_elements and element.find_parent(excluded_elements):
return True
return is_element_excluded_by_class(element, excluded_classes)
def is_element_excluded_by_class(element, excluded_classes):
for parent in element.parents:
if element_matches_exclusion(parent, excluded_classes):
return True
return element_matches_exclusion(element, excluded_classes)
def element_matches_exclusion(element, excluded_classes):
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
return True
return element.name in excluded_classes and \
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
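A small sketch of how the exclusion check plays out on hypothetical markup (assumes the helpers above are in scope):

    from bs4 import BeautifulSoup

    excluded = {'div': {'sidebar'}, '*': {'advertisement'}}
    soup = BeautifulSoup('<div class="sidebar"><p>skipped</p></div>'
                         '<p class="advertisement">also skipped</p>'
                         '<p>kept</p>', 'html.parser')
    for p in soup.find_all('p'):
        print(p.get_text(), should_exclude_element(p, None, excluded))
    # skipped True        (parent <div class="sidebar"> matches div.sidebar)
    # also skipped True   (the element itself matches *.advertisement)
    # kept False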
def extract_element_content(element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
- current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
+ current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
- # Iterate through the found included elements
- for element in elements_to_parse:
- # Find all specified tags within each included element
- for sub_element in element.find_all(tags):
- if tenant.embed_tuning:
- current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
- if excluded_elements and sub_element.find_parent(excluded_elements):
- continue # Skip this sub_element if it's within any of the excluded_elements
- extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
- title = soup.find('title').get_text(strip=True)
- return extracted_html, title
def process_youtube(tenant, model_variables, document_version):
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
@@ -414,8 +529,9 @@ def download_youtube(url, tenant_id, document_version, file_name):
with open(temp_file.name, 'rb') as f:
file_data = f.read()
- minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
- file_name, file_data)
+ minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+ document_version.id,
+ file_name, file_data)
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
return file_name, yt.title, yt.description, yt.author
@@ -429,7 +545,7 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
temp_input.write(input_data)
@@ -448,8 +564,9 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
with open(temp_output.name, 'rb') as f:
compressed_data = f.read()
- minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
- output_file, compressed_data)
+ minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+ document_version.id,
+ output_file, compressed_data)
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
except Exception as e:
@@ -465,7 +582,7 @@ def transcribe_audio(tenant_id, document_version, input_file, output_file, model
# Download the audio file from MinIO
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
# Load the audio data into pydub
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
@@ -649,6 +766,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
return actual_chunks
- pass

View File

@@ -3,7 +3,7 @@
Plugin Name: EveAI Chat Widget
Plugin URI: https://askeveai.com/
Description: Integrates the EveAI chat interface into your WordPress site.
- Version: 1.3.21
+ Version: 1.3.23
Author: Josako, Pieter Laroy
Author URI: https://askeveai.com/about/
*/

View File

@@ -161,24 +161,32 @@ class EveAIChatWidget extends HTMLElement {
this.socket.on('connect', (data) => {
console.log('Socket connected OK');
console.log('Connect event data:', data);
console.log('Connect event this:', this);
this.setStatusMessage('Connected to EveAI.');
this.updateConnectionStatus(true);
this.startHeartbeat();
- if (data.room) {
+ if (data && data.room) {
this.room = data.room;
console.log(`Joined room: ${this.room}`);
} else {
console.log('Room information not received on connect');
}
});
this.socket.on('authenticated', (data) => {
- console.log('Authenticated event received: ', data);
+ console.log('Authenticated event received');
+ console.log('Authentication event data:', data);
+ console.log('Authentication event this:', this);
this.setStatusMessage('Authenticated.');
- if (data.token) {
- this.jwtToken = data.token; // Store the JWT token received from the server
+ if (data && data.token) {
+ this.jwtToken = data.token;
}
- if (data.room) {
+ if (data && data.room) {
this.room = data.room;
console.log(`Confirmed room: ${this.room}`);
} else {
console.log('Room information not received on authentication');
}
});

View File

@@ -386,6 +386,7 @@ input[type="radio"] {
.btn-danger:hover {
background-color: darken(var(--bs-danger), 10%) !important; /* Darken the background on hover */
border-color: darken(var(--bs-danger), 10%) !important; /* Darken the border on hover */
color: var(--bs-white) !important; /* Ensure the text remains white and readable */
}
/* Success Alert Styling */

View File

@@ -74,3 +74,5 @@ Werkzeug~=3.0.3
itsdangerous~=2.2.0
cryptography~=43.0.0
graypy~=2.1.0
lxml~=5.3.0
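
lxml is presumably pinned here because the new chunking path parses with the lxml backend (BeautifulSoup(html_content, 'lxml') above); without it, bs4 fails at parse time:

    from bs4 import BeautifulSoup
    BeautifulSoup('<p>ok</p>', 'lxml')   # raises bs4.FeatureNotFound if lxml is missing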