Added per-element excluded CSS classes to HTML parsing to support more complex documents
Added chunking to the HTML-to-Markdown conversion to handle large files
This commit is contained in:
@@ -35,10 +35,11 @@ class Tenant(db.Model):
|
||||
html_end_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'li'])
|
||||
html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
|
||||
html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
|
||||
html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)
|
||||
|
||||
min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
|
||||
max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)
|
||||
|
||||
|
||||
# Embedding search variables
|
||||
es_k = db.Column(db.Integer, nullable=True, default=5)
|
||||
es_similarity_threshold = db.Column(db.Float, nullable=True, default=0.7)
|
||||
@@ -80,6 +81,7 @@ class Tenant(db.Model):
|
||||
'html_end_tags': self.html_end_tags,
|
||||
'html_included_elements': self.html_included_elements,
|
||||
'html_excluded_elements': self.html_excluded_elements,
|
||||
'html_excluded_classes': self.html_excluded_classes,
|
||||
'min_chunk_size': self.min_chunk_size,
|
||||
'max_chunk_size': self.max_chunk_size,
|
||||
'es_k': self.es_k,
|
||||
|
||||
@@ -86,6 +86,7 @@ def select_model_variables(tenant):
|
||||
model_variables['html_end_tags'] = tenant.html_end_tags
|
||||
model_variables['html_included_elements'] = tenant.html_included_elements
|
||||
model_variables['html_excluded_elements'] = tenant.html_excluded_elements
|
||||
model_variables['html_excluded_classes'] = tenant.html_excluded_classes
|
||||
|
||||
# Set Chunk Size variables
|
||||
model_variables['min_chunk_size'] = tenant.min_chunk_size
|
||||
|
||||
@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
|
||||
default='p, li')
|
||||
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
|
||||
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
|
||||
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
|
||||
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
|
||||
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
|
||||
# Embedding Search variables
|
||||
|
||||
@@ -68,6 +68,8 @@ def tenant():
|
||||
if form.html_included_elements.data else []
|
||||
new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
|
||||
if form.html_excluded_elements.data else []
|
||||
new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \
|
||||
if form.html_excluded_classes.data else []
|
||||
|
||||
current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
|
||||
f'html_end_tags: {new_tenant.html_end_tags},'
|
||||
@@ -123,6 +125,8 @@ def edit_tenant(tenant_id):
|
||||
form.html_included_elements.data = ', '.join(tenant.html_included_elements)
|
||||
if tenant.html_excluded_elements:
|
||||
form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements)
|
||||
if tenant.html_excluded_classes:
|
||||
form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes)
|
||||
|
||||
if form.validate_on_submit():
|
||||
# Populate the tenant with form data
|
||||
@@ -134,6 +138,8 @@ def edit_tenant(tenant_id):
|
||||
elem.strip()]
|
||||
tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if
|
||||
elem.strip()]
|
||||
tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if
|
||||
elem.strip()]
|
||||
|
||||
db.session.commit()
|
||||
flash('Tenant updated successfully.', 'success')
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
import subprocess
|
||||
|
||||
|
||||
import gevent
|
||||
from bs4 import BeautifulSoup
|
||||
import html
|
||||
@@ -12,6 +11,7 @@ from flask import current_app
|
||||
# OpenAI imports
|
||||
from langchain.chains.summarize import load_summarize_chain
|
||||
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.exceptions import LangChainException
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
@@ -105,7 +105,7 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
|
||||
def process_pdf(tenant, model_variables, document_version):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
document_version.id, document_version.file_name)
|
||||
|
||||
pdf_text = ''
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
|
||||
@@ -114,8 +114,9 @@ def process_pdf(tenant, model_variables, document_version):
|
||||
|
||||
markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
@@ -160,7 +161,7 @@ def delete_embeddings_for_document_version(document_version):
|
||||
|
||||
def process_html(tenant, model_variables, document_version):
|
||||
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id, document_version.file_name)
|
||||
document_version.id, document_version.file_name)
|
||||
html_content = file_data.decode('utf-8')
|
||||
|
||||
# The tags to be considered can be dependent on the tenant
|
||||
@@ -173,13 +174,15 @@ def process_html(tenant, model_variables, document_version):
|
||||
excluded_elements=html_excluded_elements)
|
||||
|
||||
extracted_file_name = f'{document_version.id}-extracted.html'
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
extracted_file_name, extracted_html.encode())
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
extracted_file_name, extracted_html.encode())
|
||||
|
||||
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
|
||||
markdown_file_name = f'{document_version.id}.md'
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
markdown_file_name, markdown.encode())
|
||||
|
||||
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
|
||||
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
|
||||
@@ -235,19 +238,94 @@ def enrich_chunks(tenant, document_version, title, chunks):
|
||||
return enriched_chunks
|
||||
|
||||
|
||||
# def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
|
||||
# current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
|
||||
# f'on document version {document_version.id}')
|
||||
# llm = model_variables['llm']
|
||||
# template = model_variables['html_parse_template']
|
||||
# parse_prompt = ChatPromptTemplate.from_template(template)
|
||||
# setup = RunnablePassthrough()
|
||||
# output_parser = StrOutputParser()
|
||||
#
|
||||
# chain = setup | parse_prompt | llm | output_parser
|
||||
# input_html = {"html": html_content}
|
||||
#
|
||||
# markdown = chain.invoke(input_html)
|
||||
#
|
||||
# current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
|
||||
# f'on document version {document_version.id}')
|
||||
#
|
||||
# return markdown
|
||||
|
||||
|
||||
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
|
||||
current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
|
||||
current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
|
||||
llm = model_variables['llm']
|
||||
template = model_variables['html_parse_template']
|
||||
parse_prompt = ChatPromptTemplate.from_template(template)
|
||||
setup = RunnablePassthrough()
|
||||
output_parser = StrOutputParser()
|
||||
|
||||
chain = setup | parse_prompt | llm | output_parser
|
||||
input_html = {"html": html_content}
|
||||
|
||||
markdown = chain.invoke(input_html)
|
||||
soup = BeautifulSoup(html_content, 'lxml')
|
||||
|
||||
def split_content(soup, max_size=20000):
    """Split the parsed document into HTML strings of roughly ``max_size`` characters.

    Block-level elements are collected in document order and concatenated
    until adding the next one would exceed ``max_size``; the accumulated
    markup is then emitted as one chunk.

    Only the outermost matching elements are used. ``find_all`` also returns
    matches nested inside other matches (e.g. a ``<p>`` inside a ``<div>``),
    and serializing both would duplicate the nested markup in the output.

    :param soup: parsed document exposing ``find_all``.
    :param max_size: soft upper bound, in characters, for a chunk. A single
        element larger than this is still emitted whole.
    :return: list of HTML strings, in document order.
    """
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']
    heading_tags = ('h1', 'h2', 'h3')

    chunks = []
    current_parts = []
    current_size = 0

    for element in soup.find_all(block_tags):
        # Already covered by the serialization of a matching ancestor.
        if element.find_parent(block_tags):
            continue

        # Serialize once and reuse the string for both sizing and output.
        element_html = str(element)
        element_size = len(element_html)

        if current_parts and current_size + element_size > max_size:
            chunks.append(''.join(current_parts))
            current_parts = []
            current_size = 0

        current_parts.append(element_html)
        current_size += element_size

        # An oversized major heading is flushed on its own right away.
        if element.name in heading_tags and current_size > max_size:
            chunks.append(''.join(current_parts))
            current_parts = []
            current_size = 0

    if current_parts:
        chunks.append(''.join(current_parts))

    return chunks
|
||||
|
||||
chunks = split_content(soup)
|
||||
|
||||
markdown_chunks = []
|
||||
|
||||
for chunk in chunks:
|
||||
current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
if tenant.embed_tuning:
|
||||
current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
|
||||
f'------------------\n'
|
||||
f'{chunk}\n'
|
||||
f'------------------\n')
|
||||
input_html = {"html": chunk}
|
||||
markdown_chunk = chain.invoke(input_html)
|
||||
markdown_chunks.append(markdown_chunk)
|
||||
if tenant.embed_tuning:
|
||||
current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
|
||||
f'-------------------------\n'
|
||||
f'{markdown_chunk}\n'
|
||||
f'-------------------------\n')
|
||||
current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
|
||||
# Combine all markdown chunks
|
||||
markdown = "\n\n".join(markdown_chunks)
|
||||
|
||||
current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
|
||||
f'on document version {document_version.id}')
|
||||
|
||||
return markdown
|
||||
|
||||
@@ -324,36 +402,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
|
||||
|
||||
|
||||
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
    """Extract the text of the requested tags from an HTML document.

    :param tenant: tenant whose ``id`` is logged and whose
        ``html_excluded_classes`` rules are applied.
    :param html_content: HTML source to parse.
    :param tags: tag names whose content is extracted.
    :param included_elements: optional tag names restricting the search to
        their subtrees; when empty or None the whole document is searched.
    :param excluded_elements: optional tag names; any match located inside
        one of them is skipped.
    :return: ``(extracted_html, title)`` where ``extracted_html`` is the
        concatenation of one ``<tag>text</tag>`` line per kept element and
        ``title`` is the text of ``<title>`` or ``''`` when it is missing.
    """
    current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
    soup = BeautifulSoup(html_content, 'html.parser')
    excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)

    if included_elements:
        elements_to_parse = soup.find_all(included_elements)
    else:
        # Parse the entire document if no included_elements are specified.
        elements_to_parse = [soup]

    log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)

    # Collect fragments and join once instead of repeated string concatenation.
    fragments = []
    for element in elements_to_parse:
        for sub_element in element.find_all(tags):
            if should_exclude_element(sub_element, excluded_elements, excluded_classes):
                continue
            fragments.append(extract_element_content(sub_element))
    extracted_html = ''.join(fragments)

    # Look the title up once; it may be absent.
    title_tag = soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else ''

    current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')

    return extracted_html, title
|
||||
|
||||
|
||||
def parse_excluded_classes(excluded_classes):
    """Group ``element.class`` exclusion rules by element name.

    Each rule is split on its first ``.``: the part before it becomes the
    dictionary key and the remainder is added to that key's set of classes.

    :param excluded_classes: iterable of ``'element.class'`` strings, or
        None / empty when no rules are configured.
    :return: dict mapping element name to the set of class names given for
        it; empty when there are no usable rules.
    """
    parsed = {}
    # The rules may be None when nothing has been configured.
    for rule in excluded_classes or ():
        element, separator, cls = rule.partition('.')
        if not separator:
            # Not in 'element.class' form: skip it rather than fail the
            # whole document on one malformed rule.
            continue
        parsed.setdefault(element, set()).add(cls)
    return parsed
|
||||
|
||||
|
||||
def should_exclude_element(element, excluded_elements, excluded_classes):
    """Decide whether ``element`` must be left out of the extraction.

    An element is excluded when it sits inside one of ``excluded_elements``
    or when the class-based rules in ``excluded_classes`` apply to it.

    :param element: element exposing ``find_parent``.
    :param excluded_elements: tag names to exclude, or None / empty.
    :param excluded_classes: mapping of element name to excluded class names.
    :return: truthy when the element should be skipped.
    """
    # Without an excluding ancestor only the class rules can decide.
    if not excluded_elements or not element.find_parent(excluded_elements):
        return is_element_excluded_by_class(element, excluded_classes)
    return True
|
||||
|
||||
|
||||
def is_element_excluded_by_class(element, excluded_classes):
    """Check the class rules against ``element`` and all of its ancestors.

    :param element: element exposing ``parents``.
    :param excluded_classes: mapping of element name to excluded class names.
    :return: True when any ancestor, or the element itself, matches a rule.
    """
    # Ancestors are checked first, then the element itself.
    lineage = [*element.parents, element]
    for node in lineage:
        if element_matches_exclusion(node, excluded_classes):
            return True
    return False
|
||||
|
||||
|
||||
def element_matches_exclusion(element, excluded_classes):
    """Tell whether one of the element's classes is listed for exclusion.

    The element's classes are checked against the rules stored under the
    wildcard key ``'*'`` and against those stored under its own tag name.

    :param element: element exposing ``name`` and ``get``.
    :param excluded_classes: mapping of element name (or ``'*'``) to a
        collection of excluded class names.
    :return: True when at least one class of the element is excluded.
    """
    for selector in ('*', element.name):
        if selector not in excluded_classes:
            continue
        blocked = excluded_classes[selector]
        if any(cls in blocked for cls in element.get('class', [])):
            return True
    return False
|
||||
|
||||
|
||||
def extract_element_content(element):
    """Render an element as ``<name>text</name>`` followed by a newline.

    The text is the element's stripped strings joined with single spaces.
    It is HTML-escaped before being wrapped, so that literal ``<``, ``>``
    or ``&`` characters in the text cannot be read as markup when the
    result is parsed again.

    :param element: element exposing ``name`` and ``stripped_strings``.
    :return: one line of HTML for the element.
    """
    # stripped_strings already yields whitespace-stripped text.
    content = ' '.join(element.stripped_strings)
    return f'<{element.name}>{html.escape(content, quote=False)}</{element.name}>\n'
|
||||
|
||||
|
||||
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    """Write the HTML parsing configuration to the embed-tuning log.

    Nothing is logged unless ``tenant.embed_tuning`` is truthy.

    :param tenant: tenant whose ``embed_tuning`` flag enables the logging.
    :param tags: tag names being extracted.
    :param included_elements: included tag names, or None / empty.
    :param excluded_elements: excluded tag names, or None / empty.
    :param excluded_classes: parsed class exclusion rules.
    :param elements_to_parse: root elements that will be searched.
    """
    if not tenant.embed_tuning:
        return

    tuning_logger = current_app.embed_tuning_logger
    # included_elements may be None, in which case there is nothing to count.
    included_count = len(included_elements) if included_elements else 0

    tuning_logger.debug(f'Tags to parse: {tags}')
    tuning_logger.debug(f'Included Elements: {included_elements}')
    tuning_logger.debug(f'Included Elements: {included_count}')
    tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
    tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
    tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
    # The list is empty when none of the included elements were found.
    if elements_to_parse:
        tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
|
||||
|
||||
# Iterate through the found included elements
|
||||
for element in elements_to_parse:
|
||||
# Find all specified tags within each included element
|
||||
for sub_element in element.find_all(tags):
|
||||
if tenant.embed_tuning:
|
||||
current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
|
||||
if excluded_elements and sub_element.find_parent(excluded_elements):
|
||||
continue # Skip this sub_element if it's within any of the excluded_elements
|
||||
extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
|
||||
|
||||
title = soup.find('title').get_text(strip=True)
|
||||
|
||||
return extracted_html, title
|
||||
|
||||
|
||||
def process_youtube(tenant, model_variables, document_version):
|
||||
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
|
||||
@@ -414,8 +529,9 @@ def download_youtube(url, tenant_id, document_version, file_name):
|
||||
with open(temp_file.name, 'rb') as f:
|
||||
file_data = f.read()
|
||||
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
file_name, file_data)
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
file_name, file_data)
|
||||
|
||||
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
|
||||
return file_name, yt.title, yt.description, yt.author
|
||||
@@ -429,7 +545,7 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
|
||||
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
|
||||
|
||||
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
document_version.id, input_file)
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
|
||||
temp_input.write(input_data)
|
||||
@@ -448,8 +564,9 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
|
||||
with open(temp_output.name, 'rb') as f:
|
||||
compressed_data = f.read()
|
||||
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
|
||||
output_file, compressed_data)
|
||||
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id,
|
||||
output_file, compressed_data)
|
||||
|
||||
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
|
||||
except Exception as e:
|
||||
@@ -465,7 +582,7 @@ def transcribe_audio(tenant_id, document_version, input_file, output_file, model
|
||||
|
||||
# Download the audio file from MinIO
|
||||
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
|
||||
document_version.id, input_file)
|
||||
document_version.id, input_file)
|
||||
|
||||
# Load the audio data into pydub
|
||||
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
|
||||
@@ -649,6 +766,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
|
||||
|
||||
return actual_chunks
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
@@ -3,7 +3,7 @@
|
||||
Plugin Name: EveAI Chat Widget
|
||||
Plugin URI: https://askeveai.com/
|
||||
Description: Integrates the EveAI chat interface into your WordPress site.
|
||||
Version: 1.3.21
|
||||
Version: 1.3.23
|
||||
Author: Josako, Pieter Laroy
|
||||
Author URI: https://askeveai.com/about/
|
||||
*/
|
||||
|
||||
@@ -161,24 +161,32 @@ class EveAIChatWidget extends HTMLElement {
|
||||
|
||||
this.socket.on('connect', (data) => {
|
||||
console.log('Socket connected OK');
|
||||
console.log('Connect event data:', data);
|
||||
console.log('Connect event this:', this);
|
||||
this.setStatusMessage('Connected to EveAI.');
|
||||
this.updateConnectionStatus(true);
|
||||
this.startHeartbeat();
|
||||
if (data.room) {
|
||||
if (data && data.room) {
|
||||
this.room = data.room;
|
||||
console.log(`Joined room: ${this.room}`);
|
||||
} else {
|
||||
console.log('Room information not received on connect');
|
||||
}
|
||||
});
|
||||
|
||||
this.socket.on('authenticated', (data) => {
|
||||
console.log('Authenticated event received: ', data);
|
||||
console.log('Authenticated event received');
|
||||
console.log('Authentication event data:', data);
|
||||
console.log('Authentication event this:', this);
|
||||
this.setStatusMessage('Authenticated.');
|
||||
if (data.token) {
|
||||
this.jwtToken = data.token; // Store the JWT token received from the server
|
||||
if (data && data.token) {
|
||||
this.jwtToken = data.token;
|
||||
}
|
||||
if (data.room) {
|
||||
if (data && data.room) {
|
||||
this.room = data.room;
|
||||
console.log(`Confirmed room: ${this.room}`);
|
||||
} else {
|
||||
console.log('Room information not received on authentication');
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -386,6 +386,7 @@ input[type="radio"] {
|
||||
.btn-danger:hover {
|
||||
background-color: darken(var(--bs-danger), 10%) !important; /* Darken the background on hover */
|
||||
border-color: darken(var(--bs-danger), 10%) !important; /* Darken the border on hover */
|
||||
color: var(--bs-white) !important; /* Ensure the text remains white and readable */
|
||||
}
|
||||
|
||||
/* Success Alert Styling */
|
||||
|
||||
@@ -73,4 +73,6 @@ minio~=7.2.7
|
||||
Werkzeug~=3.0.3
|
||||
itsdangerous~=2.2.0
|
||||
cryptography~=43.0.0
|
||||
graypy~=2.1.0
|
||||
graypy~=2.1.0
|
||||
|
||||
lxml~=5.3.0
|
||||
|
||||
Reference in New Issue
Block a user