diff --git a/common/models/user.py b/common/models/user.py index c700613..ffc9db4 100644 --- a/common/models/user.py +++ b/common/models/user.py @@ -35,10 +35,11 @@ class Tenant(db.Model): html_end_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'li']) html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True) html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True) + html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True) + min_chunk_size = db.Column(db.Integer, nullable=True, default=2000) max_chunk_size = db.Column(db.Integer, nullable=True, default=3000) - # Embedding search variables es_k = db.Column(db.Integer, nullable=True, default=5) es_similarity_threshold = db.Column(db.Float, nullable=True, default=0.7) @@ -80,6 +81,7 @@ class Tenant(db.Model): 'html_end_tags': self.html_end_tags, 'html_included_elements': self.html_included_elements, 'html_excluded_elements': self.html_excluded_elements, + 'html_excluded_classes': self.html_excluded_classes, 'min_chunk_size': self.min_chunk_size, 'max_chunk_size': self.max_chunk_size, 'es_k': self.es_k, diff --git a/common/utils/model_utils.py b/common/utils/model_utils.py index 320dd99..686cfa8 100644 --- a/common/utils/model_utils.py +++ b/common/utils/model_utils.py @@ -86,6 +86,7 @@ def select_model_variables(tenant): model_variables['html_end_tags'] = tenant.html_end_tags model_variables['html_included_elements'] = tenant.html_included_elements model_variables['html_excluded_elements'] = tenant.html_excluded_elements + model_variables['html_excluded_classes'] = tenant.html_excluded_classes # Set Chunk Size variables model_variables['min_chunk_size'] = tenant.min_chunk_size diff --git a/eveai_app/views/user_forms.py b/eveai_app/views/user_forms.py index a8fc984..4eb49b0 100644 --- a/eveai_app/views/user_forms.py +++ b/eveai_app/views/user_forms.py @@ -32,6 +32,7 @@ class TenantForm(FlaskForm): default='p, li') html_included_elements = StringField('HTML 
Included Elements', validators=[Optional()]) html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()]) + html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()]) min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000) max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000) # Embedding Search variables diff --git a/eveai_app/views/user_views.py b/eveai_app/views/user_views.py index e7afa51..943a935 100644 --- a/eveai_app/views/user_views.py +++ b/eveai_app/views/user_views.py @@ -68,6 +68,8 @@ def tenant(): if form.html_included_elements.data else [] new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \ if form.html_excluded_elements.data else [] + new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \ + if form.html_excluded_classes.data else [] current_app.logger.debug(f'html_tags: {new_tenant.html_tags},' f'html_end_tags: {new_tenant.html_end_tags},' @@ -123,6 +125,8 @@ def edit_tenant(tenant_id): form.html_included_elements.data = ', '.join(tenant.html_included_elements) if tenant.html_excluded_elements: form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements) + if tenant.html_excluded_classes: + form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes) if form.validate_on_submit(): # Populate the tenant with form data @@ -134,6 +138,8 @@ def edit_tenant(tenant_id): elem.strip()] tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if elem.strip()] + tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if + elem.strip()] db.session.commit() flash('Tenant updated successfully.', 'success') diff --git a/eveai_workers/tasks.py b/eveai_workers/tasks.py 
index d90b0ca..67cd18e 100644 --- a/eveai_workers/tasks.py +++ b/eveai_workers/tasks.py @@ -3,7 +3,6 @@ import os from datetime import datetime as dt, timezone as tz import subprocess - import gevent from bs4 import BeautifulSoup import html @@ -12,6 +11,7 @@ from flask import current_app # OpenAI imports from langchain.chains.summarize import load_summarize_chain from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter +from langchain_core.documents import Document from langchain_core.exceptions import LangChainException from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate @@ -105,7 +105,7 @@ def create_embeddings(tenant_id, document_version_id): def process_pdf(tenant, model_variables, document_version): file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language, - document_version.id, document_version.file_name) + document_version.id, document_version.file_name) pdf_text = '' pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data)) @@ -114,8 +114,9 @@ def process_pdf(tenant, model_variables, document_version): markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text) markdown_file_name = f'{document_version.id}.md' - minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id, - markdown_file_name, markdown.encode()) + minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, + document_version.id, + markdown_file_name, markdown.encode()) potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name) chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'], @@ -160,7 +161,7 @@ def delete_embeddings_for_document_version(document_version): def process_html(tenant, model_variables, document_version): file_data = 
minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language, - document_version.id, document_version.file_name) + document_version.id, document_version.file_name) html_content = file_data.decode('utf-8') # The tags to be considered can be dependent on the tenant @@ -173,13 +174,15 @@ def process_html(tenant, model_variables, document_version): excluded_elements=html_excluded_elements) extracted_file_name = f'{document_version.id}-extracted.html' - minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id, - extracted_file_name, extracted_html.encode()) + minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, + document_version.id, + extracted_file_name, extracted_html.encode()) markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html) markdown_file_name = f'{document_version.id}.md' - minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id, - markdown_file_name, markdown.encode()) + minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, + document_version.id, + markdown_file_name, markdown.encode()) potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name) chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'], @@ -235,19 +238,94 @@ def enrich_chunks(tenant, document_version, title, chunks): return enriched_chunks +# def generate_markdown_from_html(tenant, model_variables, document_version, html_content): +# current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} ' +# f'on document version {document_version.id}') +# llm = model_variables['llm'] +# template = model_variables['html_parse_template'] +# parse_prompt = ChatPromptTemplate.from_template(template) +# setup = 
RunnablePassthrough() +# output_parser = StrOutputParser() +# +# chain = setup | parse_prompt | llm | output_parser +# input_html = {"html": html_content} +# +# markdown = chain.invoke(input_html) +# +# current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} ' +# f'on document version {document_version.id}') +# +# return markdown + + def generate_markdown_from_html(tenant, model_variables, document_version, html_content): - current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} ' + current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} ' f'on document version {document_version.id}') + llm = model_variables['llm'] template = model_variables['html_parse_template'] parse_prompt = ChatPromptTemplate.from_template(template) setup = RunnablePassthrough() output_parser = StrOutputParser() - chain = setup | parse_prompt | llm | output_parser - input_html = {"html": html_content} - markdown = chain.invoke(input_html) + # chain must stay defined: the per-chunk loop below still calls chain.invoke() + chain = setup | parse_prompt | llm | output_parser + soup = BeautifulSoup(html_content, 'lxml') + + def split_content(soup, max_size=20000): + chunks = [] + current_chunk = [] + current_size = 0 + + for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']): + element_html = str(element) + element_size = len(element_html) + + if current_size + element_size > max_size and current_chunk: + chunks.append(''.join(map(str, current_chunk))) + current_chunk = [] + current_size = 0 + + current_chunk.append(element) + current_size += element_size + + if element.name in ['h1', 'h2', 'h3'] and current_size > max_size: + chunks.append(''.join(map(str, current_chunk))) + current_chunk = [] + current_size = 0 + + if current_chunk: + chunks.append(''.join(map(str, current_chunk))) + + return chunks + + chunks = split_content(soup) + + markdown_chunks = [] + + for chunk in chunks: + current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} ' + f'on document version 
{document_version.id}') + if tenant.embed_tuning: + current_app.embed_tuning_logger.debug(f'Processing chunk: \n ' + f'------------------\n' + f'{chunk}\n' + f'------------------\n') + input_html = {"html": chunk} + markdown_chunk = chain.invoke(input_html) + markdown_chunks.append(markdown_chunk) + if tenant.embed_tuning: + current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n ' + f'-------------------------\n' + f'{markdown_chunk}\n' + f'-------------------------\n') + current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} ' + f'on document version {document_version.id}') + + # Combine all markdown chunks + markdown = "\n\n".join(markdown_chunks) + + current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} ' + f'on document version {document_version.id}') return markdown @@ -324,36 +402,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks): def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None): + current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}') soup = BeautifulSoup(html_content, 'html.parser') extracted_html = '' + excluded_classes = parse_excluded_classes(tenant.html_excluded_classes) if included_elements: elements_to_parse = soup.find_all(included_elements) else: - elements_to_parse = [soup] # parse the entire document if no included_elements specified + elements_to_parse = [soup] + log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse) + + for element in elements_to_parse: + for sub_element in element.find_all(tags): + if should_exclude_element(sub_element, excluded_elements, excluded_classes): + continue + extracted_html += extract_element_content(sub_element) + + title = soup.find('title').get_text(strip=True) if soup.find('title') else '' + + current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}') + + return 
extracted_html, title + + +def parse_excluded_classes(excluded_classes): + parsed = {} + # Column is nullable with no default, so existing tenants hold None — guard it. + for rule in excluded_classes or []: + # Accept both 'element.class' rules and bare 'class' rules; a bare class + # name applies to any element via the '*' wildcard bucket handled below. + if '.' in rule: + element, cls = rule.split('.', 1) + else: + element, cls = '*', rule + parsed.setdefault(element, set()).add(cls) + return parsed + + +def should_exclude_element(element, excluded_elements, excluded_classes): + if excluded_elements and element.find_parent(excluded_elements): + return True + return is_element_excluded_by_class(element, excluded_classes) + + +def is_element_excluded_by_class(element, excluded_classes): + for parent in element.parents: + if element_matches_exclusion(parent, excluded_classes): + return True + return element_matches_exclusion(element, excluded_classes) + + +def element_matches_exclusion(element, excluded_classes): + if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])): + return True + return element.name in excluded_classes and \ + any(cls in excluded_classes[element.name] for cls in element.get('class', [])) + + +def extract_element_content(element): + content = ' '.join(child.strip() for child in element.stripped_strings) + return f'<{element.name}>{content}\n' + + +def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse): if tenant.embed_tuning: current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}') current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}') - current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}') current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}') + current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}') current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse') current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}') - # Iterate through the found included elements - for element in elements_to_parse: - # Find all specified tags within each included element - 
for sub_element in element.find_all(tags): - if tenant.embed_tuning: - current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}') - if excluded_elements and sub_element.find_parent(excluded_elements): - continue # Skip this sub_element if it's within any of the excluded_elements - extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}\n' - - title = soup.find('title').get_text(strip=True) - - return extracted_html, title - def process_youtube(tenant, model_variables, document_version): base_path = os.path.join(current_app.config['UPLOAD_FOLDER'], @@ -414,8 +529,9 @@ def download_youtube(url, tenant_id, document_version, file_name): with open(temp_file.name, 'rb') as f: file_data = f.read() - minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id, - file_name, file_data) + minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, + document_version.id, + file_name, file_data) current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}') return file_name, yt.title, yt.description, yt.author @@ -429,7 +545,7 @@ def compress_audio(tenant_id, document_version, input_file, output_file): current_app.logger.info(f'Compressing audio for tenant: {tenant_id}') input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language, - document_version.id, input_file) + document_version.id, input_file) with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input: temp_input.write(input_data) @@ -448,8 +564,9 @@ def compress_audio(tenant_id, document_version, input_file, output_file): with open(temp_output.name, 'rb') as f: compressed_data = f.read() - minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id, - output_file, compressed_data) + minio_client.upload_document_file(tenant_id, 
document_version.doc_id, document_version.language, + document_version.id, + output_file, compressed_data) current_app.logger.info(f'Compressed audio for tenant: {tenant_id}') except Exception as e: @@ -465,7 +582,7 @@ def transcribe_audio(tenant_id, document_version, input_file, output_file, model # Download the audio file from MinIO audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language, - document_version.id, input_file) + document_version.id, input_file) # Load the audio data into pydub audio = AudioSegment.from_mp3(io.BytesIO(audio_data)) @@ -649,6 +766,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars): return actual_chunks pass - - - diff --git a/integrations/Wordpress/eveai-chat-widget.zip b/integrations/Wordpress/eveai-chat-widget.zip index 921f14f..dce62c3 100644 Binary files a/integrations/Wordpress/eveai-chat-widget.zip and b/integrations/Wordpress/eveai-chat-widget.zip differ diff --git a/integrations/Wordpress/eveai-chat-widget/eveai-chat_plugin.php b/integrations/Wordpress/eveai-chat-widget/eveai-chat_plugin.php index 3d2b6f5..575a5d6 100644 --- a/integrations/Wordpress/eveai-chat-widget/eveai-chat_plugin.php +++ b/integrations/Wordpress/eveai-chat-widget/eveai-chat_plugin.php @@ -3,7 +3,7 @@ Plugin Name: EveAI Chat Widget Plugin URI: https://askeveai.com/ Description: Integrates the EveAI chat interface into your WordPress site. 
-Version: 1.3.21 +Version: 1.3.23 Author: Josako, Pieter Laroy Author URI: https://askeveai.com/about/ */ diff --git a/integrations/Wordpress/eveai-chat-widget/js/eveai-chat-widget.js b/integrations/Wordpress/eveai-chat-widget/js/eveai-chat-widget.js index 0eef8eb..b1b1bae 100644 --- a/integrations/Wordpress/eveai-chat-widget/js/eveai-chat-widget.js +++ b/integrations/Wordpress/eveai-chat-widget/js/eveai-chat-widget.js @@ -161,24 +161,32 @@ class EveAIChatWidget extends HTMLElement { this.socket.on('connect', (data) => { console.log('Socket connected OK'); + console.log('Connect event data:', data); + console.log('Connect event this:', this); this.setStatusMessage('Connected to EveAI.'); this.updateConnectionStatus(true); this.startHeartbeat(); - if (data.room) { + if (data && data.room) { this.room = data.room; console.log(`Joined room: ${this.room}`); + } else { + console.log('Room information not received on connect'); } }); this.socket.on('authenticated', (data) => { - console.log('Authenticated event received: ', data); + console.log('Authenticated event received'); + console.log('Authentication event data:', data); + console.log('Authentication event this:', this); this.setStatusMessage('Authenticated.'); - if (data.token) { - this.jwtToken = data.token; // Store the JWT token received from the server + if (data && data.token) { + this.jwtToken = data.token; } - if (data.room) { + if (data && data.room) { this.room = data.room; console.log(`Confirmed room: ${this.room}`); + } else { + console.log('Room information not received on authentication'); } }); diff --git a/nginx/static/assets/css/eveai.css b/nginx/static/assets/css/eveai.css index 0bd24f0..ee97655 100644 --- a/nginx/static/assets/css/eveai.css +++ b/nginx/static/assets/css/eveai.css @@ -386,6 +386,7 @@ input[type="radio"] { .btn-danger:hover { background-color: darken(var(--bs-danger), 10%) !important; /* Darken the background on hover */ border-color: darken(var(--bs-danger), 10%) !important; /* 
Darken the border on hover */ + color: var(--bs-white) !important; /* Ensure the text remains white and readable */ } /* Success Alert Styling */ diff --git a/requirements.txt b/requirements.txt index c1b966c..383a138 100644 --- a/requirements.txt +++ b/requirements.txt @@ -73,4 +73,6 @@ minio~=7.2.7 Werkzeug~=3.0.3 itsdangerous~=2.2.0 cryptography~=43.0.0 -graypy~=2.1.0 \ No newline at end of file +graypy~=2.1.0 + +lxml~=5.3.0