4 Commits

Author SHA1 Message Date
Josako
6062b7646c - Allow multiple instances of Evie on 1 website. Shortcode is now parametrized. 2024-08-27 10:31:33 +02:00
Josako
122d1a18df - Allow more complex and longer PDFs to be uploaded to Evie. First implementation of a processor for specific file types.
- Allow URLs to reference content other than HTML; a URL can now also refer to e.g. PDF files.
2024-08-27 07:05:56 +02:00
Josako
2ca006d82c Added excluded element classes to HTML parsing to allow for more complex document parsing
Added chunking to conversion of HTML to markdown in case of large files
2024-08-22 16:41:13 +02:00
Josako
a9f9b04117 Bugfix for ResetPasswordForm in config.py 2024-08-22 07:10:30 +02:00
18 changed files with 654 additions and 193 deletions

29
.gitignore vendored
View File

@@ -12,3 +12,32 @@ docker/tenant_files/
**/.DS_Store
__pycache__
**/__pycache__
/.idea
*.pyc
common/.DS_Store
common/__pycache__/__init__.cpython-312.pyc
common/__pycache__/extensions.cpython-312.pyc
common/models/__pycache__/__init__.cpython-312.pyc
common/models/__pycache__/document.cpython-312.pyc
common/models/__pycache__/interaction.cpython-312.pyc
common/models/__pycache__/user.cpython-312.pyc
common/utils/.DS_Store
common/utils/__pycache__/__init__.cpython-312.pyc
common/utils/__pycache__/celery_utils.cpython-312.pyc
common/utils/__pycache__/nginx_utils.cpython-312.pyc
common/utils/__pycache__/security.cpython-312.pyc
common/utils/__pycache__/simple_encryption.cpython-312.pyc
common/utils/__pycache__/template_filters.cpython-312.pyc
config/.DS_Store
config/__pycache__/__init__.cpython-312.pyc
config/__pycache__/config.cpython-312.pyc
config/__pycache__/logging_config.cpython-312.pyc
eveai_app/.DS_Store
eveai_app/__pycache__/__init__.cpython-312.pyc
eveai_app/__pycache__/errors.cpython-312.pyc
eveai_chat/.DS_Store
migrations/.DS_Store
migrations/public/.DS_Store
scripts/.DS_Store
scripts/__pycache__/run_eveai_app.cpython-312.pyc

View File

@@ -35,10 +35,11 @@ class Tenant(db.Model):
    html_end_tags = db.Column(ARRAY(sa.String(10)), nullable=True, default=['p', 'li'])
    html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
    html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
+   html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)
    min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
    max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)

    # Embedding search variables
    es_k = db.Column(db.Integer, nullable=True, default=5)
    es_similarity_threshold = db.Column(db.Float, nullable=True, default=0.7)
@@ -80,6 +81,7 @@ class Tenant(db.Model):
            'html_end_tags': self.html_end_tags,
            'html_included_elements': self.html_included_elements,
            'html_excluded_elements': self.html_excluded_elements,
+           'html_excluded_classes': self.html_excluded_classes,
            'min_chunk_size': self.min_chunk_size,
            'max_chunk_size': self.max_chunk_size,
            'es_k': self.es_k,
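The new html_excluded_classes column is a Postgres text array; judging from the form handling and parse_excluded_classes() later in this change set, each entry is an 'element.class' rule (or '*.class' for any element) entered as a comma-separated string. A minimal sketch of that conversion, with made-up class names:

    # Sketch: turning the comma-separated admin form field into the array stored on Tenant.
    # 'nav.menu' and '*.cookie-banner' are hypothetical example rules, not values from the repo.
    raw_field = "nav.menu, *.cookie-banner"
    html_excluded_classes = [cls.strip() for cls in raw_field.split(',') if cls.strip()]
    print(html_excluded_classes)  # ['nav.menu', '*.cookie-banner']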

View File

@@ -86,6 +86,7 @@ def select_model_variables(tenant):
    model_variables['html_end_tags'] = tenant.html_end_tags
    model_variables['html_included_elements'] = tenant.html_included_elements
    model_variables['html_excluded_elements'] = tenant.html_excluded_elements
+   model_variables['html_excluded_classes'] = tenant.html_excluded_classes

    # Set Chunk Size variables
    model_variables['min_chunk_size'] = tenant.min_chunk_size
@@ -144,8 +145,12 @@ def select_model_variables(tenant):
                                 default_headers=portkey_headers)
            tool_calling_supported = False
            match llm_model:
-               case 'gpt-4-turbo' | 'gpt-4o' | 'gpt-4o-mini':
+               case 'gpt-4o' | 'gpt-4o-mini':
                    tool_calling_supported = True
+                   PDF_chunk_size = 10000
+                   PDF_chunk_overlap = 200
+                   PDF_min_chunk_size = 8000
+                   PDF_max_chunk_size = 12000
                case _:
                    raise Exception(f'Error setting model variables for tenant {tenant.id} '
                                    f'error: Invalid chat model')
@@ -160,10 +165,19 @@ def select_model_variables(tenant):
                                 model=llm_model_ext,
                                 temperature=model_variables['RAG_temperature'])
            tool_calling_supported = True
+           PDF_chunk_size = 10000
+           PDF_chunk_overlap = 200
+           PDF_min_chunk_size = 8000
+           PDF_max_chunk_size = 12000
        case _:
            raise Exception(f'Error setting model variables for tenant {tenant.id} '
                            f'error: Invalid chat provider')

+   model_variables['PDF_chunk_size'] = PDF_chunk_size
+   model_variables['PDF_chunk_overlap'] = PDF_chunk_overlap
+   model_variables['PDF_min_chunk_size'] = PDF_min_chunk_size
+   model_variables['PDF_max_chunk_size'] = PDF_max_chunk_size

    if tool_calling_supported:
        model_variables['cited_answer_cls'] = CitedAnswer
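The four PDF_* values added here are consumed by the new PDFProcessor, which splits the pre-processed PDF text before sending it chunk by chunk to the LLM. A rough sketch of how such values would drive LangChain's RecursiveCharacterTextSplitter, mirroring _split_content_for_llm in the new processor (the sample text is invented):

    # Sketch only: how PDF_chunk_size / PDF_chunk_overlap plausibly feed the splitter.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    model_variables = {'PDF_chunk_size': 10000, 'PDF_chunk_overlap': 200}
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=model_variables['PDF_chunk_size'],
        chunk_overlap=model_variables['PDF_chunk_overlap'],
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_text("some long pre-processed PDF text ... " * 1000)
    print(len(chunks))  # number of LLM-sized chunks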

View File

@@ -3,7 +3,6 @@ from datetime import timedelta
import redis

from common.utils.prompt_loader import load_prompt_templates
-from eveai_app.views.security_forms import ResetPasswordForm

basedir = path.abspath(path.dirname(__file__))
@@ -46,7 +45,6 @@ class Config(object):
    SECURITY_EMAIL_SUBJECT_PASSWORD_NOTICE = 'Your Password Has Been Reset'
    SECURITY_EMAIL_PLAINTEXT = False
    SECURITY_EMAIL_HTML = True
-   SECURITY_RESET_PASSWORD_FORM = ResetPasswordForm

    # Ensure Flask-Security-Too is handling CSRF tokens when behind a proxy
    SECURITY_CSRF_PROTECT_MECHANISMS = ['session']
@@ -55,7 +53,7 @@ class Config(object):
    WTF_CSRF_CHECK_DEFAULT = False

    # file upload settings
-   MAX_CONTENT_LENGTH = 16 * 1024 * 1024
+   MAX_CONTENT_LENGTH = 50 * 1024 * 1024
    UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']

    # supported languages

View File

@@ -15,11 +15,12 @@ html_parse: |
pdf_parse: |
  You are a top administrative aid specialized in transforming given PDF-files into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
+ The content you get is already processed (some markdown already generated), but needs to be corrected. For large files, you may receive only portions of the full file. Consider this when processing the content.
  # Best practices are:
- - Respect wordings and language(s) used in the PDF.
+ - Respect wordings and language(s) used in the provided content.
  - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
- - When headings are numbered, show the numbering and define the header level.
+ - When headings are numbered, show the numbering and define the header level. You may have to correct current header levels, as preprocessing is known to make errors.
  - A new item is started when a <return> is found before a full line is reached. In order to know the number of characters in a line, please check the document and the context within the document (e.g. an image could limit the number of characters temporarily).
  - Paragraphs are to be stripped of newlines so they become easily readable.
  - Be careful of encoding of the text. Everything needs to be human readable.
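For orientation, this pdf_parse template is applied per chunk rather than per document; the new PDFProcessor builds a plain prompt | llm | parser chain around it and feeds each chunk in as {pdf_content}. A hedged, stand-alone sketch of that invocation (the template text and the fake LLM below are placeholders so the example runs without a real model):

    # Sketch: how a per-chunk prompt like pdf_parse is invoked in this code base.
    # 'llm' is normally the chat model from select_model_variables(); FakeListLLM is
    # used here only to keep the example self-contained.
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_core.output_parsers import StrOutputParser
    from langchain_community.llms.fake import FakeListLLM

    template = "Rewrite the following pre-processed PDF content as clean markdown:\n{pdf_content}"
    llm = FakeListLLM(responses=["# Example heading\n\nCleaned paragraph."])
    chain = ChatPromptTemplate.from_template(template) | llm | StrOutputParser()
    print(chain.invoke({"pdf_content": "1. Intro\nSome wrapped\nparagraph text."}))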

View File

@@ -17,6 +17,7 @@ from .errors import register_error_handlers
from common.utils.celery_utils import make_celery, init_celery
from common.utils.template_filters import register_filters
from config.config import get_config
+from eveai_app.views.security_forms import ResetPasswordForm


def create_app(config_file=None):
@@ -37,6 +38,7 @@ def create_app(config_file=None):
        app.config.from_object(get_config('dev'))

    app.config['SESSION_KEY_PREFIX'] = 'eveai_app_'
+   app.config['SECURITY_RESET_PASSWORD_FORM'] = ResetPasswordForm

    try:
        os.makedirs(app.instance_path)

View File

@@ -12,7 +12,7 @@ from werkzeug.utils import secure_filename
from sqlalchemy.exc import SQLAlchemyError
import requests
from requests.exceptions import SSLError
-from urllib.parse import urlparse
+from urllib.parse import urlparse, unquote
import io
from minio.error import S3Error
@@ -89,36 +89,60 @@ def add_document():
def add_url():
    form = AddURLForm()
-   # If the form is submitted
    if form.validate_on_submit():
        current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
        url = form.url.data
-       doc_vers = DocumentVersion.query.filter_by(url=url).all()
-       if doc_vers:
-           current_app.logger.info(f'A document with url {url} already exists. No new document created.')
-           flash(f'A document with url {url} already exists. No new document created.', 'info')
-           return redirect(prefixed_url_for('document_bp.documents'))
-       # Only when no document with URL exists
-       html = fetch_html(url)
-       file = io.BytesIO(html)
-       parsed_url = urlparse(url)
-       path_parts = parsed_url.path.split('/')
-       filename = path_parts[-1]
-       if filename == '':
-           filename = 'index'
-       if not filename.endswith('.html'):
-           filename += '.html'
-       extension = 'html'
-       form_dict = form_to_dict(form)
-       new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
+       try:
+           response = requests.head(url, allow_redirects=True)
+           content_type = response.headers.get('Content-Type', '').split(';')[0]
+           # Determine file extension based on Content-Type
+           extension = get_extension_from_content_type(content_type)
+           # Generate filename
+           parsed_url = urlparse(url)
+           path = unquote(parsed_url.path)
+           filename = os.path.basename(path)
+           if not filename or '.' not in filename:
+               # Use the last part of the path or a default name
+               filename = path.strip('/').split('/')[-1] or 'document'
+               filename = secure_filename(f"{filename}.{extension}")
+           else:
+               filename = secure_filename(filename)
+           # Check if a document with this URL already exists
+           existing_doc = DocumentVersion.query.filter_by(url=url).first()
+           if existing_doc:
+               flash(f'A document with URL {url} already exists. No new document created.', 'info')
+               return redirect(prefixed_url_for('document_bp.documents'))
+           # Download the content
+           response = requests.get(url)
+           response.raise_for_status()
+           file_content = response.content
+           # Create document and document version
+           form_dict = form_to_dict(form)
+           new_doc, new_doc_vers = create_document_stack(form_dict, file_content, filename, extension)
+           # Upload file to storage
+           minio_client.upload_document_file(
+               session['tenant']['id'],
+               new_doc_vers.doc_id,
+               new_doc_vers.language,
+               new_doc_vers.id,
+               filename,
+               file_content
+           )
+           # Start embedding task
            task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
                session['tenant']['id'],
                new_doc_vers.id,
            ])
            current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
                                    f'Document Version {new_doc_vers.id}. '
                                    f'Embedding creation task: {task.id}')
@@ -126,12 +150,32 @@ def add_url():
                  'success')
            return redirect(prefixed_url_for('document_bp.documents'))
-       else:
-           form_validation_failed(request, form)
+       except requests.RequestException as e:
+           current_app.logger.error(f'Error fetching URL {url}: {str(e)}')
+           flash(f'Error fetching URL: {str(e)}', 'danger')
+       except SQLAlchemyError as e:
+           current_app.logger.error(f'Database error: {str(e)}')
+           flash('An error occurred while saving the document.', 'danger')
+       except Exception as e:
+           current_app.logger.error(f'Unexpected error: {str(e)}')
+           flash('An unexpected error occurred.', 'danger')

    return render_template('document/add_url.html', form=form)
def get_extension_from_content_type(content_type):
content_type_map = {
'text/html': 'html',
'application/pdf': 'pdf',
'text/plain': 'txt',
'application/msword': 'doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
# Add more mappings as needed
}
return content_type_map.get(content_type, 'html') # Default to 'html' if unknown
@document_bp.route('/add_urls', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_urls():
@@ -358,6 +402,8 @@ def handle_document_version_selection():
    action = request.form['action']
+   current_app.logger.debug(f'Triggered Document Version Action: {action}')
    match action:
        case 'edit_document_version':
            return redirect(prefixed_url_for('document_bp.edit_document_version', document_version_id=doc_vers_id))
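Taken together, the reworked add_url() issues a HEAD request, maps the Content-Type to an extension via get_extension_from_content_type(), and derives a filename from the URL path. A small, self-contained sketch of that filename logic (the example URLs are invented):

    # Sketch of the filename/extension derivation used by the new add_url().
    import os
    from urllib.parse import urlparse, unquote
    from werkzeug.utils import secure_filename

    def derive_filename(url, extension):
        path = unquote(urlparse(url).path)
        filename = os.path.basename(path)
        if not filename or '.' not in filename:
            # Fall back to the last path segment or a default name
            filename = path.strip('/').split('/')[-1] or 'document'
            return secure_filename(f"{filename}.{extension}")
        return secure_filename(filename)

    print(derive_filename('https://example.com/docs/Annual Report 2024', 'pdf'))  # Annual_Report_2024.pdf
    print(derive_filename('https://example.com/', 'html'))                        # document.html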

View File

@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
                                default='p, li')
    html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
    html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
+   html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
    min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
    max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
    # Embedding Search variables

View File

@@ -68,6 +68,8 @@ def tenant():
            if form.html_included_elements.data else []
        new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
            if form.html_excluded_elements.data else []
+       new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \
+           if form.html_excluded_classes.data else []

        current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
                                 f'html_end_tags: {new_tenant.html_end_tags},'
@@ -123,6 +125,8 @@ def edit_tenant(tenant_id):
        form.html_included_elements.data = ', '.join(tenant.html_included_elements)
    if tenant.html_excluded_elements:
        form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements)
+   if tenant.html_excluded_classes:
+       form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes)

    if form.validate_on_submit():
        # Populate the tenant with form data
@@ -134,6 +138,8 @@ def edit_tenant(tenant_id):
                                         elem.strip()]
        tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if
                                         elem.strip()]
+       tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if
+                                       elem.strip()]
        db.session.commit()
        flash('Tenant updated successfully.', 'success')

View File

@@ -0,0 +1,271 @@
import io
import pdfplumber
from flask import current_app
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
class PDFProcessor:
def __init__(self, tenant, model_variables, document_version):
self.tenant = tenant
self.model_variables = model_variables
self.document_version = document_version
# Configuration parameters from model_variables
self.chunk_size = model_variables['PDF_chunk_size']
self.chunk_overlap = model_variables['PDF_chunk_overlap']
self.min_chunk_size = model_variables['PDF_min_chunk_size']
self.max_chunk_size = model_variables['PDF_max_chunk_size']
# Set tuning variable for easy use
self.embed_tuning = model_variables['embed_tuning']
def process_pdf(self):
self._log("Starting PDF processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
extracted_content = self._extract_content(file_data)
structured_content, title = self._structure_content(extracted_content)
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
self._save_markdown(markdown)
self._log("Finished processing PDF")
return markdown, title
except Exception as e:
self._log(f"Error processing PDF: {str(e)}", level='error')
raise
def _log(self, message, level='debug'):
logger = current_app.logger
log_method = getattr(logger, level)
log_method(f"PDFProcessor - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
def _extract_content(self, file_data):
extracted_content = []
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
figure_counter = 1
for page_num, page in enumerate(pdf.pages):
self._log(f"Extracting content from page {page_num + 1}")
page_content = {
'text': page.extract_text(),
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
if self.embed_tuning:
self._log(f'Extracted PDF Content for page {page_num + 1}')
self._log(f"{page_content }")
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
# if self.embed_tuning:
# current_app.embed_tuning_logger.debug(f'Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'---------------------')
# current_app.embed_tuning_logger.debug(f'Page: {page_content}')
# current_app.embed_tuning_logger.debug(f'End of Extracted PDF Content')
# current_app.embed_tuning_logger.debug(f'----------------------------')
return extracted_content
def _extract_figures(self, page, page_num, figure_counter):
figures = []
# Omit figure processing for now!
# for img in page.images:
# try:
# # Try to get the bbox, use full page dimensions if not available
# bbox = img.get('bbox', (0, 0, page.width, page.height))
#
# figure = {
# 'figure_number': figure_counter,
# 'filename': f"figure_{page_num + 1}_{figure_counter}.png",
# 'caption': self._find_figure_caption(page, bbox)
# }
#
# # Extract the figure as an image
# figure_image = page.within_bbox(bbox).to_image()
#
# # Save the figure using MinIO
# with io.BytesIO() as output:
# figure_image.save(output, format='PNG')
# output.seek(0)
# minio_client.upload_document_file(
# self.tenant.id,
# self.document_version.doc_id,
# self.document_version.language,
# self.document_version.id,
# figure['filename'],
# output.getvalue()
# )
#
# figures.append(figure)
# figure_counter += 1
# except Exception as e:
# self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
return figures
def _find_figure_caption(self, page, bbox):
try:
# Look for text below the figure
caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
caption_text = page.crop(caption_bbox).extract_text()
if caption_text and caption_text.lower().startswith('figure'):
return caption_text
except Exception as e:
self._log(f"Error finding figure caption: {str(e)}", level='error')
return None
def _extract_tables(self, page):
tables = []
try:
for table in page.extract_tables():
if table:
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables
def _table_to_markdown(self, table):
if not table or not table[0]: # Check if table is empty or first row is empty
return "" # Return empty string for empty tables
def clean_cell(cell):
if cell is None:
return "" # Convert None to empty string
return str(cell).replace("|", "\\|") # Escape pipe characters and convert to string
header = [clean_cell(cell) for cell in table[0]]
markdown = "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table[1:]:
cleaned_row = [clean_cell(cell) for cell in row]
markdown += "| " + " | ".join(cleaned_row) + " |\n"
return markdown
def _structure_content(self, extracted_content):
structured_content = ""
title = "Untitled Document"
current_heading_level = 0
heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')
def identify_heading(text):
match = heading_pattern.match(text.strip())
if match:
numbering, _, content = match.groups()
if numbering:
level = numbering.count('.') + 1
return level, f"{numbering}{content}"
else:
return 1, content # Assume it's a top-level heading if no numbering
return 0, text # Not a heading
for page in extracted_content:
# Assume the title is on the first page
if page == extracted_content[0]:
lines = page.get('text', '').split('\n')
if lines:
title = lines[0].strip() # Use the first non-empty line as the title
# Process text
paragraphs = page['text'].split('\n\n')
for para in paragraphs:
lines = para.strip().split('\n')
if len(lines) == 1: # Potential heading
level, text = identify_heading(lines[0])
if level > 0:
heading_marks = '#' * level
structured_content += f"\n\n{heading_marks} {text}\n\n"
if level == 1 and not title:
title = text # Use the first top-level heading as the title if not set
else:
structured_content += f"{para}\n\n" # Treat as normal paragraph
else:
structured_content += f"{para}\n\n" # Multi-line paragraph
# Process figures
for figure in page.get('figures', []):
structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
if figure['caption']:
structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
# Add tables
if 'tables' in page:
for table in page['tables']:
structured_content += f"\n{table}\n"
if self.embed_tuning:
self._save_intermediate(structured_content, "structured_content.md")
return structured_content, title
def _split_content_for_llm(self, content):
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
return text_splitter.split_text(content)
def _process_chunks_with_llm(self, chunks):
llm = self.model_variables['llm']
template = self.model_variables['pdf_parse_template']
pdf_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | pdf_prompt | llm | output_parser
markdown_chunks = []
for chunk in chunks:
input = {"pdf_content": chunk}
result = chain.invoke(input)
# Remove Markdown code block delimiters if present
result = result.strip()
if result.startswith("```markdown"):
result = result[len("```markdown"):].strip()
if result.endswith("```"):
result = result[:-3].strip()
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
markdown_filename,
markdown.encode('utf-8')
)
def _save_intermediate(self, content, filename):
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
filename,
content.encode('utf-8')
)
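For orientation, the class is driven from the Celery embeddings worker; a hedged usage sketch under that assumption (tenant, model_variables and document_version are the SQLAlchemy/model objects supplied by the surrounding Flask and Celery context, nothing here is new API):

    # Sketch: how process_pdf() is expected to be called from the embeddings worker.
    from eveai_workers.Processors.PDF_Processor import PDFProcessor

    def run_pdf_processing(tenant, model_variables, document_version):
        # tenant / model_variables / document_version come from the app + task context.
        processor = PDFProcessor(tenant, model_variables, document_version)
        markdown, title = processor.process_pdf()  # also uploads <document_version.id>.md to MinIO
        return markdown, title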

View File

@@ -3,7 +3,6 @@ import os
from datetime import datetime as dt, timezone as tz
import subprocess
-import gevent

from bs4 import BeautifulSoup
import html
@@ -12,6 +11,7 @@ from flask import current_app
# OpenAI imports
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
+from langchain_core.documents import Document
from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
@@ -29,6 +29,7 @@ from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
+from eveai_workers.Processors.PDF_Processor import PDFProcessor


@current_celery.task(name='create_embeddings', queue='embeddings')
@@ -103,33 +104,67 @@ def create_embeddings(tenant_id, document_version_id):
        raise
# def process_pdf(tenant, model_variables, document_version):
# file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, document_version.file_name)
#
# pdf_text = ''
# pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
# for page in pdf_reader.pages:
# pdf_text += page.extract_text()
#
# markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
# markdown_file_name = f'{document_version.id}.md'
# minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id,
# markdown_file_name, markdown.encode())
#
# potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
# chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
# model_variables['max_chunk_size'])
#
# if len(chunks) > 1:
# summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
# document_version.system_context = f'Summary: {summary}\n'
# else:
# document_version.system_context = ''
#
# enriched_chunks = enrich_chunks(tenant, document_version, chunks)
# embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
#
# try:
# db.session.add(document_version)
# document_version.processing_finished_at = dt.now(tz.utc)
# document_version.processing = False
# db.session.add_all(embeddings)
# db.session.commit()
# except SQLAlchemyError as e:
# current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
# f'on HTML, document version {document_version.id}'
# f'error: {e}')
# raise
#
# current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
# f'on document version {document_version.id} :-)')
def process_pdf(tenant, model_variables, document_version):
-   file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
-                                                    document_version.id, document_version.file_name)
-   pdf_text = ''
-   pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
-   for page in pdf_reader.pages:
-       pdf_text += page.extract_text()
-   markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
-   markdown_file_name = f'{document_version.id}.md'
-   minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
-                                     markdown_file_name, markdown.encode())
-   potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
+   processor = PDFProcessor(tenant, model_variables, document_version)
+   markdown, title = processor.process_pdf()

+   # Create potential chunks for embedding
+   potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")

+   # Combine chunks for embedding
    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                         model_variables['max_chunk_size'])

-   if len(chunks) > 1:
-       summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-       document_version.system_context = f'Summary: {summary}\n'
-   else:
-       document_version.system_context = ''
-   enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+   # Enrich chunks
+   enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)

+   # Create embeddings
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

+   # Update document version and save embeddings
    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
@@ -138,7 +173,7 @@ def process_pdf(tenant, model_variables, document_version):
        db.session.commit()
    except SQLAlchemyError as e:
        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                f'on HTML, document version {document_version.id}'
+                                f'on PDF, document version {document_version.id}'
                                 f'error: {e}')
        raise
@@ -173,12 +208,14 @@ def process_html(tenant, model_variables, document_version):
                              excluded_elements=html_excluded_elements)
    extracted_file_name = f'{document_version.id}-extracted.html'
-   minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
+   minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                     document_version.id,
                                      extracted_file_name, extracted_html.encode())
    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
    markdown_file_name = f'{document_version.id}.md'
-   minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language, document_version.id,
+   minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
+                                     document_version.id,
                                      markdown_file_name, markdown.encode())
    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
@@ -236,18 +273,73 @@ def enrich_chunks(tenant, document_version, title, chunks):
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
-   current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+   current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
                             f'on document version {document_version.id}')
    llm = model_variables['llm']
    template = model_variables['html_parse_template']
    parse_prompt = ChatPromptTemplate.from_template(template)
    setup = RunnablePassthrough()
    output_parser = StrOutputParser()
    chain = setup | parse_prompt | llm | output_parser

-   input_html = {"html": html_content}
-   markdown = chain.invoke(input_html)
+   soup = BeautifulSoup(html_content, 'lxml')
def split_content(soup, max_size=20000):
chunks = []
current_chunk = []
current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
element_html = str(element)
element_size = len(element_html)
if current_size + element_size > max_size and current_chunk:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
current_chunk.append(element)
current_size += element_size
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
if current_chunk:
chunks.append(''.join(map(str, current_chunk)))
return chunks
chunks = split_content(soup)
markdown_chunks = []
for chunk in chunks:
current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
f'------------------\n'
f'{chunk}\n'
f'------------------\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
f'-------------------------\n'
f'{markdown_chunk}\n'
f'-------------------------\n')
current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
# Combine all markdown chunks
markdown = "\n\n".join(markdown_chunks)
current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
    return markdown
@@ -324,36 +416,73 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
+   current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
    soup = BeautifulSoup(html_content, 'html.parser')
    extracted_html = ''
+   excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)

    if included_elements:
        elements_to_parse = soup.find_all(included_elements)
    else:
-       elements_to_parse = [soup]  # parse the entire document if no included_elements specified
+       elements_to_parse = [soup]
log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
for element in elements_to_parse:
for sub_element in element.find_all(tags):
if should_exclude_element(sub_element, excluded_elements, excluded_classes):
continue
extracted_html += extract_element_content(sub_element)
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
return extracted_html, title
def parse_excluded_classes(excluded_classes):
parsed = {}
for rule in excluded_classes:
element, cls = rule.split('.', 1)
parsed.setdefault(element, set()).add(cls)
return parsed
def should_exclude_element(element, excluded_elements, excluded_classes):
if excluded_elements and element.find_parent(excluded_elements):
return True
return is_element_excluded_by_class(element, excluded_classes)
def is_element_excluded_by_class(element, excluded_classes):
for parent in element.parents:
if element_matches_exclusion(parent, excluded_classes):
return True
return element_matches_exclusion(element, excluded_classes)
def element_matches_exclusion(element, excluded_classes):
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
return True
return element.name in excluded_classes and \
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
def extract_element_content(element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
    if tenant.embed_tuning:
        current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
        current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
-       current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
        current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
+       current_app.embed_tuning_logger.debug(f'Excluded Classes: {excluded_classes}')
        current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
        current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')

-   # Iterate through the found included elements
-   for element in elements_to_parse:
-       # Find all specified tags within each included element
-       for sub_element in element.find_all(tags):
-           if tenant.embed_tuning:
-               current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
-           if excluded_elements and sub_element.find_parent(excluded_elements):
-               continue  # Skip this sub_element if it's within any of the excluded_elements
-           extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
-   title = soup.find('title').get_text(strip=True)
-   return extracted_html, title
def process_youtube(tenant, model_variables, document_version):
    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
@@ -414,7 +543,8 @@ def download_youtube(url, tenant_id, document_version, file_name):
    with open(temp_file.name, 'rb') as f:
        file_data = f.read()
-   minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
+   minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                     document_version.id,
                                      file_name, file_data)
    current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
@@ -448,7 +578,8 @@ def compress_audio(tenant_id, document_version, input_file, output_file):
    with open(temp_output.name, 'rb') as f:
        compressed_data = f.read()
-   minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language, document_version.id,
+   minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
+                                     document_version.id,
                                      output_file, compressed_data)
    current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
@@ -648,7 +779,4 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
        actual_chunks.append(current_chunk)

    return actual_chunks
-
-pass
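The class-exclusion helpers above are easiest to follow with a concrete input; a short sketch of the rule parsing and matching, with an invented HTML snippet and class names:

    # Sketch of the new exclusion rules: 'element.class' limits the rule to one tag,
    # '*.class' applies to any tag, and parent elements are checked as well.
    from bs4 import BeautifulSoup

    def parse_excluded_classes(excluded_classes):
        parsed = {}
        for rule in excluded_classes:
            element, cls = rule.split('.', 1)
            parsed.setdefault(element, set()).add(cls)
        return parsed

    rules = parse_excluded_classes(['div.sidebar', '*.cookie-banner'])
    print(rules)  # {'div': {'sidebar'}, '*': {'cookie-banner'}}

    soup = BeautifulSoup('<div class="sidebar"><p>skip me</p></div><p>keep me</p>', 'html.parser')
    for p in soup.find_all('p'):
        excluded = any(
            cls in rules.get(parent.name, set()) | rules.get('*', set())
            for parent in [p, *p.parents] if parent.name
            for cls in (parent.get('class') or [])
        )
        print(p.get_text(), '-> excluded' if excluded else '-> kept')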

View File

@@ -3,7 +3,7 @@
Plugin Name: EveAI Chat Widget
Plugin URI: https://askeveai.com/
Description: Integrates the EveAI chat interface into your WordPress site.
-Version: 1.3.21
+Version: 1.4.1
Author: Josako, Pieter Laroy
Author URI: https://askeveai.com/about/
*/
@@ -18,15 +18,28 @@ function eveai_chat_enqueue_scripts() {
    wp_enqueue_style('eveai-chat-style', plugin_dir_url(__FILE__) . 'css/eveai-chat-style.css');
}
add_action('wp_enqueue_scripts', 'eveai_chat_enqueue_scripts');
+add_action('admin_enqueue_scripts', 'eveai_chat_enqueue_scripts');

// Shortcode function
function eveai_chat_shortcode($atts) {
-   $options = get_option('eveai_chat_options');
-   $tenant_id = esc_js($options['tenant_id']);
-   $api_key = esc_js($options['api_key']);
-   $domain = esc_js($options['domain']);
-   $language = esc_js($options['language']);
-   $supported_languages = esc_js($options['supported_languages']);
+   // Default values
+   $defaults = array(
+       'tenant_id' => '',
+       'api_key' => '',
+       'domain' => '',
+       'language' => 'en',
+       'supported_languages' => 'en,fr,de,es'
+   );
+   // Merge provided attributes with defaults
+   $atts = shortcode_atts($defaults, $atts, 'eveai_chat');
+   // Sanitize inputs
+   $tenant_id = sanitize_text_field($atts['tenant_id']);
+   $api_key = sanitize_text_field($atts['api_key']);
+   $domain = esc_url_raw($atts['domain']);
+   $language = sanitize_text_field($atts['language']);
+   $supported_languages = sanitize_text_field($atts['supported_languages']);

    // Generate a unique ID for this instance of the chat widget
    $chat_id = 'chat-container-' . uniqid();
@@ -49,80 +62,3 @@ function eveai_chat_shortcode($atts) {
}
add_shortcode('eveai_chat', 'eveai_chat_shortcode');
// Add admin menu
function eveai_chat_admin_menu() {
add_options_page('EveAI Chat Settings', 'EveAI Chat', 'manage_options', 'eveai-chat-settings', 'eveai_chat_settings_page');
}
add_action('admin_menu', 'eveai_chat_admin_menu');
// Settings page
function eveai_chat_settings_page() {
?>
<div class="wrap">
<h1>EveAI Chat Settings</h1>
<form method="post" action="options.php">
<?php
settings_fields('eveai_chat_options');
do_settings_sections('eveai-chat-settings');
submit_button();
?>
</form>
</div>
<?php
}
// Register settings
function eveai_chat_register_settings() {
register_setting('eveai_chat_options', 'eveai_chat_options', 'eveai_chat_options_validate');
add_settings_section('eveai_chat_main', 'Main Settings', 'eveai_chat_section_text', 'eveai-chat-settings');
add_settings_field('eveai_chat_tenant_id', 'Tenant ID', 'eveai_chat_tenant_id_input', 'eveai-chat-settings', 'eveai_chat_main');
add_settings_field('eveai_chat_api_key', 'API Key', 'eveai_chat_api_key_input', 'eveai-chat-settings', 'eveai_chat_main');
add_settings_field('eveai_chat_domain', 'Domain', 'eveai_chat_domain_input', 'eveai-chat-settings', 'eveai_chat_main');
add_settings_field('eveai_chat_language', 'Default Language', 'eveai_chat_language_input', 'eveai-chat-settings', 'eveai_chat_main');
add_settings_field('eveai_chat_supported_languages', 'Supported Languages', 'eveai_chat_supported_languages_input', 'eveai-chat-settings', 'eveai_chat_main');
}
add_action('admin_init', 'eveai_chat_register_settings');
function eveai_chat_section_text() {
echo '<p>Enter your EveAI Chat configuration details below:</p>';
}
function eveai_chat_tenant_id_input() {
$options = get_option('eveai_chat_options');
echo "<input id='eveai_chat_tenant_id' name='eveai_chat_options[tenant_id]' type='text' value='" . esc_attr($options['tenant_id']) . "' />";
}
function eveai_chat_api_key_input() {
$options = get_option('eveai_chat_options');
echo "<input id='eveai_chat_api_key' name='eveai_chat_options[api_key]' type='password' value='" . esc_attr($options['api_key']) . "' />";
}
function eveai_chat_domain_input() {
$options = get_option('eveai_chat_options');
echo "<input id='eveai_chat_domain' name='eveai_chat_options[domain]' type='text' value='" . esc_attr($options['domain']) . "' />";
}
function eveai_chat_language_input() {
$options = get_option('eveai_chat_options');
echo "<input id='eveai_chat_language' name='eveai_chat_options[language]' type='text' value='" . esc_attr($options['language']) . "' />";
}
function eveai_chat_supported_languages_input() {
$options = get_option('eveai_chat_options');
$supported_languages = isset($options['supported_languages']) ? $options['supported_languages'] : 'en,fr,de,es';
echo "<input id='eveai_chat_supported_languages' name='eveai_chat_options[supported_languages]' type='text' value='" . esc_attr($supported_languages) . "' />";
echo "<p class='description'>Enter comma-separated language codes (e.g., en,fr,de,es)</p>";
}
function eveai_chat_options_validate($input) {
$new_input = array();
$new_input['tenant_id'] = sanitize_text_field($input['tenant_id']);
$new_input['api_key'] = sanitize_text_field($input['api_key']);
$new_input['domain'] = esc_url_raw($input['domain']);
$new_input['language'] = sanitize_text_field($input['language']);
$new_input['supported_languages'] = sanitize_text_field($input['supported_languages']);
return $new_input;
}

View File

@@ -161,24 +161,32 @@ class EveAIChatWidget extends HTMLElement {
        this.socket.on('connect', (data) => {
            console.log('Socket connected OK');
+           console.log('Connect event data:', data);
+           console.log('Connect event this:', this);
            this.setStatusMessage('Connected to EveAI.');
            this.updateConnectionStatus(true);
            this.startHeartbeat();
-           if (data.room) {
+           if (data && data.room) {
                this.room = data.room;
                console.log(`Joined room: ${this.room}`);
+           } else {
+               console.log('Room information not received on connect');
            }
        });

        this.socket.on('authenticated', (data) => {
-           console.log('Authenticated event received: ', data);
+           console.log('Authenticated event received');
+           console.log('Authentication event data:', data);
+           console.log('Authentication event this:', this);
            this.setStatusMessage('Authenticated.');
-           if (data.token) {
-               this.jwtToken = data.token; // Store the JWT token received from the server
+           if (data && data.token) {
+               this.jwtToken = data.token;
            }
-           if (data.room) {
+           if (data && data.room) {
                this.room = data.room;
                console.log(`Confirmed room: ${this.room}`);
+           } else {
+               console.log('Room information not received on authentication');
            }
        });

View File

@@ -3,7 +3,7 @@ Contributors: Josako
Tags: chat, ai
Requires at least: 5.0
Tested up to: 5.9
-Stable tag: 1.3.0
+Stable tag: 1.4.1
License: GPLv2 or later
License URI: http://www.gnu.org/licenses/gpl-2.0.html
@@ -17,7 +17,18 @@ This plugin allows you to easily add the EveAI chat widget to your WordPress site.
1. Upload the `eveai-chat-widget` folder to the `/wp-content/plugins/` directory
2. Activate the plugin through the 'Plugins' menu in WordPress
-3. Go to Settings > EveAI Chat to configure your chat widget parameters
+3. Add EveAI Chat Widget to your page or post using the instructions below.
+
+== Usage ==
+
+To add an EveAI Chat Widget to your page or post, use the following shortcode:
+
+[eveai_chat tenant_id="YOUR_TENANT_ID" api_key="YOUR_API_KEY" domain="YOUR_DOMAIN" language="LANGUAGE_CODE" supported_languages="COMMA_SEPARATED_LANGUAGE_CODES"]
+
+Example:
+
+[eveai_chat tenant_id="123456" api_key="your_api_key_here" domain="https://your-domain.com" language="en" supported_languages="en,fr,de,es"]
+
+You can add multiple chat widgets with different configurations by using the shortcode multiple times with different parameters.

== Frequently Asked Questions ==
@@ -27,6 +38,13 @@ Contact your EveAI service provider to obtain your Tenant ID, API Key, and Domai
== Changelog == == Changelog ==
= 1.4.1 - 1.4...=
* Bug fixes
= 1.4.0 =
* Allow for multiple instances of Evie on the same website
* Parametrization of the shortcode
= 1.3.3 - =
* ensure all attributes (also height and supportedLanguages) are set before initializing the socket
* Bugfixing

View File

@@ -32,7 +32,7 @@ http {
    #keepalive_timeout  0;
    keepalive_timeout  65;
-   client_max_body_size 16M;
+   client_max_body_size 50M;

    #gzip  on;

View File

@@ -386,6 +386,7 @@ input[type="radio"] {
.btn-danger:hover {
    background-color: darken(var(--bs-danger), 10%) !important; /* Darken the background on hover */
    border-color: darken(var(--bs-danger), 10%) !important; /* Darken the border on hover */
+   color: var(--bs-white) !important; /* Ensure the text remains white and readable */
}

/* Success Alert Styling */

View File

@@ -43,7 +43,6 @@ pgvector~=0.2.5
pycryptodome~=3.20.0
pydantic~=2.7.4
PyJWT~=2.8.0
-pypdf~=4.2.0
PySocks~=1.7.1
python-dateutil~=2.9.0.post0
python-engineio~=4.9.1
@@ -61,16 +60,17 @@ urllib3~=2.2.2
WTForms~=3.1.2
wtforms-html5~=0.6.1
zxcvbn~=4.4.28
pytube~=15.0.0
-PyPDF2~=3.0.1
groq~=0.9.0
pydub~=0.25.1
argparse~=1.4.0
-portkey_ai~=1.7.0
+portkey_ai~=1.8.2
minio~=7.2.7
Werkzeug~=3.0.3
itsdangerous~=2.2.0
cryptography~=43.0.0
graypy~=2.1.0
+lxml~=5.3.0
+pillow~=10.4.0
+pdfplumber~=0.11.4
+PyPDF2~=3.0.1