- Move from OpenAI to Mistral Embeddings

- Move embedding model settings from tenant to catalog
- BUG: Fixed an error when processing the configuration for chunking patterns in HTML_PROCESSOR
- Removed eveai_chat from docker-files and nginx configuration, as it is now obsolete
- BUG: Fixed an error in Library Operations when creating a new default RAG library
- BUG: Added public type in migration scripts
- Removed SocketIO from all code and requirements.txt
This commit is contained in:
Josako
2025-02-25 11:17:19 +01:00
parent c037d4135e
commit 55a89c11bb
34 changed files with 457 additions and 444 deletions

View File

@@ -1,6 +1,7 @@
from flask import session, current_app
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SubmitField, DateField, IntegerField, SelectField, TextAreaField, URLField)
from wtforms import (StringField, BooleanField, SubmitField, DateField, IntegerField, SelectField, TextAreaField,
URLField)
from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError, NumberRange
from flask_wtf.file import FileField, FileRequired
import json
@@ -30,10 +31,13 @@ class CatalogForm(FlaskForm):
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
type = SelectField('Catalog Type', validators=[DataRequired()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()],
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
default=3000)
# Selection fields for processing & creating embeddings
embedding_model = SelectField('Embedding Model', choices=[], validators=[DataRequired()])
min_chunk_size = IntegerField('Minimum Chunk Size (1500)', validators=[NumberRange(min=0), Optional()],
default=1500)
max_chunk_size = IntegerField('Maximum Chunk Size (2500)', validators=[NumberRange(min=0), Optional()],
default=2500)
# Metadata fields
user_metadata = TextAreaField('User Metadata', validators=[Optional(), validate_json])
@@ -43,6 +47,7 @@ class CatalogForm(FlaskForm):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in CATALOG_TYPES.items()]
self.embedding_model.choices = [(model, model) for model in current_app.config['SUPPORTED_EMBEDDINGS']]
class EditCatalogForm(DynamicFormBase):
@@ -52,6 +57,9 @@ class EditCatalogForm(DynamicFormBase):
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
type = StringField('Catalog Type', validators=[DataRequired()], render_kw={'readonly': True})
# Selection fields for processing & creating embeddings
embedding_model = StringField('Embedding Model', validators=[DataRequired()], render_kw={'readonly': True})
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()],
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
@@ -59,7 +67,7 @@ class EditCatalogForm(DynamicFormBase):
# Metadata fields
user_metadata = TextAreaField('User Metadata', validators=[Optional(), validate_json])
system_metadata = TextAreaField('System Metadata', validators=[Optional(), validate_json],)
system_metadata = TextAreaField('System Metadata', validators=[Optional(), validate_json], )
class ProcessorForm(FlaskForm):

View File

@@ -684,8 +684,9 @@ def create_default_rag_library():
name='Default RAG Catalog',
description='Default RAG Catalog',
type="STANDARD_CATALOG",
min_chunk_size=2000,
max_chunk_size=3000,
min_chunk_size=1500,
max_chunk_size=2500,
embedding_model="mistral.mistral-embed"
)
set_logging_information(cat, timestamp)
@@ -696,7 +697,7 @@ def create_default_rag_library():
name='Default HTML Processor',
description='Default HTML Processor',
catalog_id=cat.id,
type="HTML Processor",
type="HTML_PROCESSOR",
configuration={
"html_tags": "p, h1, h2, h3, h4, h5, h6, li, table, thead, tbody, tr, td",
"html_end_tags": "p, li, table",

View File

@@ -21,7 +21,6 @@ class TenantForm(FlaskForm):
# Timezone
timezone = SelectField('Timezone', choices=[], validators=[DataRequired()])
# LLM fields
embedding_model = SelectField('Embedding Model', choices=[], validators=[DataRequired()])
llm_model = SelectField('Large Language Model', choices=[], validators=[DataRequired()])
# Embedding variables
submit = SubmitField('Submit')
@@ -36,7 +35,6 @@ class TenantForm(FlaskForm):
# initialise timezone
self.timezone.choices = [(tz, tz) for tz in pytz.all_timezones]
# initialise LLM fields
self.embedding_model.choices = [(model, model) for model in current_app.config['SUPPORTED_EMBEDDINGS']]
self.llm_model.choices = [(model, model) for model in current_app.config['SUPPORTED_LLMS']]
# Initialize fallback algorithms
self.type.choices = [(t, t) for t in current_app.config['TENANT_TYPES']]

View File

@@ -228,7 +228,6 @@ def handle_tenant_selection():
# set tenant information in the session
session['tenant'] = the_tenant.to_dict()
session['default_language'] = the_tenant.default_language
session['embedding_model'] = the_tenant.embedding_model
session['llm_model'] = the_tenant.llm_model
# remove catalog-related items from the session
session.pop('catalog_id', None)