- Move from OpenAI to Mistral Embeddings

- Move embedding model settings from tenant to catalog
- BUG: Fixed error when processing the configuration for chunking patterns in HTML_PROCESSOR
- Removed eveai_chat from docker-files and nginx configuration, as it is now obsolete
- BUG: Fixed error in Library Operations when creating a new default RAG library
- BUG: Added public type in migration scripts
- Removed SocketIO from all code and requirements.txt
This commit is contained in:
Josako
2025-02-25 11:17:19 +01:00
parent c037d4135e
commit 55a89c11bb
34 changed files with 457 additions and 444 deletions

View File

@@ -1,7 +1,7 @@
import logging
import os
from flask import Flask, render_template, jsonify, flash, redirect, request
from flask_security import SQLAlchemyUserDatastore, LoginForm
from flask import Flask, jsonify
from flask_security import SQLAlchemyUserDatastore
from flask_security.signals import user_authenticated
from werkzeug.middleware.proxy_fix import ProxyFix
import logging.config
@@ -12,7 +12,6 @@ from common.models.user import User, Role, Tenant, TenantDomain
import common.models.interaction
import common.models.entitlements
import common.models.document
from common.utils.nginx_utils import prefixed_url_for
from common.utils.startup_eveai import perform_startup_actions
from config.logging_config import LOGGING
from common.utils.security import set_tenant_session_data

View File

@@ -11,7 +11,7 @@ When you change chunking of embedding information, you'll need to manually refre
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = ['type'] %}
{% set disabled_fields = ['type', 'embedding_model'] %}
{% set exclude_fields = [] %}
<!-- Render Static Fields -->
{% for field in form.get_static_fields() %}

View File

@@ -9,7 +9,7 @@
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = ['name', 'embedding_model', 'llm_model'] %}
{% set disabled_fields = ['name', 'llm_model'] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}

View File

@@ -35,7 +35,7 @@
<div class="tab-content tab-space">
<!-- Model Information Tab -->
<div class="tab-pane fade show active" id="model-info-tab" role="tabpanel">
{% set model_fields = ['embedding_model', 'llm_model'] %}
{% set model_fields = ['llm_model'] %}
{% for field in form %}
{{ render_included_field(field, disabled_fields=model_fields, include_fields=model_fields) }}
{% endfor %}

View File

@@ -1,6 +1,7 @@
from flask import session, current_app
from flask_wtf import FlaskForm
from wtforms import (StringField, BooleanField, SubmitField, DateField, IntegerField, SelectField, TextAreaField, URLField)
from wtforms import (StringField, BooleanField, SubmitField, DateField, IntegerField, SelectField, TextAreaField,
URLField)
from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError, NumberRange
from flask_wtf.file import FileField, FileRequired
import json
@@ -30,10 +31,13 @@ class CatalogForm(FlaskForm):
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
type = SelectField('Catalog Type', validators=[DataRequired()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()],
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
default=3000)
# Selection fields for processing & creating embeddings
embedding_model = SelectField('Embedding Model', choices=[], validators=[DataRequired()])
min_chunk_size = IntegerField('Minimum Chunk Size (1500)', validators=[NumberRange(min=0), Optional()],
default=1500)
max_chunk_size = IntegerField('Maximum Chunk Size (2500)', validators=[NumberRange(min=0), Optional()],
default=2500)
# Metadata fields
user_metadata = TextAreaField('User Metadata', validators=[Optional(), validate_json])
@@ -43,6 +47,7 @@ class CatalogForm(FlaskForm):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in CATALOG_TYPES.items()]
self.embedding_model.choices = [(model, model) for model in current_app.config['SUPPORTED_EMBEDDINGS']]
class EditCatalogForm(DynamicFormBase):
@@ -52,6 +57,9 @@ class EditCatalogForm(DynamicFormBase):
# Select Field for Catalog Type (Uses the CATALOG_TYPES defined in config)
type = StringField('Catalog Type', validators=[DataRequired()], render_kw={'readonly': True})
# Selection fields for processing & creating embeddings
embedding_model = StringField('Embedding Model', validators=[DataRequired()], render_kw={'readonly': True})
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()],
default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()],
@@ -59,7 +67,7 @@ class EditCatalogForm(DynamicFormBase):
# Metadata fields
user_metadata = TextAreaField('User Metadata', validators=[Optional(), validate_json])
system_metadata = TextAreaField('System Metadata', validators=[Optional(), validate_json],)
system_metadata = TextAreaField('System Metadata', validators=[Optional(), validate_json], )
class ProcessorForm(FlaskForm):

View File

@@ -684,8 +684,9 @@ def create_default_rag_library():
name='Default RAG Catalog',
description='Default RAG Catalog',
type="STANDARD_CATALOG",
min_chunk_size=2000,
max_chunk_size=3000,
min_chunk_size=1500,
max_chunk_size=2500,
embedding_model="mistral.mistral-embed"
)
set_logging_information(cat, timestamp)
@@ -696,7 +697,7 @@ def create_default_rag_library():
name='Default HTML Processor',
description='Default HTML Processor',
catalog_id=cat.id,
type="HTML Processor",
type="HTML_PROCESSOR",
configuration={
"html_tags": "p, h1, h2, h3, h4, h5, h6, li, table, thead, tbody, tr, td",
"html_end_tags": "p, li, table",

View File

@@ -21,7 +21,6 @@ class TenantForm(FlaskForm):
# Timezone
timezone = SelectField('Timezone', choices=[], validators=[DataRequired()])
# LLM fields
embedding_model = SelectField('Embedding Model', choices=[], validators=[DataRequired()])
llm_model = SelectField('Large Language Model', choices=[], validators=[DataRequired()])
# Embedding variables
submit = SubmitField('Submit')
@@ -36,7 +35,6 @@ class TenantForm(FlaskForm):
# initialise timezone
self.timezone.choices = [(tz, tz) for tz in pytz.all_timezones]
# initialise LLM fields
self.embedding_model.choices = [(model, model) for model in current_app.config['SUPPORTED_EMBEDDINGS']]
self.llm_model.choices = [(model, model) for model in current_app.config['SUPPORTED_LLMS']]
# Initialize fallback algorithms
self.type.choices = [(t, t) for t in current_app.config['TENANT_TYPES']]

View File

@@ -228,7 +228,6 @@ def handle_tenant_selection():
# set tenant information in the session
session['tenant'] = the_tenant.to_dict()
session['default_language'] = the_tenant.default_language
session['embedding_model'] = the_tenant.embedding_model
session['llm_model'] = the_tenant.llm_model
# remove catalog-related items from the session
session.pop('catalog_id', None)