- Move from OpenAI to Mistral Embeddings

- Move embedding model settings from tenant to catalog
- BUG fix: error processing the chunking-patterns configuration in HTML_PROCESSOR
- Removed eveai_chat from the Docker files and nginx configuration, as it is now obsolete
- BUG fix: error in Library Operations when creating a new default RAG library
- BUG fix: added the public type in migration scripts
- Removed SocketIO from all code and requirements.txt
Josako
2025-02-25 11:17:19 +01:00
parent c037d4135e
commit 55a89c11bb
34 changed files with 457 additions and 444 deletions
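
Note on the tenant-to-catalog move: the migration scripts themselves are among the changed files but are not shown below. Purely as an illustration (table and column names are assumptions), the schema change could look roughly like this Alembic-style sketch:

```python
# Hypothetical Alembic-style sketch of moving the embedding_model setting from
# the tenant table to the catalog table. Table and column names are guesses;
# the actual migration scripts in this commit are not shown here.
from alembic import op
import sqlalchemy as sa


def upgrade():
    # Add the per-catalog setting and drop the old tenant-level one.
    op.add_column("catalog", sa.Column("embedding_model", sa.String(length=64), nullable=True))
    op.drop_column("tenant", "embedding_model")


def downgrade():
    # Restore the tenant-level setting and remove the catalog-level one.
    op.add_column("tenant", sa.Column("embedding_model", sa.String(length=64), nullable=True))
    op.drop_column("catalog", "embedding_model")
```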

@@ -12,18 +12,20 @@ from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy import or_
 from sqlalchemy.exc import SQLAlchemyError
-from common.extensions import db, minio_client
+from common.extensions import db
 from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
 from common.models.user import Tenant
 from common.utils.celery_utils import current_celery
 from common.utils.database import Database
-from common.utils.model_utils import create_language_template, get_model_variables
+from common.utils.model_utils import create_language_template, get_model_variables, get_embedding_model_and_class
 from common.utils.business_event import BusinessEvent
 from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
 from eveai_workers.processors.processor_registry import ProcessorRegistry
+from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
+from common.utils.config_field_types import json_to_pattern_list
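
The two new imports support the catalog-level embedding lookup further down. The eveai_exceptions module is not part of this diff; judging only from the call site in embed_chunks, the exception could look roughly like this sketch:

```python
# Sketch only, inferred from the call raise EveAIInvalidEmbeddingModel(tenant.id, catalog.id);
# the real class lives in common/utils/eveai_exceptions.py and is not shown in this commit.
class EveAIInvalidEmbeddingModel(Exception):
    """Raised when a catalog has no usable embedding model configured."""

    def __init__(self, tenant_id, catalog_id):
        super().__init__(
            f"No valid embedding model configured for catalog {catalog_id} of tenant {tenant_id}"
        )
        self.tenant_id = tenant_id
        self.catalog_id = catalog_id
```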
@@ -155,7 +157,7 @@ def embed_markdown(tenant, model_variables, document_version, catalog, processor
     # Create embeddings
     with current_event.create_span("Create Embeddings"):
-        embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
+        embeddings = embed_chunks(tenant, catalog, document_version, enriched_chunks)
     # Update document version and save embeddings
     try:
@@ -227,9 +229,14 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
         raise
-def embed_chunks(tenant, model_variables, document_version, chunks):
-    embedding_model = model_variables.embedding_model
+def embed_chunks(tenant, catalog, document_version, chunks):
+    if catalog.embedding_model:
+        embedding_model, embedding_model_class = get_embedding_model_and_class(tenant.id, catalog.id,
+                                                                               catalog.embedding_model)
+    else:
+        raise EveAIInvalidEmbeddingModel(tenant.id, catalog.id)
     # Actually embed
     try:
         embeddings = embedding_model.embed_documents(chunks)
     except LangChainException as e:
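
embed_chunks now resolves both the embedding client and the matching ORM class from the catalog instead of the tenant-level model_variables. The helper get_embedding_model_and_class lives in common/utils/model_utils.py and is not shown in this diff; below is a rough sketch of a helper with the same signature (the model names and ORM stand-ins are assumptions, not the project's actual mapping):

```python
# Illustrative sketch only; the real helper is in common/utils/model_utils.py.
# The ORM classes below are stand-ins for whatever dimension-specific Embedding
# models the project actually defines.
from langchain_mistralai import MistralAIEmbeddings
from langchain_openai import OpenAIEmbeddings


class MistralEmbedding:  # stand-in for a 1024-dimension Embedding ORM class (assumption)
    pass


class OpenAIEmbedding:  # stand-in for a 1536-dimension Embedding ORM class (assumption)
    pass


def get_embedding_model_and_class(tenant_id, catalog_id, model_name):
    """Return (embeddings client, Embedding ORM class) for a catalog's configured model."""
    if model_name == "mistral-embed":
        return MistralAIEmbeddings(model=model_name), MistralEmbedding
    if model_name and model_name.startswith("text-embedding"):
        return OpenAIEmbeddings(model=model_name), OpenAIEmbedding
    # Unknown model name: raising here keeps the sketch self-contained; the real
    # helper's behaviour for this case is not shown in the diff.
    raise ValueError(f"Unsupported embedding model: {model_name}")
```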
@@ -241,7 +248,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     # Add embeddings to the database
     new_embeddings = []
     for chunk, embedding in zip(chunks, embeddings):
-        new_embedding = model_variables.embedding_model_class()
+        new_embedding = embedding_model_class()
         new_embedding.document_version = document_version
         new_embedding.active = True
         new_embedding.chunk = chunk
@@ -309,7 +316,7 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor
         return False
-    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
+    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', ""))
     processor.log_tuning(f'Chunking Patterns Extraction: ', {
         'Full Configuration': processor.configuration,
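
Regarding the chunking-patterns fix: changing the default from [] to "" suggests json_to_pattern_list expects the configured value as a JSON string, so an empty-list default raised an error for processors such as HTML_PROCESSOR with no patterns configured. A guess at the shape of that helper (the actual implementation is in common/utils/config_field_types.py and is not part of this diff):

```python
# Guess at the helper's shape based only on the call site above; the real
# implementation is in common/utils/config_field_types.py.
import json
import re


def json_to_pattern_list(raw):
    """Parse a JSON array of pattern strings and compile each entry to a regex."""
    if not raw:  # tolerates "" (the new default) as well as None
        return []
    return [re.compile(pattern) for pattern in json.loads(raw)]
```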