- Move from OpenAI to Mistral Embeddings
- Move embedding model settings from tenant to catalog
- BUG: error processing configuration for chunking patterns in HTML_PROCESSOR
- Removed eveai_chat from docker-files and nginx configuration, as it is now obsolete
- BUG: error in Library Operations when creating a new default RAG library
- BUG: Added public type in migration scripts
- Removed SocketIO from all code and requirements.txt
This commit is contained in:
@@ -12,18 +12,20 @@ from langchain_core.runnables import RunnablePassthrough
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
from common.extensions import db, minio_client
|
||||
from common.extensions import db
|
||||
from common.models.document import DocumentVersion, Embedding, Document, Processor, Catalog
|
||||
from common.models.user import Tenant
|
||||
from common.utils.celery_utils import current_celery
|
||||
from common.utils.database import Database
|
||||
from common.utils.model_utils import create_language_template, get_model_variables
|
||||
from common.utils.model_utils import create_language_template, get_model_variables, get_embedding_model_and_class
|
||||
|
||||
from common.utils.business_event import BusinessEvent
|
||||
from common.utils.business_event_context import current_event
|
||||
from config.type_defs.processor_types import PROCESSOR_TYPES
|
||||
from eveai_workers.processors.processor_registry import ProcessorRegistry
|
||||
|
||||
from common.utils.eveai_exceptions import EveAIInvalidEmbeddingModel
|
||||
|
||||
from common.utils.config_field_types import json_to_pattern_list
|
||||
|
||||
|
||||
@@ -155,7 +157,7 @@ def embed_markdown(tenant, model_variables, document_version, catalog, processor
|
||||
|
||||
# Create embeddings
|
||||
with current_event.create_span("Create Embeddings"):
|
||||
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
|
||||
embeddings = embed_chunks(tenant, catalog, document_version, enriched_chunks)
|
||||
|
||||
# Update document version and save embeddings
|
||||
try:
|
||||
@@ -227,9 +229,14 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
|
||||
raise
|
||||
|
||||
|
||||
def embed_chunks(tenant, model_variables, document_version, chunks):
|
||||
embedding_model = model_variables.embedding_model
|
||||
def embed_chunks(tenant, catalog, document_version, chunks):
|
||||
if catalog.embedding_model:
|
||||
embedding_model, embedding_model_class = get_embedding_model_and_class(tenant.id, catalog.id,
|
||||
catalog.embedding_model)
|
||||
else:
|
||||
raise EveAIInvalidEmbeddingModel(tenant.id, catalog.id)
|
||||
|
||||
# Actually embed
|
||||
try:
|
||||
embeddings = embedding_model.embed_documents(chunks)
|
||||
except LangChainException as e:
|
||||
@@ -241,7 +248,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
|
||||
# Add embeddings to the database
|
||||
new_embeddings = []
|
||||
for chunk, embedding in zip(chunks, embeddings):
|
||||
new_embedding = model_variables.embedding_model_class()
|
||||
new_embedding = embedding_model_class()
|
||||
new_embedding.document_version = document_version
|
||||
new_embedding.active = True
|
||||
new_embedding.chunk = chunk
|
||||
@@ -309,7 +316,7 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processo
|
||||
|
||||
return False
|
||||
|
||||
chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
|
||||
chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', ""))
|
||||
|
||||
processor.log_tuning(f'Chunking Patterns Extraction: ', {
|
||||
'Full Configuration': processor.configuration,
|
||||
|
||||
Reference in New Issue
Block a user