from datetime import datetime as dt, timezone as tz from flask import current_app from langchain_mistralai import MistralAIEmbeddings from langchain_openai import OpenAIEmbeddings from langchain_community.document_loaders.pdf import PyPDFLoader from langchain_community.vectorstores.chroma import Chroma from langchain_text_splitters import CharacterTextSplitter import os from eveai_app import celery from ..utils.database import Database from ..models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI from .. import db @celery.task(name='create_embeddings', queue='embeddings') def create_embeddings(tenant_id, document_version_id, embedding_model_def): current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} ' f'with model {embedding_model_def}') Database(tenant_id).switch_schema() document_version = DocumentVersion.query.get(document_version_id) if document_version is None: current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. ' f'Document version {document_version_id} not found') return db.session.add(document_version) # start processing document_version.processing = True document_version.processing_started_at = dt.now(tz.utc) db.session.commit() embedding_provider = embedding_model_def.rsplit('.', 1)[0] embedding_model = embedding_model_def.rsplit('.', 1)[1] # define embedding variables match (embedding_provider, embedding_model): case ('openai', 'text-embedding-3-small'): embedding_model = EmbeddingSmallOpenAI() case ('mistral', 'text-embedding-3-small'): embedding_model = EmbeddingMistral() match document_version.file_type: case 'pdf': pdf_file = os.path.join(current_app.config['UPLOAD_FOLDER'], document_version.file_location, document_version.file_path) loader = PyPDFLoader(pdf_file) # We text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) documents = text_splitter.split_documents(loader.load()) pass @celery.task(name='ask_eveAI', queue='llm_interactions') def ask_eve_ai(query): # Interaction logic with LLMs like GPT (Langchain API calls, etc.) pass