Refactoring part 1

Some changes for workers, but stopped due to refactoring
This commit is contained in:
Josako
2024-05-06 21:30:07 +02:00
parent d925477e68
commit 8e5ad5f312
34 changed files with 193 additions and 109 deletions

View File

@@ -1,4 +1,4 @@
from ..extensions import db
from common.extensions import db
from .user import User, Tenant
from pgvector.sqlalchemy import Vector

View File

View File

@@ -1,4 +1,4 @@
from ..extensions import db
from common.extensions import db
from flask_security import UserMixin, RoleMixin
from sqlalchemy.dialects.postgresql import ARRAY
import sqlalchemy as sa

0
common/utils/__init__.py Normal file
View File

View File

@@ -6,7 +6,7 @@ from sqlalchemy.exc import InternalError
from sqlalchemy.orm import sessionmaker, scoped_session
from flask import current_app
from ..extensions import db, migrate
from common.extensions import db, migrate
class Database:

View File

@@ -6,7 +6,6 @@ for handling tenant requests
from flask_security import current_user
from flask import session
from ..models.user import User, Tenant
from .database import Database

View File

@@ -1,5 +1,5 @@
from flask import session
from ..models.user import User, Tenant
from common.models import User, Tenant
# Definition of Trigger Handlers
@@ -10,3 +10,9 @@ def set_tenant_session_data(sender, user, **kwargs):
session['default_embedding_model'] = tenant.default_embedding_model
session['default_llm_model'] = tenant.default_llm_model
def clear_tenant_session_data(sender, user, **kwargs):
    """Remove the tenant-related entries from the Flask session.

    Uses the same ``(sender, user, **kwargs)`` signature as the other trigger
    handlers in this module; none of the arguments are read. Keys that are
    not present in the session are ignored.
    """
    tenant_session_keys = (
        'tenant',
        'default_language',
        'default_embedding_model',
        'default_llm_model',
    )
    for session_key in tenant_session_keys:
        session.pop(session_key, None)

0
config/__init__.py Normal file
View File

View File

@@ -8,13 +8,15 @@ class Config(object):
DEBUG = False
DEVELOPMENT = False
SECRET_KEY = '97867c1491bea5ee6a8e8436eb11bf2ba6a69ff53ab1b17ecba450d0f2e572e1'
SESSION_COOKIE_SECURE = True
SESSION_COOKIE_HTTPONLY = True
# WTF_CSRF_ENABLED = True
# flask-security-too settings
SECURITY_PASSWORD_SALT = '228614859439123264035565568761433607235'
# REMEMBER_COOKIE_SAMESITE = 'strict'
# SESSION_COOKIE_SAMESITE = 'strict'
REMEMBER_COOKIE_SAMESITE = 'strict'
SESSION_COOKIE_SAMESITE = 'strict'
SECURITY_CONFIRMABLE = True
SECURITY_TRACKABLE = True
SECURITY_PASSWORD_COMPLEXITY_CHECKER = 'zxcvbn'
@@ -22,6 +24,7 @@ class Config(object):
SECURITY_RECOVERABLE = True
SECURITY_EMAIL_SENDER = "eveai_super@flow-it.net"
PERMANENT_SESSION_LIFETIME = timedelta(minutes=60)
SESSION_REFRESH_EACH_REQUEST = True
# flask-mailman settings
MAIL_SERVER = 'mail.flow-it.net'

View File

@@ -5,14 +5,13 @@ from flask_security import SQLAlchemyUserDatastore
from flask_security.signals import user_authenticated
from werkzeug.middleware.proxy_fix import ProxyFix
import logging.config
from celery import Celery
from .extensions import db, migrate, bootstrap, security, mail, login_manager, cors
from .models.user import User, Tenant, Role
from .models.document import Document, DocumentLanguage, DocumentVersion
from .logging_config import LOGGING
from .utils.security import set_tenant_session_data
from .worker.celery_utils import init_celery
from common.extensions import db, migrate, bootstrap, security, mail, login_manager, cors
from common.models.user import User, Role
from config.logging_config import LOGGING
from common.utils.security import set_tenant_session_data
from .errors import register_error_handlers
from eveai_workers.celery_utils import init_celery
def create_app(config_file=None):
@@ -20,7 +19,7 @@ def create_app(config_file=None):
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1)
if config_file is None:
app.config.from_object('config.DevConfig')
app.config.from_object('config.config.DevConfig')
else:
app.config.from_object(config_file)
@@ -31,6 +30,10 @@ def create_app(config_file=None):
logging.config.dictConfig(LOGGING)
register_extensions(app)
# Initialize celery
init_celery(app)
# Setup Flask-Security-Too
user_datastore = SQLAlchemyUserDatastore(db, User, Role)
security.init_app(app, user_datastore)
@@ -39,6 +42,10 @@ def create_app(config_file=None):
# Register Blueprints
register_blueprints(app)
# Register Error Handlers
register_error_handlers(app)
# Debugging settings
if app.config['DEBUG'] is True:
app.logger.setLevel(logging.DEBUG)
mail_logger = logging.getLogger('flask_mailman')
@@ -79,17 +86,3 @@ def register_api(app):
# from . import api
# app.register_blueprint(api.bp, url_prefix='/api')
def create_celery_app(config_file=None):
    """Build a Celery instance configured from a dedicated Flask app.

    :param config_file: value handed to ``app.config.from_object``;
        ``'config.DevConfig'`` is used when it is ``None``.
    :return: the Celery instance after ``init_celery`` has been applied to it.
    """
    flask_app = Flask(__name__)
    config_source = 'config.DevConfig' if config_file is None else config_file
    flask_app.config.from_object(config_source)

    celery_instance = Celery(flask_app.import_name)
    init_celery(celery_instance, flask_app)
    return celery_instance
celery = create_celery_app()

22
eveai_app/errors.py Normal file
View File

@@ -0,0 +1,22 @@
from flask import render_template, request, jsonify
def not_found_error(error):
    """Handle a 404 error with a JSON body or the HTML error page.

    A JSON response is returned only when the client accepts JSON and does
    not accept HTML; every other client gets ``error/404.html``.

    :param error: the error passed in by Flask (not used).
    :return: a JSON response with status 404, or ``(rendered_html, 404)``.
    """
    accepted = request.accept_mimetypes
    if not accepted.accept_json or accepted.accept_html:
        return render_template('error/404.html'), 404

    json_response = jsonify({'error': 'Not found'})
    json_response.status_code = 404
    return json_response
def internal_server_error(error):
    """Handle a 500 error with a JSON body or the HTML error page.

    A JSON response is returned only when the client accepts JSON and does
    not accept HTML; every other client gets ``error/500.html``.

    :param error: the error passed in by Flask (not used).
    :return: a JSON response with status 500, or ``(rendered_html, 500)``.
    """
    accepted = request.accept_mimetypes
    if not accepted.accept_json or accepted.accept_html:
        return render_template('error/500.html'), 500

    json_response = jsonify({'error': 'Internal server error'})
    json_response.status_code = 500
    return json_response
def register_error_handlers(app):
    """Register this module's 404 and 500 handlers on ``app``.

    :param app: the Flask application to attach the handlers to.
    """
    handlers_by_status = (
        (404, not_found_error),
        (500, internal_server_error),
    )
    for status_code, handler in handlers_by_status:
        app.register_error_handler(status_code, handler)

View File

@@ -20,7 +20,7 @@
<span>
<div class="container mb-4">
<div class="row mt-lg-n12 mt-md-n12 mt-n12 justify-content-center">
<div class="col-xl-8 col-lg-5 col-md-7 mx-auto">
{% block content_class %}<div class="col-xl-8 col-lg-5 col-md-7 mx-auto">{% endblock %}
<div class="card mt-8">
<div class="card-header p-0 position-relative mt-n4 mx-3 z-index-2">
<div class="bg-gradient-success shadow-success border-radius-lg py-3 pe-1 text-center py-4">

View File

@@ -5,6 +5,7 @@
{% block content_title %}Documents{% endblock %}
{% block content_description %}View Documents for Tenant{% endblock %}
{% block content_class %}<div class="col-xl-12 col-lg-5 col-md-7 mx-auto">{% endblock %}
{% block content %}
<div class="container">

View File

@@ -0,0 +1,9 @@
{% extends "base.html" %}
{% block title %}Error 404{% endblock %}
{% block content_title %}File not Found{% endblock %}
{% block content_description %}Something unexpected happened!{% endblock %}
{% block content %}
<p><a href="{{ url_for('basic_bp.index') }}">Return home</a></p>
{% endblock %}

View File

@@ -0,0 +1,9 @@
{% extends "base.html" %}
{% block title %}Error 500{% endblock %}
{% block content_title %}Internal Server error{% endblock %}
{% block content_description %}Something unexpected happened! The administrator has been notified.{% endblock %}
{% block content %}
<p><a href="{{ url_for('basic_bp.index') }}">Return home</a></p>
{% endblock %}

View File

@@ -1,15 +1,17 @@
import os
from datetime import datetime as dt, timezone as tz
from flask import request, redirect, url_for, flash, render_template, Blueprint, session, current_app
from flask_security import hash_password, roles_required, roles_accepted, current_user
from flask_security import roles_accepted, current_user
from sqlalchemy import desc
from sqlalchemy.orm import joinedload
from werkzeug.utils import secure_filename
from ..models.document import Document, DocumentLanguage, DocumentVersion
from ..extensions import db
from common.models import Document, DocumentLanguage, DocumentVersion
from common.extensions import db
from .document_forms import AddDocumentForm
from ..utils.middleware import mw_before_request
from common.utils.middleware import mw_before_request
from eveai_workers.tasks import create_embeddings
document_bp = Blueprint('document_bp', __name__, url_prefix='/document')
@@ -68,11 +70,17 @@ def add_document():
if error is None:
flash('Document added successfully.', 'success')
upload_file_for_version(new_doc_vers, file, extension)
create_embeddings.delay(tenant_id=session['tenant']['id'],
document_version_id=new_doc_vers.id,
default_embedding_model=session['default_embedding_model'])
current_app.logger.info(f'Document processing started for tenant {session["tenant"]["id"]}, '
f'Document Version {new_doc_vers.id}')
print('Processing should start soon')
else:
flash('Error adding document.', 'error')
current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {error}')
# return render_template('document/add_document.html', form=form)
return render_template('document/add_document.html', form=form)
@document_bp.route('/documents', methods=['GET', 'POST'])

View File

@@ -3,7 +3,7 @@ from flask_wtf import FlaskForm
from wtforms import (StringField, PasswordField, BooleanField, SubmitField, EmailField, IntegerField, DateField,
SelectField, SelectMultipleField, FieldList, FormField)
from wtforms.validators import DataRequired, Length, Email, NumberRange, Optional
from ..models.user import User, Role
from common.models import Role
class TenantForm(FlaskForm):

View File

@@ -4,10 +4,10 @@ from datetime import datetime as dt, timezone as tz
from flask import request, redirect, url_for, flash, render_template, Blueprint, session, current_app
from flask_security import hash_password, roles_required, roles_accepted
from ..models.user import User, Tenant, Role
from ..extensions import db
from common.models import User, Tenant, Role
from common.extensions import db
from .user_forms import TenantForm, CreateUserForm, EditUserForm
from ..utils.database import Database
from common.utils.database import Database
user_bp = Blueprint('user_bp', __name__, url_prefix='/user')

View File

@@ -1,9 +0,0 @@
def init_celery(celery, app):
    """Configure ``celery`` from ``app`` and make its tasks run in the app context.

    :param celery: Celery instance to configure; its ``Task`` attribute is
        replaced by a subclass of the current one.
    :param app: Flask application providing the configuration and the context.
    """
    # Load all configuration from the Flask app, including queue settings.
    celery.conf.update(app.config)

    base_task = celery.Task

    class ContextTask(base_task):
        """Task base class that executes ``run`` inside ``app.app_context()``."""

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery.Task = ContextTask

View File

@@ -1,59 +0,0 @@
from datetime import datetime as dt, timezone as tz
from flask import current_app
from langchain_mistralai import MistralAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.vectorstores.chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter
import os
from eveai_app import celery
from ..utils.database import Database
from ..models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI
from .. import db
@celery.task(name='create_embeddings', queue='embeddings')
def create_embeddings(tenant_id, document_version_id, embedding_model_def):
    """Mark a document version as processing and split its PDF into chunks.

    :param tenant_id: tenant passed to ``Database`` to switch schema.
    :param document_version_id: id of the ``DocumentVersion`` to process.
    :param embedding_model_def: model identifier, split on its last ``.`` into
        provider and model name.
    :return: ``None``; returns early when the document version is not found.
    """
    current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} '
                            f'with model {embedding_model_def}')
    Database(tenant_id).switch_schema()

    doc_version = DocumentVersion.query.get(document_version_id)
    if doc_version is None:
        current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
                                 f'Document version {document_version_id} not found')
        return

    # Flag the version as being processed and persist that state.
    db.session.add(doc_version)
    doc_version.processing = True
    doc_version.processing_started_at = dt.now(tz.utc)
    db.session.commit()

    model_parts = embedding_model_def.rsplit('.', 1)
    embedding_provider = model_parts[0]
    embedding_model = model_parts[1]

    # Replace the model name by the matching embedding model object, if any.
    selected = (embedding_provider, embedding_model)
    if selected == ('openai', 'text-embedding-3-small'):
        embedding_model = EmbeddingSmallOpenAI()
    elif selected == ('mistral', 'text-embedding-3-small'):
        embedding_model = EmbeddingMistral()

    if doc_version.file_type == 'pdf':
        pdf_file = os.path.join(current_app.config['UPLOAD_FOLDER'],
                                doc_version.file_location,
                                doc_version.file_path)
        loader = PyPDFLoader(pdf_file)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(loader.load())
@celery.task(name='ask_eveAI', queue='llm_interactions')
def ask_eve_ai(query):
    """Placeholder task for LLM interactions; currently does nothing.

    Interaction logic with LLMs like GPT (Langchain API calls, etc.) belongs
    here.

    :param query: the query to handle (not used yet).
    """
    return None

View File

@@ -0,0 +1,23 @@
from .tasks import create_embeddings
from celery import Celery, Task
def init_celery(app):
    """Create a Celery app for the Flask ``app`` and register it as an extension.

    Tasks use a base class that runs them inside ``app.app_context()``. The
    Celery app is made the default one and stored in
    ``app.extensions['celery']``.

    :param app: Flask application whose ``CELERY_*`` config values are applied.
    """
    class ContextTask(Task):
        """Task base class that executes ``run`` inside ``app.app_context()``."""

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery_app = Celery(app.import_name, task_cls=ContextTask)

    # Celery setting name -> Flask config key it is read from.
    config_keys = {
        'broker_url': 'CELERY_BROKER_URL',
        'result_backend': 'CELERY_RESULT_BACKEND',
        'accept_content': 'CELERY_ACCEPT_CONTENT',
        'task_serializer': 'CELERY_TASK_SERIALIZER',
        'timezone': 'CELERY_TIMEZONE',
        'enable_utc': 'CELERY_ENABLE_UTC',
    }
    for setting, config_key in config_keys.items():
        value = app.config.get(config_key)
        # A missing key yields None; assigning it would overwrite Celery's own
        # default for that setting, so only configured values are applied.
        if value is not None:
            setattr(celery_app.conf, setting, value)

    celery_app.set_default()
    app.extensions['celery'] = celery_app

67
eveai_workers/tasks.py Normal file
View File

@@ -0,0 +1,67 @@
from datetime import datetime as dt, timezone as tz
from flask import current_app
from langchain_community.document_loaders.unstructured import UnstructuredAPIFileLoader
import os
from celery import shared_task
from common.utils.database import Database
from common.models import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI
from eveai_app import db
@shared_task(name='create_embeddings', queue='embeddings')
def create_embeddings(tenant_id, document_version_id, default_embedding_model):
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} '
f'with model {default_embedding_model}')
# Ensure we are working in the correct database schema
Database(tenant_id).switch_schema()
# Retrieve document version to process
document_version = DocumentVersion.query.get(document_version_id)
if document_version is None:
current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
f'Document version {document_version_id} not found')
return
db.session.add(document_version)
# start processing
document_version.processing = True
document_version.processing_started_at = dt.now(tz.utc)
db.session.commit()
embedding_provider = default_embedding_model.rsplit('.', 1)[0]
embedding_model = default_embedding_model.rsplit('.', 1)[1]
# define embedding variables
match (embedding_provider, embedding_model):
case ('openai', 'text-embedding-3-small'):
embedding_model = EmbeddingSmallOpenAI()
case ('mistral', 'text-embedding-3-small'):
embedding_model = EmbeddingMistral()
match document_version.file_type:
case 'pdf':
url = current_app.config.get('UNSTRUCTURED_FULL_URL')
api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location,
document_version.file_path)
with open(file_path, 'rb') as f:
loader = UnstructuredAPIFileLoader(f,
url=url,
api_key=api_key,
mode='elements',
strategy='hi-res',
include_page_breaks=True,
unique_element_ids=True,
chunking_strategy='by_title',
max_characters=3000,
)
documents = loader.load()
print(documents)
@shared_task(name='ask_eve_ai', queue='llm_interactions')
def ask_eve_ai(query):
    """Placeholder task for LLM interactions; currently does nothing.

    Interaction logic with LLMs like GPT (Langchain API calls, etc.) belongs
    here.

    :param query: the query to handle (not used yet).
    """
    return None

View File

@@ -6,7 +6,7 @@ from flask import current_app
from alembic import context
from sqlalchemy import NullPool, engine_from_config, text
from eveai_app.models.user import Tenant
from common.models import Tenant
import pgvector
from pgvector.sqlalchemy import Vector

5
scripts/run_celery.py Normal file
View File

@@ -0,0 +1,5 @@
from eveai_app import create_app
# Build the Flask application with its default configuration.
flask_app = create_app()
# Expose the Celery app stored under the 'celery' extension key at module level.
celery_app = flask_app.extensions['celery']
# Debug output: show the extensions registered on the Flask app.
print(flask_app.extensions)

View File

@@ -1,8 +1,9 @@
from eveai_app import create_app
from gevent.pywsgi import WSGIServer
app = create_app()
print(__name__)
celery_app = app.extensions['celery']
if __name__ == '__main__':
print("Server starting on port 5000")

View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Start a Celery worker for app.celery_app.
# Activate the virtualenv in .venv (path is relative to the current directory).
source .venv/bin/activate
# -Q embeddings: consume only the "embeddings" queue; log at info level.
celery -A app.celery_app worker --loglevel=info -Q embeddings

3
scripts/start_flower.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Start Flower for app.celery_app.
# Activate the virtualenv in .venv (path is relative to the current directory).
source .venv/bin/activate
celery -A app.celery_app flower