# eveAI/config/config.py

from os import environ, path
from datetime import timedelta
import redis

basedir = path.abspath(path.dirname(__file__))


class Config(object):
    DEBUG = False
    DEVELOPMENT = False
    SECRET_KEY = '97867c1491bea5ee6a8e8436eb11bf2ba6a69ff53ab1b17ecba450d0f2e572e1'
    SESSION_COOKIE_SECURE = False
    SESSION_COOKIE_HTTPONLY = True
    WTF_CSRF_ENABLED = True

    # flask-security-too settings
    # SECURITY_URL_PREFIX = '/admin'
    SECURITY_LOGIN_URL = '/admin/login'
    SECURITY_LOGOUT_URL = '/admin/logout'
    # SECURITY_REGISTER_URL = '/admin/register'
    # SECURITY_RESET_URL = '/admin/reset'
    # SECURITY_CHANGE_URL = '/admin/change'
    # SECURITY_POST_LOGIN_VIEW = '/admin/user/tenant_overview'
    # SECURITY_POST_LOGOUT_VIEW = '/admin'
    # SECURITY_POST_REGISTER_VIEW = '/admin/user/tenant_overview'
    # SECURITY_POST_RESET_VIEW = '/admin/login'
    # SECURITY_POST_CHANGE_VIEW = '/admin/login'
    # SECURITY_BLUEPRINT_NAME = 'security_bp'
    SECURITY_PASSWORD_SALT = '228614859439123264035565568761433607235'
    REMEMBER_COOKIE_SAMESITE = 'strict'
    SESSION_COOKIE_SAMESITE = 'strict'
    SECURITY_CONFIRMABLE = True
    SECURITY_TRACKABLE = True
    SECURITY_PASSWORD_COMPLEXITY_CHECKER = 'zxcvbn'
    SECURITY_POST_LOGIN_VIEW = '/user/tenant_overview'
    SECURITY_RECOVERABLE = True
    SECURITY_EMAIL_SENDER = "eveai_super@flow-it.net"

    # Ensure Flask-Security-Too is handling CSRF tokens when behind a proxy
    SECURITY_CSRF_PROTECT_MECHANISMS = ['session']
    SECURITY_CSRF_COOKIE_NAME = 'XSRF-TOKEN'
    SECURITY_CSRF_HEADER = 'X-XSRF-TOKEN'
    WTF_CSRF_CHECK_DEFAULT = False
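
    # Illustrative sketch (URLs and the `credentials`/`payload` names are
    # hypothetical): with the cookie/header names above, a non-browser client
    # presumably has to echo the CSRF cookie back as a request header on
    # state-changing calls, e.g. with requests:
    #
    #     import requests
    #     s = requests.Session()
    #     s.post('https://example.org/admin/login', data=credentials)  # server sets XSRF-TOKEN cookie
    #     s.post('https://example.org/some/endpoint',
    #            headers={'X-XSRF-TOKEN': s.cookies.get('XSRF-TOKEN')},
    #            json=payload)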

    # flask-mailman settings
    MAIL_SERVER = 'mail.flow-it.net'
    MAIL_PORT = 587
    MAIL_USE_TLS = True
    MAIL_USE_SSL = False
    MAIL_DEFAULT_SENDER = ('eveAI Admin', 'eveai_admin@flow-it.net')

    # file upload settings
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024
    UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']

    # supported languages
    SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']

    # supported embedding models and LLMs
    SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo']

    # Celery settings
    CELERY_TASK_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'json'
    CELERY_ACCEPT_CONTENT = ['json']
    CELERY_TIMEZONE = 'UTC'
    CELERY_ENABLE_UTC = True

    # Chunk definition, embedding dependent
    # OAI_TE3S_MIN_CHUNK_SIZE = 2000
    # OAI_TE3S_MAX_CHUNK_SIZE = 3000
    # OAI_TE3L_MIN_CHUNK_SIZE = 3000
    # OAI_TE3L_MAX_CHUNK_SIZE = 4000

    # LLM TEMPLATES
    GPT4_HTML_PARSE_TEMPLATE = """You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
# Best practices are:
- Respect wordings and language(s) used in the HTML.
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
- Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
- Be careful of encoding of the text. Everything needs to be human readable.
Process the file carefully, and take a stepped approach. The resulting markdown should cover the complete input HTML file. Answer with the pure markdown, without any other text.
HTML is between triple backquotes.
```{html}```"""
    GPT4_PDF_PARSE_TEMPLATE = """You are a top administrative aide specialized in transforming given PDF files into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
# Best practices are:
- Respect wordings and language(s) used in the PDF.
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
- When headings are numbered, show the numbering and define the header level.
- A new item starts when a <return> occurs before a full line is reached. To determine how many characters fit on a line, inspect the document and its context (e.g. an image could temporarily limit the line length).
- Paragraphs are to be stripped of newlines so they become easily readable.
- Be careful of encoding of the text. Everything needs to be human readable.
Process the file carefully, and take a stepped approach. The resulting markdown should cover the complete input PDF content. Answer with the pure markdown, without any other text.
PDF content is between triple backquotes.
```{pdf_content}```
"""
    GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
```{text}```"""
    GPT4_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
{tenant_context}
Use {language} in your communication, and cite the sources used.
If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
Context:
```{context}```
Question:
{question}"""
    GPT4_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
in such a way that the question is understandable without the previous context.
{tenant_context}
The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
The history is delimited between triple backquotes.
You answer by stating the question in {language}.
History:
```{history}```
Question to be detailed:
{question}"""
    GPT4_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you serve as a kind of
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
If not, say you do not have sufficient information to answer the question. Use {language} in your communication.
Question:
{question}"""
    GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript in several ways
and returns markdown, without changing what people say. The transcript is delimited between triple backquotes.
Do the following:
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
- improve errors in the transcript given the context, but leave the text intact.
```{transcript}```
"""

    # SocketIO settings
    # SOCKETIO_ASYNC_MODE = 'threading'
    SOCKETIO_ASYNC_MODE = 'gevent'

    # Session settings
    SESSION_TYPE = 'redis'
    SESSION_PERMANENT = False
    SESSION_USE_SIGNER = True
    PERMANENT_SESSION_LIFETIME = timedelta(minutes=60)
    SESSION_REFRESH_EACH_REQUEST = True

    # Fallback algorithms
    FALLBACK_ALGORITHMS = [
        "RAG_TENANT",
        "RAG_WIKIPEDIA",
        "RAG_GOOGLE",
        "LLM"
    ]

    # Interaction algorithms
    INTERACTION_ALGORITHMS = {
        "RAG_TENANT": {"name": "RAG_TENANT", "description": "Algorithm using only information provided by the tenant"},
        "RAG_WIKIPEDIA": {"name": "RAG_WIKIPEDIA", "description": "Algorithm using information provided by Wikipedia"},
        "RAG_GOOGLE": {"name": "RAG_GOOGLE", "description": "Algorithm using information provided by Google"},
        "LLM": {"name": "LLM", "description": "Algorithm using knowledge built into the LLM itself"}
    }
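
    # Illustrative sketch (run_algorithm is hypothetical): the ordering of
    # FALLBACK_ALGORITHMS suggests a dispatcher that tries each algorithm in
    # turn until one produces an answer, roughly:
    #
    #     for algo in Config.FALLBACK_ALGORITHMS:
    #         answer = run_algorithm(algo, question)
    #         if answer is not None:
    #             return answer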


class DevConfig(Config):
    DEVELOPMENT = True
    DEBUG = True
    FLASK_DEBUG = True
    PYCHARM_DEBUG = False

    DB_HOST = environ.get('DB_HOST', 'localhost')
    DB_USER = environ.get('DB_USER', 'luke')
    DB_PASS = environ.get('DB_PASS', 'Skywalker!')
    DB_NAME = environ.get('DB_NAME', 'eveai')
    SQLALCHEMY_DATABASE_URI = f'postgresql+pg8000://{DB_USER}:{DB_PASS}@{DB_HOST}:5432/{DB_NAME}'
    SQLALCHEMY_BINDS = {'public': SQLALCHEMY_DATABASE_URI}
    EXPLAIN_TEMPLATE_LOADING = False

    # Define the nginx prefix used for the specific apps
    EVEAI_APP_LOCATION_PREFIX = '/admin'
    EVEAI_CHAT_LOCATION_PREFIX = '/chat'

    # flask-mailman settings
    MAIL_USERNAME = 'eveai_super@flow-it.net'
    MAIL_PASSWORD = '$6xsWGbNtx$CFMQZqc*'

    # file upload settings
    UPLOAD_FOLDER = '/app/tenant_files'

    # Celery settings
    # eveai_app Redis settings
    CELERY_BROKER_URL = 'redis://redis:6379/0'
    CELERY_RESULT_BACKEND = 'redis://redis:6379/0'
    # eveai_chat Redis settings
    CELERY_BROKER_URL_CHAT = 'redis://redis:6379/3'
    CELERY_RESULT_BACKEND_CHAT = 'redis://redis:6379/3'
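
    # Illustrative sketch (an assumption based on the paired settings above):
    # the chat worker presumably runs as a second Celery app on Redis db 3:
    #
    #     from celery import Celery
    #     chat_celery = Celery('eveai_chat',
    #                          broker=DevConfig.CELERY_BROKER_URL_CHAT,
    #                          backend=DevConfig.CELERY_RESULT_BACKEND_CHAT)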

    # OpenAI API keys
    OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
    # Groq API keys
    GROQ_API_KEY = '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'

    # Unstructured settings
    UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
    UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'
    UNSTRUCTURED_FULL_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io/general/v0/general'

    # SocketIO settings
    SOCKETIO_MESSAGE_QUEUE = 'redis://redis:6379/1'
    SOCKETIO_CORS_ALLOWED_ORIGINS = '*'
    SOCKETIO_LOGGER = True
    SOCKETIO_ENGINEIO_LOGGER = True
    SOCKETIO_PING_TIMEOUT = 20  # seconds; Flask-SocketIO expects seconds, not milliseconds
    SOCKETIO_PING_INTERVAL = 25  # seconds
    # Changing SOCKETIO_MAX_IDLE_TIME ==> also change the maxConnectionDuration value in eveai-chat-widget.js
    SOCKETIO_MAX_IDLE_TIME = timedelta(minutes=60)

    # Google Cloud settings
    GC_PROJECT_NAME = 'eveai-420711'
    GC_LOCATION = 'europe-west1'
    GC_KEY_RING = 'eveai-chat'
    GC_CRYPTO_KEY = 'envelope-encryption-key'

    # JWT settings
    JWT_SECRET_KEY = 'bsdMkmQ8ObfMD52yAFg4trrvjgjMhuIqg2fjDpD/JqvgY0ccCcmlsEnVFmR79WPiLKEA3i8a5zmejwLZKl4v9Q=='

    # Session settings
    SESSION_REDIS = redis.from_url('redis://redis:6379/2')

    # PATH settings
    # NB: Flask's from_object() only copies uppercase attributes, so
    # ffmpeg_path will not land in app.config.
    ffmpeg_path = '/usr/bin/ffmpeg'


class ProdConfig(Config):
    DEVELOPMENT = False
    DEBUG = False
    # SQLALCHEMY_DATABASE_URI = environ.get('SQLALCHEMY_DATABASE_URI') or \
    #     'sqlite:///' + path.join(basedir, 'db.sqlite')


config = {
    'dev': DevConfig(),
    'prod': ProdConfig(),
    'default': DevConfig(),
}
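
# Minimal usage sketch (the FLASK_ENV selector below is an assumption):
#
#     from flask import Flask
#     app = Flask(__name__)
#     app.config.from_object(config[environ.get('FLASK_ENV', 'default')])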