- Prometheus metrics go via pushgateway, as different worker processes might have different registries that are not picked up by Prometheus

This commit is contained in:
Josako
2025-03-25 15:48:00 +01:00
parent b6ee7182de
commit 4ea16521e2
9 changed files with 191 additions and 123 deletions

View File

@@ -25,6 +25,38 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Security
- In case of vulnerabilities.
## [2.2.0-alfa]
### Added
- Mistral AI as main provider for embeddings, chains and specialists
- Usage measuring for specialists
- RAG from chain to specialist technology
- Dossier catalog management possibilities added to eveai_app
- Asset definition (Paused - other priorities)
- Prometheus and Grafana
- Add prometheus monitoring to business events
- Asynchronous execution of specialists
### Changed
- Moved choice for AI providers / models to specialists and prompts
- Improve RAG to not repeat historic answers
- Fixed embedding model, no more choices allowed
- Clean URL (of tracking parameters) before adding it to a catalog
### Deprecated
- For soon-to-be removed features.
### Removed
- Add Multiple URLs removed from menu
- Old Specialist items removed from interaction menu
### Fixed
- Set default language when registering Documents or URLs.
### Security
- In case of vulnerabilities.
## [2.1.0-alfa]
### Added

View File

@@ -15,7 +15,6 @@ from .langchain.templates.template_manager import TemplateManager
from .utils.cache.eveai_cache_manager import EveAICacheManager
from .utils.simple_encryption import SimpleEncryption
from .utils.minio_utils import MinioClient
from .utils.performance_monitoring import EveAIMetrics
# Create extensions
@@ -35,5 +34,4 @@ minio_client = MinioClient()
metrics = PrometheusMetrics.for_app_factory()
template_manager = TemplateManager()
cache_manager = EveAICacheManager()
eveai_metrics = EveAIMetrics()

View File

@@ -6,16 +6,18 @@ from datetime import datetime
from typing import Dict, Any, Optional, List
from datetime import datetime as dt, timezone as tz
import logging
from prometheus_client import Counter, Histogram, Gauge, Summary
from flask import current_app
from prometheus_client import Counter, Histogram, Gauge, Summary, push_to_gateway, REGISTRY
from .business_event_context import BusinessEventContext
from common.models.entitlements import BusinessEventLog
from common.extensions import db
from .celery_utils import current_celery
from common.utils.performance_monitoring import EveAIMetrics
from common.utils.prometheus_utils import sanitize_label
# Standard duration buckets for all histograms
DURATION_BUCKETS = EveAIMetrics.get_standard_buckets()
DURATION_BUCKETS = [0.1, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 240, 360, float('inf')]
# Prometheus metrics for business events
TRACE_COUNTER = Counter(
@@ -112,14 +114,24 @@ class BusinessEvent:
# Prometheus label values must be strings
self.tenant_id_str = str(self.tenant_id)
self.event_type_str = sanitize_label(self.event_type)
self.specialist_id_str = str(self.specialist_id) if self.specialist_id else ""
self.specialist_type_str = str(self.specialist_type) if self.specialist_type else ""
self.specialist_type_version_str = str(self.specialist_type_version) if self.specialist_type_version else ""
self.specialist_type_version_str = sanitize_label(str(self.specialist_type_version)) \
if self.specialist_type_version else ""
self.span_name_str = ""
current_app.logger.debug(f"Labels for metrics: "
f"tenant_id={self.tenant_id_str}, "
f"event_type={self.event_type_str},"
f"specialist_id={self.specialist_id_str}, "
f"specialist_type={self.specialist_type_str}, " +
f"specialist_type_version={self.specialist_type_version_str}")
# Increment concurrent events gauge when initialized
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -128,12 +140,14 @@ class BusinessEvent:
# Increment trace counter
TRACE_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self._push_to_gateway()
def update_attribute(self, attribute: str, value: any):
if hasattr(self, attribute):
setattr(self, attribute, value)
@@ -143,9 +157,13 @@ class BusinessEvent:
elif attribute == 'specialist_type':
self.specialist_type_str = str(value) if value else ""
elif attribute == 'specialist_type_version':
self.specialist_type_version_str = str(value) if value else ""
self.specialist_type_version_str = sanitize_label(str(value)) if value else ""
elif attribute == 'tenant_id':
self.tenant_id_str = str(value)
elif attribute == 'event_type':
self.event_type_str = sanitize_label(value)
elif attribute == 'span_name':
self.span_name_str = sanitize_label(value)
else:
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{attribute}'")
@@ -158,13 +176,21 @@ class BusinessEvent:
self.llm_metrics['interaction_type'] = metrics['interaction_type']
# Track in Prometheus metrics
interaction_type = metrics['interaction_type']
interaction_type_str = sanitize_label(metrics['interaction_type']) if metrics['interaction_type'] else ""
current_app.logger.debug(f"Labels for metrics: "
f"tenant_id={self.tenant_id_str}, "
f"event_type={self.event_type_str},"
f"interaction_type={interaction_type_str}, "
f"specialist_id={self.specialist_id_str}, "
f"specialist_type={self.specialist_type_str}, "
f"specialist_type_version={self.specialist_type_version_str}")
# Track token usage
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
event_type=self.event_type_str,
interaction_type=interaction_type_str,
token_type='total',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
@@ -173,8 +199,8 @@ class BusinessEvent:
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
event_type=self.event_type_str,
interaction_type=interaction_type_str,
token_type='prompt',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
@@ -183,8 +209,8 @@ class BusinessEvent:
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
event_type=self.event_type_str,
interaction_type=interaction_type_str,
token_type='completion',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
@@ -194,8 +220,8 @@ class BusinessEvent:
# Track duration
LLM_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
event_type=self.event_type_str,
interaction_type=interaction_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -204,13 +230,15 @@ class BusinessEvent:
# Track call count
LLM_CALLS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
event_type=self.event_type_str,
interaction_type=interaction_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self._push_to_gateway()
def reset_llm_metrics(self):
self.llm_metrics['total_tokens'] = 0
self.llm_metrics['prompt_tokens'] = 0
@@ -236,16 +264,25 @@ class BusinessEvent:
# Set the new span info
self.span_id = new_span_id
self.span_name = span_name
self.span_name_str = sanitize_label(span_name) if span_name else ""
self.parent_span_id = parent_span_id
# Track start time for the span
span_start_time = time.time()
current_app.logger.debug(f"Labels for metrics: "
f"tenant_id={self.tenant_id_str}, "
f"event_type={self.event_type_str}, "
f"activity_name={self.span_name_str}, "
f"specialist_id={self.specialist_id_str}, "
f"specialist_type={self.specialist_type_str}, "
f"specialist_type_version={self.specialist_type_version_str}")
# Increment span metrics - using span_name as activity_name for metrics
SPAN_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -254,13 +291,15 @@ class BusinessEvent:
# Increment concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self._push_to_gateway()
self.log(f"Start")
try:
@@ -272,8 +311,8 @@ class BusinessEvent:
# Observe span duration
SPAN_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -282,13 +321,15 @@ class BusinessEvent:
# Decrement concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
self._push_to_gateway()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()
@@ -296,10 +337,12 @@ class BusinessEvent:
# Restore the previous span info
if self.spans:
self.span_id, self.span_name, self.parent_span_id = self.spans.pop()
self.span_name_str = sanitize_label(span_name) if span_name else ""
else:
self.span_id = None
self.span_name = None
self.parent_span_id = None
self.span_name_str = ""
@asynccontextmanager
async def create_span_async(self, span_name: str):
@@ -314,16 +357,25 @@ class BusinessEvent:
# Set the new span info
self.span_id = new_span_id
self.span_name = span_name
self.span_name_str = sanitize_label(span_name) if span_name else ""
self.parent_span_id = parent_span_id
# Track start time for the span
span_start_time = time.time()
current_app.logger.debug(f"Labels for metrics: "
f"tenant_id={self.tenant_id_str}, "
f"event_type={self.event_type_str}, "
f"activity_name={self.span_name_str}, "
f"specialist_id={self.specialist_id_str}, "
f"specialist_type={self.specialist_type_str}, "
f"specialist_type_version={self.specialist_type_version_str}")
# Increment span metrics - using span_name as activity_name for metrics
SPAN_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -332,13 +384,15 @@ class BusinessEvent:
# Increment concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self._push_to_gateway()
self.log(f"Start")
try:
@@ -350,8 +404,8 @@ class BusinessEvent:
# Observe span duration
SPAN_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -360,13 +414,15 @@ class BusinessEvent:
# Decrement concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
event_type=self.event_type_str,
activity_name=self.span_name_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
self._push_to_gateway()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()
@@ -374,10 +430,12 @@ class BusinessEvent:
# Restore the previous span info
if self.spans:
self.span_id, self.span_name, self.parent_span_id = self.spans.pop()
self.span_name_str = sanitize_label(span_name) if span_name else ""
else:
self.span_id = None
self.span_name = None
self.parent_span_id = None
self.span_name_str = ""
def log(self, message: str, level: str = 'info', extra_fields: Dict[str, Any] = None):
log_data = {
@@ -526,6 +584,17 @@ class BusinessEvent:
# Clear the buffer after sending
self._log_buffer = []
def _push_to_gateway(self):
    """Push all metrics in the default registry to the Prometheus Push Gateway.

    Worker processes may each hold their own registry that Prometheus cannot
    scrape directly, so metrics are exported via the push gateway instead.
    Failures are logged and swallowed so that metric export can never break
    the business event being traced.
    """
    # Push metrics to the gateway
    try:
        # NOTE(review): pushing the global REGISTRY under a single job name per
        # component means concurrent workers replace each other's pushed groups —
        # confirm whether a per-worker grouping_key is needed.
        push_to_gateway(
            current_app.config['PUSH_GATEWAY_URL'],
            job=current_app.config['COMPONENT_NAME'],
            registry=REGISTRY
        )
    except Exception as e:
        # Best-effort export: log and continue rather than fail the trace.
        current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
def __enter__(self):
self.trace_start_time = time.time()
self.log(f'Starting Trace for {self.event_type}')
@@ -537,7 +606,7 @@ class BusinessEvent:
# Record trace duration
TRACE_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -546,18 +615,22 @@ class BusinessEvent:
# Decrement concurrent traces gauge
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
self._push_to_gateway()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()
self.log(f'Ending Trace for {self.event_type}', extra_fields={'trace_duration': trace_total_time})
self._flush_log_buffer()
return BusinessEventContext(self).__exit__(exc_type, exc_val, exc_tb)
async def __aenter__(self):
@@ -571,7 +644,7 @@ class BusinessEvent:
# Record trace duration
TRACE_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
@@ -580,12 +653,14 @@ class BusinessEvent:
# Decrement concurrent traces gauge
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
event_type=self.event_type_str,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
self._push_to_gateway()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()

View File

@@ -1,59 +0,0 @@
import time
import threading
from contextlib import contextmanager
from functools import wraps
from prometheus_client import Counter, Histogram, Summary, start_http_server, Gauge
from flask import current_app, g, request, Flask
class EveAIMetrics:
    """Central class for the Prometheus metrics infrastructure.

    Brings up the Prometheus metrics HTTP endpoint and offers the helpers
    shared across components; metrics specific to a single component are
    declared in that component's own module.
    """

    def __init__(self, app: Flask = None):
        # Supports both direct construction and the Flask app-factory pattern.
        self.app = app
        self._metrics_server_started = False
        if app is not None:
            self.init_app(app)

    def init_app(self, app: Flask):
        """Attach the Flask app and start the Prometheus metrics server."""
        self.app = app
        self._start_metrics_server()

    def _start_metrics_server(self):
        """Start the Prometheus HTTP server once; failures are logged, not raised."""
        if self._metrics_server_started:
            return
        try:
            metrics_port = self.app.config.get('PROMETHEUS_PORT', 8000)
            start_http_server(metrics_port)
            self.app.logger.info(f"Prometheus metrics server started on port {metrics_port}")
            self._metrics_server_started = True
        except Exception as e:
            self.app.logger.error(f"Failed to start metrics server: {e}")

    @staticmethod
    def get_standard_buckets():
        """Return the standard duration buckets used by all histogram metrics."""
        return [0.1, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 240, 360, float('inf')]

    @staticmethod
    def sanitize_label_values(labels_dict):
        """Stringify every label value as Prometheus requires.

        Args:
            labels_dict: Mapping of label name to raw label value.

        Returns:
            A new dict with each value converted via str(); None maps to "".
        """
        sanitized = {}
        for name, raw in labels_dict.items():
            sanitized[name] = "" if raw is None else str(raw)
        return sanitized

View File

@@ -0,0 +1,11 @@
import re

from flask import current_app
from prometheus_client import push_to_gateway
# Compiled once at import time instead of re-importing/re-compiling on every
# call; matches every character outside [a-zA-Z0-9_].
_INVALID_LABEL_CHARS = re.compile(r'[^a-zA-Z0-9_]')


def sanitize_label(value):
    """Convert value to a valid Prometheus label by replacing invalid chars.

    Args:
        value: Any value (may be None) destined for use as a Prometheus label.

    Returns:
        str: ``str(value)`` with every character outside ``[a-zA-Z0-9_]``
        replaced by an underscore; the empty string when value is None.
    """
    if value is None:
        return ""
    # Replace spaces and special chars with underscores
    return _INVALID_LABEL_CHARS.sub('_', str(value))

View File

@@ -14,7 +14,17 @@ class Config(object):
SECRET_KEY = environ.get('SECRET_KEY')
SESSION_COOKIE_SECURE = False
SESSION_COOKIE_HTTPONLY = True
SESSION_KEY_PREFIX = f'{environ.get('COMPONENT_NAME')}_'
COMPONENT_NAME = environ.get('COMPONENT_NAME')
SESSION_KEY_PREFIX = f'{COMPONENT_NAME}_'
# Database Settings
DB_HOST = environ.get('DB_HOST')
DB_USER = environ.get('DB_USER')
DB_PASS = environ.get('DB_PASS')
DB_NAME = environ.get('DB_NAME')
DB_PORT = environ.get('DB_PORT')
SQLALCHEMY_DATABASE_URI = f'postgresql+pg8000://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
SQLALCHEMY_BINDS = {'public': SQLALCHEMY_DATABASE_URI}
WTF_CSRF_ENABLED = True
WTF_CSRF_TIME_LIMIT = None
@@ -154,12 +164,17 @@ class Config(object):
COMPRESSION_PROCESS_DELAY = 1
# WordPress Integration Settings
WORDPRESS_PROTOCOL = os.environ.get('WORDPRESS_PROTOCOL', 'http')
WORDPRESS_HOST = os.environ.get('WORDPRESS_HOST', 'host.docker.internal')
WORDPRESS_PORT = os.environ.get('WORDPRESS_PORT', '10003')
WORDPRESS_PROTOCOL = environ.get('WORDPRESS_PROTOCOL', 'http')
WORDPRESS_HOST = environ.get('WORDPRESS_HOST', 'host.docker.internal')
WORDPRESS_PORT = environ.get('WORDPRESS_PORT', '10003')
WORDPRESS_BASE_URL = f"{WORDPRESS_PROTOCOL}://{WORDPRESS_HOST}:{WORDPRESS_PORT}"
EXTERNAL_WORDPRESS_BASE_URL = 'localhost:10003'
# Prometheus Push Gateway
PUSH_GATEWAY_HOST = environ.get('PUSH_GATEWAY_HOST', 'pushgateway')
PUSH_GATEWAY_PORT = environ.get('PUSH_GATEWAY_PORT', '9091')
PUSH_GATEWAY_URL = f"{PUSH_GATEWAY_HOST}:{PUSH_GATEWAY_PORT}"
class DevConfig(Config):
DEVELOPMENT = True
@@ -167,14 +182,6 @@ class DevConfig(Config):
FLASK_DEBUG = True
EXPLAIN_TEMPLATE_LOADING = False
# Database Settings
DB_HOST = environ.get('DB_HOST', 'localhost')
DB_USER = environ.get('DB_USER', 'luke')
DB_PASS = environ.get('DB_PASS', 'Skywalker!')
DB_NAME = environ.get('DB_NAME', 'eveai')
SQLALCHEMY_DATABASE_URI = f'postgresql+pg8000://{DB_USER}:{DB_PASS}@{DB_HOST}:5432/{DB_NAME}'
SQLALCHEMY_BINDS = {'public': SQLALCHEMY_DATABASE_URI}
# Define the nginx prefix used for the specific apps
EVEAI_APP_LOCATION_PREFIX = '/admin'
EVEAI_CHAT_LOCATION_PREFIX = '/chat'
@@ -237,7 +244,6 @@ class DevConfig(Config):
class ProdConfig(Config):
DEVELOPMENT = False
DEBUG = False
DEBUG = False
FLASK_DEBUG = False
EXPLAIN_TEMPLATE_LOADING = False
@@ -246,15 +252,6 @@ class ProdConfig(Config):
WTF_CSRF_SSL_STRICT = True # Set to True if using HTTPS
# Database Settings
DB_HOST = environ.get('DB_HOST')
DB_USER = environ.get('DB_USER')
DB_PASS = environ.get('DB_PASS')
DB_NAME = environ.get('DB_NAME')
DB_PORT = environ.get('DB_PORT')
SQLALCHEMY_DATABASE_URI = f'postgresql+pg8000://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
SQLALCHEMY_BINDS = {'public': SQLALCHEMY_DATABASE_URI}
# flask-mailman settings
MAIL_SERVER = 'mail.askeveai.com'
MAIL_PORT = 587

View File

@@ -39,6 +39,8 @@ x-common-variables: &common-variables
LANGCHAIN_API_KEY: "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"
SERPER_API_KEY: "e4c553856d0e6b5a171ec5e6b69d874285b9badf"
CREWAI_STORAGE_DIR: "/app/crewai_storage"
PUSH_GATEWAY_HOST: "pushgateway"
PUSH_GATEWAY_PORT: "9091"
services:
nginx:
@@ -389,6 +391,14 @@ services:
networks:
- eveai-network
pushgateway:
image: prom/pushgateway:latest
restart: unless-stopped
ports:
- "9091:9091"
networks:
- eveai-network
grafana:
image: grafana/grafana:latest
container_name: grafana

View File

@@ -32,3 +32,8 @@ scrape_configs:
static_configs:
- targets: ['eveai_entitlements:8000']
scrape_interval: 10s
- job_name: 'pushgateway'
honor_labels: true
static_configs:
- targets: [ 'pushgateway:9091' ]

View File

@@ -5,7 +5,7 @@ import os
from common.langchain.templates.template_manager import TemplateManager
from common.utils.celery_utils import make_celery, init_celery
from common.extensions import db, template_manager, cache_manager, eveai_metrics
from common.extensions import db, template_manager, cache_manager
from config.logging_config import LOGGING
from config.config import get_config
@@ -45,7 +45,6 @@ def register_extensions(app):
db.init_app(app)
cache_manager.init_app(app)
template_manager.init_app(app)
eveai_metrics.init_app(app)
def register_cache_handlers(app):