- Add Prometheus and Grafana services in development
- Add Prometheus metrics to the business events
- Ensure asynchronous behaviour of CrewAI specialists
- Adapt business events to work in mixed synchronous/asynchronous contexts
- Extend business events with specialist information
- Start adding a Grafana dashboard (TBC)
Josako
2025-03-24 16:39:22 +01:00
parent 238bdb58f4
commit b6ee7182de
25 changed files with 1337 additions and 83 deletions

.gitignore
View File

@@ -50,3 +50,5 @@ scripts/__pycache__/run_eveai_app.cpython-312.pyc
/tests/interactive_client/specialist_client.log
/.repopackignore
/patched_packages/crewai/
/docker/prometheus/data/
/docker/grafana/data/

View File

@@ -5,7 +5,7 @@ from typing import List
class EveAIEmbeddings:
    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
-       pass
+       raise NotImplementedError

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]

View File

@@ -15,6 +15,7 @@ from .langchain.templates.template_manager import TemplateManager
from .utils.cache.eveai_cache_manager import EveAICacheManager
from .utils.simple_encryption import SimpleEncryption
from .utils.minio_utils import MinioClient
from .utils.performance_monitoring import EveAIMetrics

# Create extensions
@@ -33,6 +34,6 @@ simple_encryption = SimpleEncryption()
minio_client = MinioClient()
metrics = PrometheusMetrics.for_app_factory()
template_manager = TemplateManager()

# Caching classes
cache_manager = EveAICacheManager()
eveai_metrics = EveAIMetrics()

View File

@@ -15,6 +15,9 @@ class BusinessEventLog(db.Model):
    parent_span_id = db.Column(db.String(50))
    document_version_id = db.Column(db.Integer)
    document_version_file_size = db.Column(db.Float)
    specialist_id = db.Column(db.Integer)
    specialist_type = db.Column(db.String(50))
    specialist_type_version = db.Column(db.String(20))
    chat_session_id = db.Column(db.String(50))
    interaction_id = db.Column(db.Integer)
    environment = db.Column(db.String(20))

View File

@@ -1,16 +1,81 @@
import os
import time
import uuid
-from contextlib import contextmanager
+from contextlib import contextmanager, asynccontextmanager
from datetime import datetime
from typing import Dict, Any, Optional, List
from datetime import datetime as dt, timezone as tz
import logging
from prometheus_client import Counter, Histogram, Gauge, Summary

from .business_event_context import BusinessEventContext
from common.models.entitlements import BusinessEventLog
from common.extensions import db
from .celery_utils import current_celery
from common.utils.performance_monitoring import EveAIMetrics
# Standard duration buckets for all histograms
DURATION_BUCKETS = EveAIMetrics.get_standard_buckets()
# Prometheus metrics for business events
TRACE_COUNTER = Counter(
'eveai_business_events_total',
'Total number of business events triggered',
['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
)
TRACE_DURATION = Histogram(
'eveai_business_events_duration_seconds',
'Duration of business events in seconds',
['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version'],
buckets=DURATION_BUCKETS
)
CONCURRENT_TRACES = Gauge(
'eveai_business_events_concurrent',
'Number of concurrent business events',
['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
)
SPAN_COUNTER = Counter(
'eveai_business_spans_total',
'Total number of spans within business events',
['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version']
)
SPAN_DURATION = Histogram(
'eveai_business_spans_duration_seconds',
'Duration of spans within business events in seconds',
['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version'],
buckets=DURATION_BUCKETS
)
CONCURRENT_SPANS = Gauge(
'eveai_business_spans_concurrent',
'Number of concurrent spans within business events',
['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version']
)
# LLM Usage metrics
LLM_TOKENS_COUNTER = Counter(
'eveai_llm_tokens_total',
'Total number of tokens used in LLM calls',
['tenant_id', 'event_type', 'interaction_type', 'token_type', 'specialist_id', 'specialist_type',
'specialist_type_version']
)
LLM_DURATION = Histogram(
'eveai_llm_duration_seconds',
'Duration of LLM API calls in seconds',
['tenant_id', 'event_type', 'interaction_type', 'specialist_id', 'specialist_type', 'specialist_type_version'],
buckets=DURATION_BUCKETS
)
LLM_CALLS_COUNTER = Counter(
'eveai_llm_calls_total',
'Total number of LLM API calls',
['tenant_id', 'event_type', 'interaction_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
)
class BusinessEvent:
@@ -29,6 +94,9 @@ class BusinessEvent:
        self.document_version_file_size = kwargs.get('document_version_file_size')
        self.chat_session_id = kwargs.get('chat_session_id')
        self.interaction_id = kwargs.get('interaction_id')
self.specialist_id = kwargs.get('specialist_id')
self.specialist_type = kwargs.get('specialist_type')
self.specialist_type_version = kwargs.get('specialist_type_version')
        self.environment = os.environ.get("FLASK_ENV", "development")
        self.span_counter = 0
        self.spans = []
@@ -42,9 +110,42 @@ class BusinessEvent:
        }
        self._log_buffer = []
# Prometheus label values must be strings
self.tenant_id_str = str(self.tenant_id)
self.specialist_id_str = str(self.specialist_id) if self.specialist_id else ""
self.specialist_type_str = str(self.specialist_type) if self.specialist_type else ""
self.specialist_type_version_str = str(self.specialist_type_version) if self.specialist_type_version else ""
# Increment concurrent events gauge when initialized
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
# Increment trace counter
TRACE_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
    def update_attribute(self, attribute: str, value: any):
        if hasattr(self, attribute):
            setattr(self, attribute, value)
# Update string versions for Prometheus labels if needed
if attribute == 'specialist_id':
self.specialist_id_str = str(value) if value else ""
elif attribute == 'specialist_type':
self.specialist_type_str = str(value) if value else ""
elif attribute == 'specialist_type_version':
self.specialist_type_version_str = str(value) if value else ""
elif attribute == 'tenant_id':
self.tenant_id_str = str(value)
        else:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{attribute}'")
@@ -56,6 +157,60 @@ class BusinessEvent:
        self.llm_metrics['call_count'] += 1
        self.llm_metrics['interaction_type'] = metrics['interaction_type']
# Track in Prometheus metrics
interaction_type = metrics['interaction_type']
# Track token usage
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
token_type='total',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc(metrics['total_tokens'])
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
token_type='prompt',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc(metrics['prompt_tokens'])
LLM_TOKENS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
token_type='completion',
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc(metrics['completion_tokens'])
# Track duration
LLM_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).observe(metrics['time_elapsed'])
# Track call count
LLM_CALLS_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
interaction_type=interaction_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
    def reset_llm_metrics(self):
        self.llm_metrics['total_tokens'] = 0
        self.llm_metrics['prompt_tokens'] = 0
@@ -86,6 +241,26 @@ class BusinessEvent:
        # Track start time for the span
        span_start_time = time.time()
# Increment span metrics - using span_name as activity_name for metrics
SPAN_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
# Increment concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self.log(f"Start") self.log(f"Start")
try: try:
@@ -94,6 +269,104 @@ class BusinessEvent:
            # Calculate total time for this span
            span_total_time = time.time() - span_start_time
# Observe span duration
SPAN_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).observe(span_total_time)
# Decrement concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()
self.log(f"End", extra_fields={'span_duration': span_total_time})
# Restore the previous span info
if self.spans:
self.span_id, self.span_name, self.parent_span_id = self.spans.pop()
else:
self.span_id = None
self.span_name = None
self.parent_span_id = None
@asynccontextmanager
async def create_span_async(self, span_name: str):
"""Async version of create_span using async context manager"""
parent_span_id = self.span_id
self.span_counter += 1
new_span_id = str(uuid.uuid4())
# Save the current span info
self.spans.append((self.span_id, self.span_name, self.parent_span_id))
# Set the new span info
self.span_id = new_span_id
self.span_name = span_name
self.parent_span_id = parent_span_id
# Track start time for the span
span_start_time = time.time()
# Increment span metrics - using span_name as activity_name for metrics
SPAN_COUNTER.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
# Increment concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).inc()
self.log(f"Start")
try:
yield
finally:
# Calculate total time for this span
span_total_time = time.time() - span_start_time
# Observe span duration
SPAN_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).observe(span_total_time)
# Decrement concurrent spans gauge
CONCURRENT_SPANS.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
activity_name=span_name,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
            if self.llm_metrics['call_count'] > 0:
                self.log_final_metrics()
                self.reset_llm_metrics()
@@ -119,6 +392,9 @@ class BusinessEvent:
            'document_version_file_size': self.document_version_file_size,
            'chat_session_id': self.chat_session_id,
            'interaction_id': self.interaction_id,
'specialist_id': self.specialist_id,
'specialist_type': self.specialist_type,
'specialist_type_version': self.specialist_type_version,
            'environment': self.environment,
            'message': message,
        }
@@ -149,6 +425,9 @@ class BusinessEvent:
            'document_version_file_size': self.document_version_file_size,
            'chat_session_id': self.chat_session_id,
            'interaction_id': self.interaction_id,
'specialist_id': self.specialist_id,
'specialist_type': self.specialist_type,
'specialist_type_version': self.specialist_type_version,
            'environment': self.environment,
            'llm_metrics_total_tokens': metrics['total_tokens'],
            'llm_metrics_prompt_tokens': metrics['prompt_tokens'],
@@ -174,6 +453,9 @@ class BusinessEvent:
            'document_version_file_size': self.document_version_file_size,
            'chat_session_id': self.chat_session_id,
            'interaction_id': self.interaction_id,
'specialist_id': self.specialist_id,
'specialist_type': self.specialist_type,
'specialist_type_version': self.specialist_type_version,
            'environment': self.environment,
            'llm_metrics_total_tokens': self.llm_metrics['total_tokens'],
            'llm_metrics_prompt_tokens': self.llm_metrics['prompt_tokens'],
@@ -203,6 +485,9 @@ class BusinessEvent:
                document_version_file_size=entry.pop('document_version_file_size', None),
                chat_session_id=entry.pop('chat_session_id', None),
                interaction_id=entry.pop('interaction_id', None),
specialist_id=entry.pop('specialist_id', None),
specialist_type=entry.pop('specialist_type', None),
specialist_type_version=entry.pop('specialist_type_version', None),
                environment=entry.pop('environment', None),
                llm_metrics_total_tokens=entry.pop('llm_metrics_total_tokens', None),
                llm_metrics_prompt_tokens=entry.pop('llm_metrics_prompt_tokens', None),
@@ -249,6 +534,24 @@ class BusinessEvent:
    def __exit__(self, exc_type, exc_val, exc_tb):
        trace_total_time = time.time() - self.trace_start_time
# Record trace duration
TRACE_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).observe(trace_total_time)
# Decrement concurrent traces gauge
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
        if self.llm_metrics['call_count'] > 0:
            self.log_final_metrics()
            self.reset_llm_metrics()
@@ -256,3 +559,37 @@ class BusinessEvent:
        self.log(f'Ending Trace for {self.event_type}', extra_fields={'trace_duration': trace_total_time})
        self._flush_log_buffer()
        return BusinessEventContext(self).__exit__(exc_type, exc_val, exc_tb)
async def __aenter__(self):
self.trace_start_time = time.time()
self.log(f'Starting Trace for {self.event_type}')
return await BusinessEventContext(self).__aenter__()
async def __aexit__(self, exc_type, exc_val, exc_tb):
trace_total_time = time.time() - self.trace_start_time
# Record trace duration
TRACE_DURATION.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).observe(trace_total_time)
# Decrement concurrent traces gauge
CONCURRENT_TRACES.labels(
tenant_id=self.tenant_id_str,
event_type=self.event_type,
specialist_id=self.specialist_id_str,
specialist_type=self.specialist_type_str,
specialist_type_version=self.specialist_type_version_str
).dec()
if self.llm_metrics['call_count'] > 0:
self.log_final_metrics()
self.reset_llm_metrics()
self.log(f'Ending Trace for {self.event_type}', extra_fields={'trace_duration': trace_total_time})
self._flush_log_buffer()
return await BusinessEventContext(self).__aexit__(exc_type, exc_val, exc_tb)
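Taken together, the new __aenter__/__aexit__ methods and create_span_async let callers drive a full trace from coroutine code. A minimal usage sketch, assuming the Flask/Celery wiring that BusinessEvent.log depends on is in place; the tenant and specialist values are illustrative only:

import asyncio

async def handle_chat_turn():
    async with BusinessEvent("Execute Specialist",
                             tenant_id=42,                 # illustrative values
                             specialist_id=7,
                             specialist_type="RAG",
                             specialist_type_version="1.0") as event:
        # Spans nest and restore the parent span on exit, as in the sync path
        async with event.create_span_async("Retrieve Context"):
            await asyncio.sleep(0.1)  # stand-in for awaited retriever work
        async with event.create_span_async("Generate Answer"):
            await asyncio.sleep(0.2)  # stand-in for an awaited crew kickoff

asyncio.run(handle_chat_turn())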

View File

@@ -1,9 +1,22 @@
from werkzeug.local import LocalProxy, LocalStack
import asyncio
from contextvars import ContextVar
import contextvars
# Keep existing stack for backward compatibility
_business_event_stack = LocalStack()
# Add contextvar for async support
_business_event_contextvar = ContextVar('business_event', default=None)
def _get_current_event():
# Try contextvar first (for async)
event = _business_event_contextvar.get()
if event is not None:
return event
# Fall back to the stack-based approach (for sync)
    top = _business_event_stack.top
    if top is None:
        raise RuntimeError("No business event context found. Are you sure you're in a business event?")
@@ -16,10 +29,24 @@ current_event = LocalProxy(_get_current_event)
class BusinessEventContext:
    def __init__(self, event):
        self.event = event
self._token = None # For storing contextvar token
    def __enter__(self):
        _business_event_stack.push(self.event)
        self._token = _business_event_contextvar.set(self.event)
        return self.event

    def __exit__(self, exc_type, exc_val, exc_tb):
        _business_event_stack.pop()
if self._token is not None:
_business_event_contextvar.reset(self._token)
async def __aenter__(self):
_business_event_stack.push(self.event)
self._token = _business_event_contextvar.set(self.event)
return self.event
async def __aexit__(self, exc_type, exc_val, exc_tb):
_business_event_stack.pop()
if self._token is not None:
_business_event_contextvar.reset(self._token)
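The ContextVar is what makes the mixed sync/async behaviour safe: a LocalStack is keyed on thread/greenlet identity, so coroutines interleaving on one event loop would see each other's events, while each asyncio task gets its own copy of a ContextVar's state. A standalone illustration of that isolation (not EveAI code):

import asyncio
from contextvars import ContextVar

current: ContextVar[str] = ContextVar("current", default="none")

async def worker(name: str):
    token = current.set(name)      # like BusinessEventContext.__aenter__
    await asyncio.sleep(0.01)      # the other task runs in between
    assert current.get() == name   # each task still sees its own value
    current.reset(token)           # like BusinessEventContext.__aexit__

async def main():
    await asyncio.gather(worker("event-A"), worker("event-B"))

asyncio.run(main())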

View File

@@ -59,7 +59,7 @@ class CacheHandler(Generic[T]):
        Returns:
            A serializable format of the instance.
        """
-       pass
+       raise NotImplementedError

    @abstractmethod
    def _from_cache_data(self, data: Any, **kwargs) -> T:
@@ -73,7 +73,7 @@ class CacheHandler(Generic[T]):
        Returns:
            The data in its usable format.
        """
-       pass
+       raise NotImplementedError

    @abstractmethod
    def _should_cache(self, value: T) -> bool:
@@ -86,7 +86,7 @@ class CacheHandler(Generic[T]):
        Returns:
            True if the value should be cached, False otherwise.
        """
-       pass
+       raise NotImplementedError

    def configure_keys(self, *components: str):
        """

View File

@@ -0,0 +1,59 @@
import time
import threading
from contextlib import contextmanager
from functools import wraps
from prometheus_client import Counter, Histogram, Summary, start_http_server, Gauge
from flask import current_app, g, request, Flask
class EveAIMetrics:
"""
Central class for Prometheus metrics infrastructure.
This class initializes the Prometheus HTTP server and provides
shared functionality for metrics across components.
Component-specific metrics should be defined in their respective modules.
"""
def __init__(self, app: Flask = None):
self.app = app
self._metrics_server_started = False
if app is not None:
self.init_app(app)
def init_app(self, app: Flask):
"""Initialize metrics with Flask app and start Prometheus server"""
self.app = app
self._start_metrics_server()
def _start_metrics_server(self):
"""Start the Prometheus metrics HTTP server if not already running"""
if not self._metrics_server_started:
try:
metrics_port = self.app.config.get('PROMETHEUS_PORT', 8000)
start_http_server(metrics_port)
self.app.logger.info(f"Prometheus metrics server started on port {metrics_port}")
self._metrics_server_started = True
except Exception as e:
self.app.logger.error(f"Failed to start metrics server: {e}")
@staticmethod
def get_standard_buckets():
"""
Return the standard duration buckets for histogram metrics.
Components should use these for consistency across the system.
"""
return [0.1, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 240, 360, float('inf')]
@staticmethod
def sanitize_label_values(labels_dict):
"""
Convert all label values to strings as required by Prometheus.
Args:
labels_dict: Dictionary of label name to label value
Returns:
Dictionary with all values converted to strings
"""
return {k: str(v) if v is not None else "" for k, v in labels_dict.items()}
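Other modules are expected to define their own metrics on top of these helpers; business_event.py above is the first example. A hypothetical component metric, just to show the intended pattern (the metric name and labels are made up):

from prometheus_client import Histogram

RETRIEVAL_DURATION = Histogram(
    'eveai_retrieval_duration_seconds',       # hypothetical metric name
    'Duration of retriever calls in seconds',
    ['tenant_id', 'retriever_type'],
    buckets=EveAIMetrics.get_standard_buckets(),
)

labels = EveAIMetrics.sanitize_label_values(
    {'tenant_id': 42, 'retriever_type': None}
)  # -> {'tenant_id': '42', 'retriever_type': ''}
RETRIEVAL_DURATION.labels(**labels).observe(0.42)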

View File

@@ -80,6 +80,8 @@ services:
      - linux/arm64
    ports:
      - 5001:5001
    expose:
      - 8000
    environment:
      <<: *common-variables
      COMPONENT_NAME: eveai_app
@@ -115,6 +117,8 @@ services:
    platforms:
      - linux/amd64
      - linux/arm64
    expose:
      - 8000
    environment:
      <<: *common-variables
      COMPONENT_NAME: eveai_workers
@@ -177,6 +181,8 @@ services:
    platforms:
      - linux/amd64
      - linux/arm64
    expose:
      - 8000
    environment:
      <<: *common-variables
      COMPONENT_NAME: eveai_chat_workers
@@ -205,6 +211,8 @@ services:
      - linux/arm64
    ports:
      - 5003:5003
    expose:
      - 8000
    environment:
      <<: *common-variables
      COMPONENT_NAME: eveai_api
@@ -266,6 +274,8 @@ services:
    platforms:
      - linux/amd64
      - linux/arm64
    expose:
      - 8000
    environment:
      <<: *common-variables
      COMPONENT_NAME: eveai_entitlements
@@ -361,6 +371,42 @@ services:
    networks:
      - eveai-network
prometheus:
image: prom/prometheus:latest
container_name: prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus/data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
networks:
- eveai-network
grafana:
image: grafana/grafana:latest
container_name: grafana
ports:
- "3000:3000"
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning
- ./grafana/data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
depends_on:
- prometheus
networks:
- eveai-network
networks:
  eveai-network:
    driver: bridge

View File

@@ -0,0 +1,627 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 1,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"color": {
"fixedColor": "#76599a",
"mode": "fixed"
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(increase(eveai_business_events_total[$__interval])) by (event_type)",
"refId": "A"
}
],
"title": "Business Events by Type",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 5
},
{
"color": "red",
"value": 10
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(eveai_business_events_concurrent)",
"refId": "A"
}
],
"title": "Concurrent Business Events",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.5.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(increase(eveai_business_events_total[$__range])) by (specialist_type)",
"refId": "A"
}
],
"title": "Events by Specialist Type",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "histogram_quantile(0.95, sum(rate(eveai_business_events_duration_seconds_bucket[$__interval])) by (le, event_type))",
"refId": "A"
}
],
"title": "Business Event Duration (95th percentile)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(increase(eveai_business_spans_total[$__interval])) by (activity_name)",
"refId": "A"
}
],
"title": "Activity Execution Count",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 60,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 0,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(increase(eveai_llm_tokens_total[$__interval])) by (token_type)",
"refId": "A"
}
],
"title": "LLM Token Usage by Type",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "histogram_quantile(0.95, sum(rate(eveai_llm_duration_seconds_bucket[$__interval])) by (le, interaction_type))",
"refId": "A"
}
],
"title": "LLM Duration by Interaction Type (95th percentile)",
"type": "timeseries"
}
],
"refresh": "15m",
"schemaVersion": 38,
"style": "dark",
"tags": ["eveai", "system"],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Prometheus",
"value": "PBFA97CFB590B2093"
},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "EveAI System Dashboard",
"uid": "eveai-system-dashboard",
"version": 1,
"weekStart": ""
}

View File

@@ -0,0 +1,8 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true

View File

@@ -0,0 +1,34 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_timeout: 10s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'eveai_app'
static_configs:
- targets: ['eveai_app:8000']
scrape_interval: 10s
- job_name: 'eveai_workers'
static_configs:
- targets: ['eveai_workers:8000']
scrape_interval: 10s
- job_name: 'eveai_chat_workers'
static_configs:
- targets: ['eveai_chat_workers:8000']
scrape_interval: 10s
- job_name: 'eveai_api'
static_configs:
- targets: ['eveai_api:8000']
scrape_interval: 10s
- job_name: 'eveai_entitlements'
static_configs:
- targets: ['eveai_entitlements:8000']
scrape_interval: 10s

View File

@@ -5,7 +5,7 @@ import os
from common.langchain.templates.template_manager import TemplateManager
from common.utils.celery_utils import make_celery, init_celery
-from common.extensions import db, template_manager, cache_manager
+from common.extensions import db, template_manager, cache_manager, eveai_metrics
from config.logging_config import LOGGING
from config.config import get_config
@@ -45,6 +45,7 @@ def register_extensions(app):
    db.init_app(app)
    cache_manager.init_app(app)
    template_manager.init_app(app)
    eveai_metrics.init_app(app)

def register_cache_handlers(app):

View File

@@ -27,7 +27,7 @@ class BaseRetriever(ABC):
    @abstractmethod
    def type(self) -> str:
        """The type of the retriever"""
-       pass
+       raise NotImplementedError
    def _setup_tuning_logger(self):
        try:
@@ -86,4 +86,4 @@ class BaseRetriever(ABC):
        Returns:
            List[Dict[str, Any]]: List of retrieved documents/content
        """
-       pass
+       raise NotImplementedError

View File

@@ -123,10 +123,10 @@ class RAGFlow(EveAICrewAIFlow[RAGFlowState]):
return "" return ""
@listen(process_inputs) @listen(process_inputs)
def execute_rag(self): async def execute_rag(self):
inputs = self.state.input.model_dump() inputs = self.state.input.model_dump()
try: try:
crew_output = self.rag_crew.kickoff(inputs=inputs) crew_output = await self.rag_crew.kickoff_async(inputs=inputs)
self.specialist_executor.log_tuning("RAG Crew Output", crew_output.model_dump()) self.specialist_executor.log_tuning("RAG Crew Output", crew_output.model_dump())
output_pydantic = crew_output.pydantic output_pydantic = crew_output.pydantic
if not output_pydantic: if not output_pydantic:
@@ -139,13 +139,13 @@ class RAGFlow(EveAICrewAIFlow[RAGFlowState]):
            self.exception_raised = True
            raise e

-   def kickoff(self, inputs=None):
-       with current_event.create_span("RAG Specialist Execution"):
+   async def execute_async(self, inputs=None):
+       async with current_event.create_span_async("RAG Specialist Execution"):
            self.specialist_executor.log_tuning("Inputs retrieved", inputs)
            self.state.input = RAGSpecialistInput.model_validate(inputs)
            self.specialist.update_progress("EveAI Flow Start", {"name": "RAG"})
            try:
-               result = super().kickoff()
+               result = await super().kickoff_async()
            except Exception as e:
                current_app.logger.error(f"Error kicking off Flow: {str(e)}")

View File

@@ -1,3 +1,4 @@
import asyncio
import json
from os import wait
from typing import Optional, List
@@ -136,8 +137,7 @@ class SpecialistExecutor(CrewAIBaseSpecialistExecutor):
"nr_of_questions": self.specialist.configuration.get('nr_of_questions', ''), "nr_of_questions": self.specialist.configuration.get('nr_of_questions', ''),
"identification": arguments.identification, "identification": arguments.identification,
} }
# crew_results = self.rag_crew.kickoff(inputs=flow_inputs)
# current_app.logger.debug(f"Test Crew Output received: {crew_results}")
flow_results = self.flow.kickoff(inputs=flow_inputs) flow_results = self.flow.kickoff(inputs=flow_inputs)
flow_state = self.flow.state flow_state = self.flow.state
@@ -214,10 +214,10 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
return "" return ""
@listen(process_inputs) @listen(process_inputs)
def execute_rag(self): async def execute_rag(self):
inputs = self.state.input.model_dump() inputs = self.state.input.model_dump()
try: try:
crew_output = self.rag_crew.kickoff(inputs=inputs) crew_output = await self.rag_crew.kickoff_async(inputs=inputs)
self.specialist_executor.log_tuning("RAG Crew Output", crew_output.model_dump()) self.specialist_executor.log_tuning("RAG Crew Output", crew_output.model_dump())
output_pydantic = crew_output.pydantic output_pydantic = crew_output.pydantic
if not output_pydantic: if not output_pydantic:
@@ -231,10 +231,11 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
            raise e

    @listen(process_inputs)
-   def execute_spin(self):
+   async def execute_spin(self):
        inputs = self.state.input.model_dump()
        try:
-           crew_output = self.spin_crew.kickoff(inputs=inputs)
+           crew_output = await self.spin_crew.kickoff_async(inputs=inputs)
+           current_app.logger.info(f"SPIN Crew Executed, output: {crew_output.model_dump()}")
            self.specialist_executor.log_tuning("Spin Crew Output", crew_output.model_dump())
            output_pydantic = crew_output.pydantic
            if not output_pydantic:
@@ -248,10 +249,10 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
            raise e

    @listen(process_inputs)
-   def execute_identification(self):
+   async def execute_identification(self):
        inputs = self.state.input.model_dump()
        try:
-           crew_output = self.identification_crew.kickoff(inputs=inputs)
+           crew_output = await self.identification_crew.kickoff_async(inputs=inputs)
            self.specialist_executor.log_tuning("Identification Crew Output", crew_output.model_dump())
            output_pydantic = crew_output.pydantic
            if not output_pydantic:
@@ -265,7 +266,7 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
            raise e

    @listen(and_(execute_rag, execute_spin, execute_identification))
-   def consolidate(self):
+   async def consolidate(self):
        inputs = self.state.input.model_dump()
        if self.state.rag_output:
            inputs["prepared_answers"] = self.state.rag_output.answer
@@ -277,7 +278,7 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
current_app.logger.debug(f"Additional Questions: {additional_questions}") current_app.logger.debug(f"Additional Questions: {additional_questions}")
inputs["additional_questions"] = additional_questions inputs["additional_questions"] = additional_questions
try: try:
crew_output = self.rag_consolidation_crew.kickoff(inputs=inputs) crew_output = await self.rag_consolidation_crew.kickoff_async(inputs=inputs)
self.specialist_executor.log_tuning("RAG Consolidation Crew Output", crew_output.model_dump()) self.specialist_executor.log_tuning("RAG Consolidation Crew Output", crew_output.model_dump())
output_pydantic = crew_output.pydantic output_pydantic = crew_output.pydantic
if not output_pydantic: if not output_pydantic:
@@ -290,13 +291,16 @@ class SPINFlow(EveAICrewAIFlow[SPINFlowState]):
            self.exception_raised = True
            raise e

-   def kickoff(self, inputs=None):
-       with current_event.create_span("SPIN Specialist Execution"):
+   async def execute_async(self, inputs=None):
+       current_app.logger.debug(f"Async kickoff {self.name}")
+       async with current_event.create_span_async("SPIN Specialist Execution"):
            self.specialist_executor.log_tuning("Inputs retrieved", inputs)
            self.state.input = SPINSpecialistInput.model_validate(inputs)
            self.specialist.update_progress("EveAI Flow Start", {"name": "SPIN"})
            try:
-               result = super().kickoff()
+               current_app.logger.debug(f"Async super kickoff {self.name}")
+               result = await super().kickoff_async()
+               current_app.logger.debug(f"Async super kickoff {self.name} ended")
            except Exception as e:
                current_app.logger.error(f"Error kicking off Flow: {str(e)}")

View File

@@ -28,13 +28,13 @@ class BaseSpecialistExecutor(ABC):
    @abstractmethod
    def type(self) -> str:
        """The type of the specialist"""
-       pass
+       raise NotImplementedError
    @property
    @abstractmethod
    def type_version(self) -> str:
        """The type version of the specialist"""
-       pass
+       raise NotImplementedError
    def _initialize_retrievers(self) -> List[BaseRetriever]:
        """Initialize all retrievers associated with this specialist"""
@@ -96,7 +96,7 @@ class BaseSpecialistExecutor(ABC):
    @abstractmethod
    def execute_specialist(self, arguments: SpecialistArguments) -> SpecialistResult:
        """Execute the specialist's logic"""
-       pass
+       raise NotImplementedError

def get_specialist_class(specialist_type: str, type_version: str):

View File

@@ -1,5 +1,7 @@
import asyncio
import json
import time
from abc import abstractmethod

from crewai import Agent, Task, Crew, Flow
from crewai.agents.parser import AgentAction, AgentFinish
@@ -21,8 +23,6 @@ class EveAICrewAIAgent(Agent):
        super().__init__(**kwargs)
        self.specialist = specialist
        self.name = name
-       self.specialist.log_tuning("Initializing EveAICrewAIAgent", {"name": name})
-       self.specialist.update_progress("EveAI Agent Initialisation", {"name": self.name})
    def execute_task(
            self,
@@ -40,28 +40,30 @@ class EveAICrewAIAgent(Agent):
        Returns:
            Output of the agent
        """
-       with current_event.create_span(f"Task Execution {task.name} by {self.name}"):
+       current_app.logger.debug(f"Task Execution {task.name} by {self.name}")
+       # with current_event.create_span(f"Task Execution {task.name} by {self.name}"):
        self.specialist.log_tuning("EveAI Agent Task Start",
                                   {"name": self.name,
                                    'task': task.name,
                                    })
        self.specialist.update_progress("EveAI Agent Task Start",
                                        {"name": self.name,
                                         'task': task.name,
                                         })
        result = super().execute_task(task, context, tools)
        self.specialist.log_tuning("EveAI Agent Task Complete",
                                   {"name": self.name,
                                    'task': task.name,
                                    'result': result,
                                    })
        self.specialist.update_progress("EveAI Agent Task Complete",
                                        {"name": self.name,
                                         'task': task.name,
                                         })
+       current_app.logger.debug(f"Task Execution Ended {task.name} by {self.name}")
        return result
@@ -76,8 +78,6 @@ class EveAICrewAITask(Task):
        # current_app.logger.debug(f"Task pydantic class for {name}: {"class", self.output_pydantic}")
        self.specialist = specialist
        self.name = name
-       self.specialist.log_tuning("Initializing EveAICrewAITask", {"name": name})
-       self.specialist.update_progress("EveAI Task Initialisation", {"name": name})
class EveAICrewAICrew(Crew):
@@ -89,12 +89,10 @@ class EveAICrewAICrew(Crew):
        super().__init__(**kwargs)
        self.specialist = specialist
        self.name = name
-       self.specialist.log_tuning("Initializing EveAICrewAICrew", {"name": self.name})
-       self.specialist.update_progress("EveAI Crew Initialisation", {"name": self.name})
    def kickoff(
            self,
            inputs: Optional[Dict[str, Any]] = None,
    ) -> CrewOutput:
        with current_event.create_span(f"Crew {self.name} kickoff"):
            start_time = time.time()
@@ -111,6 +109,26 @@ class EveAICrewAICrew(Crew):
            return results
async def kickoff_async(
self,
inputs: Optional[Dict[str, Any]] = None,
) -> CrewOutput:
async with current_event.create_span_async(f"Crew {self.name} kickoff"):
start_time = time.time()
results = await super().kickoff_async(inputs)
end_time = time.time()
metrics = {
"total_tokens": self.usage_metrics.total_tokens,
"prompt_tokens": self.usage_metrics.prompt_tokens,
"completion_tokens": self.usage_metrics.completion_tokens,
"time_elapsed": end_time - start_time,
"interaction_type": "Crew Execution"
}
current_event.log_llm_metrics(metrics)
return results
class EveAICrewAIFlow(Flow):
    specialist: Any = Field(default=None, exclude=True)
    name: str = Field(default=None, exclude=True)
@@ -123,10 +141,14 @@ class EveAICrewAIFlow(Flow):
self.specialist.log_tuning("Initializing EveAICrewAIFlow", {"name": self.name}) self.specialist.log_tuning("Initializing EveAICrewAIFlow", {"name": self.name})
self.specialist.update_progress("EveAI Flow Initialisation", {"name": self.name}) self.specialist.update_progress("EveAI Flow Initialisation", {"name": self.name})
    def kickoff(self, inputs=None):
        result = asyncio.run(self.execute_async(inputs=inputs))
        return result

    @abstractmethod
    async def execute_async(self, inputs=None):
        raise NotImplementedError
class EveAIFlowState(BaseModel):
    """Base class for all EveAI flow states"""
    pass
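The kickoff/execute_async pair is a sync-to-async bridge: existing callers such as the specialist executor keep calling kickoff, while subclasses implement the coroutine. One caveat: asyncio.run creates a fresh event loop and raises if one is already running in the calling thread, so the pattern assumes kickoff is only invoked from synchronous contexts such as the Celery tasks below. A minimal standalone sketch of the pattern (not the CrewAI classes):

import asyncio
from abc import ABC, abstractmethod

class SyncAsyncBridge(ABC):
    def kickoff(self, inputs=None):
        # Bridge: run the async body to completion on a fresh event loop
        return asyncio.run(self.execute_async(inputs=inputs))

    @abstractmethod
    async def execute_async(self, inputs=None):
        raise NotImplementedError

class EchoFlow(SyncAsyncBridge):
    async def execute_async(self, inputs=None):
        await asyncio.sleep(0)     # stand-in for awaited crew kickoffs
        return inputs

print(EchoFlow().kickoff({"question": "ping"}))  # {'question': 'ping'}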

View File

@@ -90,6 +90,7 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
    def _config_task_agents(self):
        """Configure the task agents by adding task-agent combinations. Use _add_task_agent()
        """
        raise NotImplementedError
    @property
    def task_agents(self) -> Dict[str, str]:
@@ -103,6 +104,7 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
    @abstractmethod
    def _config_pydantic_outputs(self):
        """Configure the task pydantic outputs by adding task-output combinations. Use _add_pydantic_output()"""
        raise NotImplementedError

    @property
    def task_pydantic_outputs(self):
@@ -203,6 +205,7 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
    def _instantiate_specialist(self):
        """Instantiate a crew (or flow) to set up the complete specialist, using the assets (agents, tasks, tools).
        The assets can be retrieved using their type name in lower case, e.g. rag_agent"""
        raise NotImplementedError

    def _detail_question(self, language: str, question: str) -> str:
        """Detail question based on conversation history"""
@@ -211,9 +214,7 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
        # Get LLM and template
        llm = self.model_variables.get_llm(temperature=0.3)
        template = cache_manager.prompts_config_cache.get_config('history').get('content', '')
-       current_app.logger.debug(f"History Template: {template}")
        language_template = create_language_template(template, language)
-       current_app.logger.debug(f"History Language Template: {template}")

        # Create prompt
        history_prompt = ChatPromptTemplate.from_template(language_template)
@@ -226,7 +227,6 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
        )

        # Execute chain
-       current_app.logger.debug(f"Formatted History: {self.formatted_history}")
        detailed_question = chain.invoke({
            "history": self.formatted_history,
            "question": question
@@ -254,7 +254,6 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
"num_retrievers": len(self.retrievers), "num_retrievers": len(self.retrievers),
"all arguments": arguments.model_dump(), "all arguments": arguments.model_dump(),
}) })
current_app.logger.debug(f"Retrieving context from arguments: {arguments}")
original_query = arguments.query original_query = arguments.query
detailed_query = self._detail_question(arguments.language, original_query) detailed_query = self._detail_question(arguments.language, original_query)
@@ -289,7 +288,6 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
            retriever_args = RetrieverArguments(**current_retriever_args)
            # Each retriever gets its own specific arguments
-           current_app.logger.debug(f"Retrieving context {retriever_id} with arguments {retriever_args}")
            retriever_result = retriever.retrieve(retriever_args)
            all_context.extend(retriever_result)
@@ -326,7 +324,7 @@ class CrewAIBaseSpecialistExecutor(BaseSpecialistExecutor):
    @abstractmethod
    def execute(self, arguments: SpecialistArguments, formatted_context: str, citations: List[int]) -> SpecialistResult:
-       pass
+       raise NotImplementedError

    def execute_specialist(self, arguments: SpecialistArguments) -> SpecialistResult:
        # Detail the incoming query

View File

@@ -226,26 +226,39 @@ def execute_specialist(self, tenant_id: int, specialist_id: int, arguments: Dict
    task_id = self.request.id
    ept = ExecutionProgressTracker()
    ept.send_update(task_id, "EveAI Specialist Started", {})
with BusinessEvent("Execute Specialist", tenant_id=tenant_id, chat_session_id=session_id) as event:
# Prepare context
try:
# Retrieve the tenant
tenant = Tenant.query.get(tenant_id)
if not tenant:
raise Exception(f'Tenant {tenant_id} not found.')
# Switch to correct database schema
Database(str(tenant_id)).switch_schema()
# Get specialist from database
specialist = Specialist.query.get_or_404(specialist_id)
except Exception as e:
ept.send_update(task_id, "EveAI Specialist Error", {'Error': str(e)})
current_app.logger.error(f'execute_specialist: Error executing specialist: {e}')
raise
with BusinessEvent("Execute Specialist",
tenant_id=tenant_id,
chat_session_id=session_id,
specialist_id=specialist_id,
specialist_type=specialist.type,
specialist_type_version=specialist.type_version) as event:
        current_app.logger.info(
            f'execute_specialist: Processing request for tenant {tenant_id} using specialist {specialist_id}')
        try:
-           # Retrieve the tenant
-           tenant = Tenant.query.get(tenant_id)
-           if not tenant:
-               raise Exception(f'Tenant {tenant_id} not found.')
-           # Switch to correct database schema
-           Database(tenant_id).switch_schema()
            # Ensure we have a session
            cached_session = cache_manager.chat_session_cache.get_cached_session(
                session_id,
                create_params={'timezone': user_timezone}
            )
-           # Get specialist from database
-           specialist = Specialist.query.get_or_404(specialist_id)
            # Prepare complete arguments
            try:

View File

@@ -88,6 +88,9 @@ def persist_business_events(log_entries):
            parent_span_id=entry.pop('parent_span_id', None),
            document_version_id=entry.pop('document_version_id', None),
            document_version_file_size=entry.pop('document_version_file_size', None),
specialist_id=entry.pop('specialist_id', None),
specialist_type=entry.pop('specialist_type', None),
specialist_type_version=entry.pop('specialist_type_version', None),
            chat_session_id=entry.pop('chat_session_id', None),
            interaction_id=entry.pop('interaction_id', None),
            environment=entry.pop('environment', None),

View File

@@ -40,7 +40,7 @@ class BaseProcessor(ABC):
    @abstractmethod
    def process(self):
-       pass
+       raise NotImplementedError

    @property
    def configuration(self):

View File

@@ -0,0 +1,40 @@
"""Adding specialist information to BusinessEventLog
Revision ID: 03a1e7633c01
Revises: 4d2842d9c1d0
Create Date: 2025-03-24 14:28:57.200173
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '03a1e7633c01'
down_revision = '4d2842d9c1d0'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('business_event_log', schema=None) as batch_op:
batch_op.add_column(sa.Column('specialist_id', sa.Integer(), nullable=True))
batch_op.add_column(sa.Column('specialist_type', sa.String(length=50), nullable=True))
batch_op.add_column(sa.Column('specialist_type_version', sa.String(length=20), nullable=True))
batch_op.drop_constraint('business_event_log_license_usage_id_fkey', type_='foreignkey')
batch_op.create_foreign_key(None, 'license_usage', ['license_usage_id'], ['id'], referent_schema='public')
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('business_event_log', schema=None) as batch_op:
batch_op.drop_constraint(None, type_='foreignkey')
batch_op.create_foreign_key('business_event_log_license_usage_id_fkey', 'license_usage', ['license_usage_id'], ['id'])
batch_op.drop_column('specialist_type_version')
batch_op.drop_column('specialist_type')
batch_op.drop_column('specialist_id')
# ### end Alembic commands ###

View File

@@ -70,19 +70,17 @@ pillow~=10.4.0
pdfplumber~=0.11.4
PyPDF2~=3.0.1
flask-restx~=1.3.0
-prometheus-flask-exporter~=0.23.1
flask-healthz~=1.0.1
langsmith~=0.1.121
anthropic~=0.34.2
-prometheus-client~=0.20.0
+prometheus-client~=0.21.1
+prometheus-flask-exporter~=0.23.2
flower~=2.0.1
psutil~=6.0.0
celery-redbeat~=2.2.0
WTForms-SQLAlchemy~=0.4.1
packaging~=24.1
typing_extensions~=4.12.2
-prometheus_flask_exporter~=0.23.1
-prometheus_client~=0.20.0
babel~=2.16.0
dogpile.cache~=1.3.3
python-docx~=1.1.2
@@ -90,4 +88,5 @@ crewai~=0.108.0
sseclient~=0.0.27
termcolor~=2.5.0
mistral-common~=1.5.3
mistralai~=1.5.0
+contextvars~=2.4

scripts/reload-prometheus.sh Executable file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
# Check if the service name is provided
if [ -z "$1" ]; then
echo "Usage: $0 <service_name> (usually 'prometheus')"
exit 1
fi
SERVICE_NAME=$1
# Get the container ID of the service
CONTAINER_ID=$(docker-compose ps -q $SERVICE_NAME)
# Check if the container ID is found
if [ -z "$CONTAINER_ID" ]; then
echo "Service $SERVICE_NAME not found or not running."
exit 1
fi
# Reload Prometheus configuration
echo "Reloading Prometheus configuration..."
curl -X POST http://localhost:9090/-/reload
# Output the result
if [ $? -eq 0 ]; then
echo "Prometheus configuration reloaded successfully."
else
echo "Failed to reload Prometheus configuration."
exit 1
fi