- Add Prometheus and Grafana services in development
- Add Prometheus metrics to the business events
- Ensure asynchronous behaviour of CrewAI specialists
- Adapt business events to work in mixed synchronous / asynchronous contexts
- Extend business events with specialist information
- Start adding a Grafana dashboard (TBC)
@@ -5,7 +5,7 @@ from typing import List
 class EveAIEmbeddings:
     @abstractmethod
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        pass
+        raise NotImplementedError

     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]
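
With `pass` replaced by `raise NotImplementedError`, an embedder that forgets to override `embed_documents` now fails loudly instead of silently returning `None`. A minimal self-contained sketch (the subclass and its values are hypothetical; the base class is mirrored from the hunk above):

    from abc import abstractmethod
    from typing import List

    class EveAIEmbeddings:  # mirrored from the diff for a runnable example
        @abstractmethod
        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            raise NotImplementedError

        def embed_query(self, text: str) -> List[float]:
            return self.embed_documents([text])[0]

    class ZeroEmbeddings(EveAIEmbeddings):
        # Hypothetical subclass: only embed_documents needs overriding,
        # embed_query is inherited from the base class.
        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return [[0.0, 0.0, 0.0] for _ in texts]

    assert ZeroEmbeddings().embed_query("hello") == [0.0, 0.0, 0.0]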
@@ -15,6 +15,7 @@ from .langchain.templates.template_manager import TemplateManager
 from .utils.cache.eveai_cache_manager import EveAICacheManager
 from .utils.simple_encryption import SimpleEncryption
 from .utils.minio_utils import MinioClient
+from .utils.performance_monitoring import EveAIMetrics


 # Create extensions
@@ -33,6 +34,6 @@ simple_encryption = SimpleEncryption()
 minio_client = MinioClient()
 metrics = PrometheusMetrics.for_app_factory()
 template_manager = TemplateManager()
 # Caching classes
 cache_manager = EveAICacheManager()
+eveai_metrics = EveAIMetrics()
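
`eveai_metrics` follows the usual Flask extension pattern: created at import time, bound to the app later. A hedged sketch of that wiring (the factory name is assumed; `PROMETHEUS_PORT` comes from the new performance_monitoring module further down):

    from flask import Flask

    def create_app() -> Flask:  # hypothetical factory name
        app = Flask(__name__)
        app.config.setdefault('PROMETHEUS_PORT', 8000)
        # metrics.init_app(app)        # PrometheusMetrics created via for_app_factory()
        # eveai_metrics.init_app(app)  # starts the standalone metrics HTTP server
        return app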
@@ -15,6 +15,9 @@ class BusinessEventLog(db.Model):
     parent_span_id = db.Column(db.String(50))
     document_version_id = db.Column(db.Integer)
     document_version_file_size = db.Column(db.Float)
+    specialist_id = db.Column(db.Integer)
+    specialist_type = db.Column(db.String(50))
+    specialist_type_version = db.Column(db.String(20))
     chat_session_id = db.Column(db.String(50))
     interaction_id = db.Column(db.Integer)
     environment = db.Column(db.String(20))
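
The three new columns make specialist activity queryable next to the existing trace fields. A hedged sketch, assuming the usual Flask-SQLAlchemy query API and an active app context ('crewai' and 'production' are illustrative values, not taken from this diff):

    specialist_events = (
        BusinessEventLog.query
        .filter(BusinessEventLog.specialist_type == 'crewai')
        .filter(BusinessEventLog.environment == 'production')
        .all()
    )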
@@ -1,16 +1,81 @@
 import os
 import time
 import uuid
-from contextlib import contextmanager
+from contextlib import contextmanager, asynccontextmanager
 from datetime import datetime
 from typing import Dict, Any, Optional, List
+from datetime import datetime as dt, timezone as tz
 import logging
+from prometheus_client import Counter, Histogram, Gauge, Summary

 from .business_event_context import BusinessEventContext
 from common.models.entitlements import BusinessEventLog
 from common.extensions import db
 from .celery_utils import current_celery
+from common.utils.performance_monitoring import EveAIMetrics
+
+# Standard duration buckets for all histograms
+DURATION_BUCKETS = EveAIMetrics.get_standard_buckets()
+
+# Prometheus metrics for business events
+TRACE_COUNTER = Counter(
+    'eveai_business_events_total',
+    'Total number of business events triggered',
+    ['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
+)
+
+TRACE_DURATION = Histogram(
+    'eveai_business_events_duration_seconds',
+    'Duration of business events in seconds',
+    ['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version'],
+    buckets=DURATION_BUCKETS
+)
+
+CONCURRENT_TRACES = Gauge(
+    'eveai_business_events_concurrent',
+    'Number of concurrent business events',
+    ['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
+)
+
+SPAN_COUNTER = Counter(
+    'eveai_business_spans_total',
+    'Total number of spans within business events',
+    ['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version']
+)
+
+SPAN_DURATION = Histogram(
+    'eveai_business_spans_duration_seconds',
+    'Duration of spans within business events in seconds',
+    ['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version'],
+    buckets=DURATION_BUCKETS
+)
+
+CONCURRENT_SPANS = Gauge(
+    'eveai_business_spans_concurrent',
+    'Number of concurrent spans within business events',
+    ['tenant_id', 'event_type', 'activity_name', 'specialist_id', 'specialist_type', 'specialist_type_version']
+)
+
+# LLM Usage metrics
+LLM_TOKENS_COUNTER = Counter(
+    'eveai_llm_tokens_total',
+    'Total number of tokens used in LLM calls',
+    ['tenant_id', 'event_type', 'interaction_type', 'token_type', 'specialist_id', 'specialist_type',
+     'specialist_type_version']
+)
+
+LLM_DURATION = Histogram(
+    'eveai_llm_duration_seconds',
+    'Duration of LLM API calls in seconds',
+    ['tenant_id', 'event_type', 'interaction_type', 'specialist_id', 'specialist_type', 'specialist_type_version'],
+    buckets=DURATION_BUCKETS
+)
+
+LLM_CALLS_COUNTER = Counter(
+    'eveai_llm_calls_total',
+    'Total number of LLM API calls',
+    ['tenant_id', 'event_type', 'interaction_type', 'specialist_id', 'specialist_type', 'specialist_type_version']
+)


 class BusinessEvent:
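
Each distinct combination of label values creates its own time series, which is why the label set is kept small and specialist labels fall back to empty strings further down. A self-contained sketch of how one of these counters surfaces to a scraper (a private registry and example values are used so nothing collides with the real module-level metrics):

    from prometheus_client import CollectorRegistry, Counter, generate_latest

    registry = CollectorRegistry()  # private registry for the example
    demo_counter = Counter(
        'eveai_business_events_total',
        'Total number of business events triggered',
        ['tenant_id', 'event_type', 'specialist_id', 'specialist_type', 'specialist_type_version'],
        registry=registry,
    )
    demo_counter.labels(
        tenant_id='42', event_type='document_ingest',  # illustrative values
        specialist_id='', specialist_type='', specialist_type_version='',
    ).inc()
    print(generate_latest(registry).decode())
    # eveai_business_events_total{event_type="document_ingest",...} 1.0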
@@ -29,6 +94,9 @@ class BusinessEvent:
         self.document_version_file_size = kwargs.get('document_version_file_size')
         self.chat_session_id = kwargs.get('chat_session_id')
         self.interaction_id = kwargs.get('interaction_id')
+        self.specialist_id = kwargs.get('specialist_id')
+        self.specialist_type = kwargs.get('specialist_type')
+        self.specialist_type_version = kwargs.get('specialist_type_version')
         self.environment = os.environ.get("FLASK_ENV", "development")
         self.span_counter = 0
         self.spans = []
@@ -42,9 +110,42 @@ class BusinessEvent:
         }
         self._log_buffer = []

+        # Prometheus label values must be strings
+        self.tenant_id_str = str(self.tenant_id)
+        self.specialist_id_str = str(self.specialist_id) if self.specialist_id else ""
+        self.specialist_type_str = str(self.specialist_type) if self.specialist_type else ""
+        self.specialist_type_version_str = str(self.specialist_type_version) if self.specialist_type_version else ""
+
+        # Increment concurrent events gauge when initialized
+        CONCURRENT_TRACES.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
+        # Increment trace counter
+        TRACE_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
     def update_attribute(self, attribute: str, value: any):
         if hasattr(self, attribute):
             setattr(self, attribute, value)
+            # Update string versions for Prometheus labels if needed
+            if attribute == 'specialist_id':
+                self.specialist_id_str = str(value) if value else ""
+            elif attribute == 'specialist_type':
+                self.specialist_type_str = str(value) if value else ""
+            elif attribute == 'specialist_type_version':
+                self.specialist_type_version_str = str(value) if value else ""
+            elif attribute == 'tenant_id':
+                self.tenant_id_str = str(value)
         else:
             raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{attribute}'")
@@ -56,6 +157,60 @@ class BusinessEvent:
         self.llm_metrics['call_count'] += 1
         self.llm_metrics['interaction_type'] = metrics['interaction_type']

+        # Track in Prometheus metrics
+        interaction_type = metrics['interaction_type']
+
+        # Track token usage
+        LLM_TOKENS_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            interaction_type=interaction_type,
+            token_type='total',
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc(metrics['total_tokens'])
+
+        LLM_TOKENS_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            interaction_type=interaction_type,
+            token_type='prompt',
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc(metrics['prompt_tokens'])
+
+        LLM_TOKENS_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            interaction_type=interaction_type,
+            token_type='completion',
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc(metrics['completion_tokens'])
+
+        # Track duration
+        LLM_DURATION.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            interaction_type=interaction_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).observe(metrics['time_elapsed'])
+
+        # Track call count
+        LLM_CALLS_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            interaction_type=interaction_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
     def reset_llm_metrics(self):
         self.llm_metrics['total_tokens'] = 0
         self.llm_metrics['prompt_tokens'] = 0
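
A hedged sketch of a call site (an LLM callback, say), reaching the event through the `current_event` proxy from business_event_context; the dict keys are exactly the ones `add_llm_metrics` consumes above, the values are illustrative:

    metrics = {
        'interaction_type': 'chat',  # illustrative value
        'total_tokens': 1234,
        'prompt_tokens': 1000,
        'completion_tokens': 234,
        'time_elapsed': 2.7,         # seconds spent in the LLM call
    }
    current_event.add_llm_metrics(metrics)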
@@ -86,6 +241,26 @@ class BusinessEvent:
         # Track start time for the span
         span_start_time = time.time()

+        # Increment span metrics - using span_name as activity_name for metrics
+        SPAN_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            activity_name=span_name,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
+        # Increment concurrent spans gauge
+        CONCURRENT_SPANS.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            activity_name=span_name,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
         self.log(f"Start")

         try:
@@ -94,6 +269,104 @@ class BusinessEvent:
             # Calculate total time for this span
             span_total_time = time.time() - span_start_time

+            # Observe span duration
+            SPAN_DURATION.labels(
+                tenant_id=self.tenant_id_str,
+                event_type=self.event_type,
+                activity_name=span_name,
+                specialist_id=self.specialist_id_str,
+                specialist_type=self.specialist_type_str,
+                specialist_type_version=self.specialist_type_version_str
+            ).observe(span_total_time)
+
+            # Decrement concurrent spans gauge
+            CONCURRENT_SPANS.labels(
+                tenant_id=self.tenant_id_str,
+                event_type=self.event_type,
+                activity_name=span_name,
+                specialist_id=self.specialist_id_str,
+                specialist_type=self.specialist_type_str,
+                specialist_type_version=self.specialist_type_version_str
+            ).dec()
+
             if self.llm_metrics['call_count'] > 0:
                 self.log_final_metrics()
                 self.reset_llm_metrics()
             self.log(f"End", extra_fields={'span_duration': span_total_time})
             # Restore the previous span info
             if self.spans:
                 self.span_id, self.span_name, self.parent_span_id = self.spans.pop()
             else:
                 self.span_id = None
                 self.span_name = None
                 self.parent_span_id = None

+    @asynccontextmanager
+    async def create_span_async(self, span_name: str):
+        """Async version of create_span using async context manager"""
+        parent_span_id = self.span_id
+        self.span_counter += 1
+        new_span_id = str(uuid.uuid4())
+
+        # Save the current span info
+        self.spans.append((self.span_id, self.span_name, self.parent_span_id))
+
+        # Set the new span info
+        self.span_id = new_span_id
+        self.span_name = span_name
+        self.parent_span_id = parent_span_id
+
+        # Track start time for the span
+        span_start_time = time.time()
+
+        # Increment span metrics - using span_name as activity_name for metrics
+        SPAN_COUNTER.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            activity_name=span_name,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
+        # Increment concurrent spans gauge
+        CONCURRENT_SPANS.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            activity_name=span_name,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).inc()
+
+        self.log(f"Start")
+
+        try:
+            yield
+        finally:
+            # Calculate total time for this span
+            span_total_time = time.time() - span_start_time
+
+            # Observe span duration
+            SPAN_DURATION.labels(
+                tenant_id=self.tenant_id_str,
+                event_type=self.event_type,
+                activity_name=span_name,
+                specialist_id=self.specialist_id_str,
+                specialist_type=self.specialist_type_str,
+                specialist_type_version=self.specialist_type_version_str
+            ).observe(span_total_time)
+
+            # Decrement concurrent spans gauge
+            CONCURRENT_SPANS.labels(
+                tenant_id=self.tenant_id_str,
+                event_type=self.event_type,
+                activity_name=span_name,
+                specialist_id=self.specialist_id_str,
+                specialist_type=self.specialist_type_str,
+                specialist_type_version=self.specialist_type_version_str
+            ).dec()
+
+            if self.llm_metrics['call_count'] > 0:
+                self.log_final_metrics()
+                self.reset_llm_metrics()
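
Usage sketch for the async variant; `event` is assumed to be an active BusinessEvent and the span names are illustrative:

    async def handle_request(event):
        async with event.create_span_async("retrieve_context"):
            ...  # awaited work is timed into SPAN_DURATION
            async with event.create_span_async("rank_results"):
                ...  # nested: parent_span_id points at "retrieve_context"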
@@ -119,6 +392,9 @@ class BusinessEvent:
             'document_version_file_size': self.document_version_file_size,
             'chat_session_id': self.chat_session_id,
             'interaction_id': self.interaction_id,
+            'specialist_id': self.specialist_id,
+            'specialist_type': self.specialist_type,
+            'specialist_type_version': self.specialist_type_version,
             'environment': self.environment,
             'message': message,
         }
@@ -149,6 +425,9 @@ class BusinessEvent:
             'document_version_file_size': self.document_version_file_size,
             'chat_session_id': self.chat_session_id,
             'interaction_id': self.interaction_id,
+            'specialist_id': self.specialist_id,
+            'specialist_type': self.specialist_type,
+            'specialist_type_version': self.specialist_type_version,
             'environment': self.environment,
             'llm_metrics_total_tokens': metrics['total_tokens'],
             'llm_metrics_prompt_tokens': metrics['prompt_tokens'],
@@ -174,6 +453,9 @@ class BusinessEvent:
             'document_version_file_size': self.document_version_file_size,
             'chat_session_id': self.chat_session_id,
             'interaction_id': self.interaction_id,
+            'specialist_id': self.specialist_id,
+            'specialist_type': self.specialist_type,
+            'specialist_type_version': self.specialist_type_version,
             'environment': self.environment,
             'llm_metrics_total_tokens': self.llm_metrics['total_tokens'],
             'llm_metrics_prompt_tokens': self.llm_metrics['prompt_tokens'],
@@ -203,6 +485,9 @@ class BusinessEvent:
             document_version_file_size=entry.pop('document_version_file_size', None),
             chat_session_id=entry.pop('chat_session_id', None),
             interaction_id=entry.pop('interaction_id', None),
+            specialist_id=entry.pop('specialist_id', None),
+            specialist_type=entry.pop('specialist_type', None),
+            specialist_type_version=entry.pop('specialist_type_version', None),
             environment=entry.pop('environment', None),
             llm_metrics_total_tokens=entry.pop('llm_metrics_total_tokens', None),
             llm_metrics_prompt_tokens=entry.pop('llm_metrics_prompt_tokens', None),
@@ -249,6 +534,24 @@ class BusinessEvent:
     def __exit__(self, exc_type, exc_val, exc_tb):
         trace_total_time = time.time() - self.trace_start_time

+        # Record trace duration
+        TRACE_DURATION.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).observe(trace_total_time)
+
+        # Decrement concurrent traces gauge
+        CONCURRENT_TRACES.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).dec()
+
         if self.llm_metrics['call_count'] > 0:
             self.log_final_metrics()
             self.reset_llm_metrics()
@@ -256,3 +559,37 @@ class BusinessEvent:
         self.log(f'Ending Trace for {self.event_type}', extra_fields={'trace_duration': trace_total_time})
         self._flush_log_buffer()
         return BusinessEventContext(self).__exit__(exc_type, exc_val, exc_tb)
+
+    async def __aenter__(self):
+        self.trace_start_time = time.time()
+        self.log(f'Starting Trace for {self.event_type}')
+        return await BusinessEventContext(self).__aenter__()
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        trace_total_time = time.time() - self.trace_start_time
+
+        # Record trace duration
+        TRACE_DURATION.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).observe(trace_total_time)
+
+        # Decrement concurrent traces gauge
+        CONCURRENT_TRACES.labels(
+            tenant_id=self.tenant_id_str,
+            event_type=self.event_type,
+            specialist_id=self.specialist_id_str,
+            specialist_type=self.specialist_type_str,
+            specialist_type_version=self.specialist_type_version_str
+        ).dec()
+
+        if self.llm_metrics['call_count'] > 0:
+            self.log_final_metrics()
+            self.reset_llm_metrics()
+
+        self.log(f'Ending Trace for {self.event_type}', extra_fields={'trace_duration': trace_total_time})
+        self._flush_log_buffer()
+        return await BusinessEventContext(self).__aexit__(exc_type, exc_val, exc_tb)
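
With `__aenter__`/`__aexit__` in place, the same event class works in both synchronous and asynchronous code. A hedged sketch (keyword arguments follow the attributes above; the exact constructor signature is not shown in this diff):

    import asyncio

    async def run_specialist():
        async with BusinessEvent(tenant_id=42, event_type='specialist_run',
                                 specialist_id=7, specialist_type='crewai',
                                 specialist_type_version='1.0') as event:
            async with event.create_span_async("kickoff"):
                await asyncio.sleep(0)  # stand-in for real specialist work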
@@ -1,9 +1,22 @@
 from werkzeug.local import LocalProxy, LocalStack
+import asyncio
+from contextvars import ContextVar
+import contextvars

+# Keep existing stack for backward compatibility
 _business_event_stack = LocalStack()

+# Add contextvar for async support
+_business_event_contextvar = ContextVar('business_event', default=None)
+

 def _get_current_event():
+    # Try contextvar first (for async)
+    event = _business_event_contextvar.get()
+    if event is not None:
+        return event
+
+    # Fall back to the stack-based approach (for sync)
     top = _business_event_stack.top
     if top is None:
         raise RuntimeError("No business event context found. Are you sure you're in a business event?")
@@ -16,10 +29,24 @@ current_event = LocalProxy(_get_current_event)
 class BusinessEventContext:
     def __init__(self, event):
         self.event = event
+        self._token = None  # For storing contextvar token

     def __enter__(self):
         _business_event_stack.push(self.event)
+        self._token = _business_event_contextvar.set(self.event)
         return self.event

     def __exit__(self, exc_type, exc_val, exc_tb):
         _business_event_stack.pop()
+        if self._token is not None:
+            _business_event_contextvar.reset(self._token)
+
+    async def __aenter__(self):
+        _business_event_stack.push(self.event)
+        self._token = _business_event_contextvar.set(self.event)
+        return self.event
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        _business_event_stack.pop()
+        if self._token is not None:
+            _business_event_contextvar.reset(self._token)
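
Why the ContextVar matters: asyncio gives every task a copy of the current context, so a value set before a task starts is visible inside it, whereas the thread-bound LocalStack does not follow tasks. A self-contained model of that behaviour:

    import asyncio
    from contextvars import ContextVar

    _event_var = ContextVar('business_event', default=None)

    async def worker():
        return _event_var.get()  # sees the value set by its creator

    async def main():
        token = _event_var.set('event-123')
        try:
            assert await asyncio.gather(worker(), worker()) == ['event-123'] * 2
        finally:
            _event_var.reset(token)

    asyncio.run(main())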
common/utils/cache/base.py (vendored)
@@ -59,7 +59,7 @@ class CacheHandler(Generic[T]):
         Returns:
             A serializable format of the instance.
         """
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _from_cache_data(self, data: Any, **kwargs) -> T:
@@ -73,7 +73,7 @@ class CacheHandler(Generic[T]):
         Returns:
             The data in its usable format.
         """
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _should_cache(self, value: T) -> bool:
@@ -86,7 +86,7 @@ class CacheHandler(Generic[T]):
         Returns:
             True if the value should be cached, False otherwise.
         """
-        pass
+        raise NotImplementedError

     def configure_keys(self, *components: str):
         """
common/utils/performance_monitoring.py (new file, 59 lines)
@@ -0,0 +1,59 @@
+import time
+import threading
+from contextlib import contextmanager
+from functools import wraps
+from prometheus_client import Counter, Histogram, Summary, start_http_server, Gauge
+from flask import current_app, g, request, Flask
+
+
+class EveAIMetrics:
+    """
+    Central class for Prometheus metrics infrastructure.
+    This class initializes the Prometheus HTTP server and provides
+    shared functionality for metrics across components.
+
+    Component-specific metrics should be defined in their respective modules.
+    """
+
+    def __init__(self, app: Flask = None):
+        self.app = app
+        self._metrics_server_started = False
+        if app is not None:
+            self.init_app(app)
+
+    def init_app(self, app: Flask):
+        """Initialize metrics with Flask app and start Prometheus server"""
+        self.app = app
+        self._start_metrics_server()
+
+    def _start_metrics_server(self):
+        """Start the Prometheus metrics HTTP server if not already running"""
+        if not self._metrics_server_started:
+            try:
+                metrics_port = self.app.config.get('PROMETHEUS_PORT', 8000)
+                start_http_server(metrics_port)
+                self.app.logger.info(f"Prometheus metrics server started on port {metrics_port}")
+                self._metrics_server_started = True
+            except Exception as e:
+                self.app.logger.error(f"Failed to start metrics server: {e}")
+
+    @staticmethod
+    def get_standard_buckets():
+        """
+        Return the standard duration buckets for histogram metrics.
+        Components should use these for consistency across the system.
+        """
+        return [0.1, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 240, 360, float('inf')]
+
+    @staticmethod
+    def sanitize_label_values(labels_dict):
+        """
+        Convert all label values to strings as required by Prometheus.
+
+        Args:
+            labels_dict: Dictionary of label name to label value
+
+        Returns:
+            Dictionary with all values converted to strings
+        """
+        return {k: str(v) if v is not None else "" for k, v in labels_dict.items()}
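
Sketch of a component defining its own metric on top of these helpers, per the docstring's guidance; the metric name and label values are illustrative, and a private registry keeps the example isolated:

    from prometheus_client import CollectorRegistry, Histogram
    from common.utils.performance_monitoring import EveAIMetrics

    registry = CollectorRegistry()
    EXAMPLE_DURATION = Histogram(
        'eveai_example_duration_seconds',
        'Duration of an example operation in seconds',
        ['tenant_id', 'operation'],
        buckets=EveAIMetrics.get_standard_buckets(),
        registry=registry,
    )

    labels = EveAIMetrics.sanitize_label_values({'tenant_id': 42, 'operation': None})
    EXAMPLE_DURATION.labels(**labels).observe(0.37)  # -> {'tenant_id': '42', 'operation': ''}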