Merge branch 'feature/Activate_Pushgateway_Scraping' into develop
@@ -559,12 +559,24 @@ class BusinessEvent:
         self._log_buffer = []
 
     def _push_to_gateway(self):
-        # Push metrics to the gateway
+        # Push metrics to the gateway with grouping key to avoid overwrites across pods/processes
         try:
+            # Determine grouping labels
+            pod_name = current_app.config.get('POD_NAME', current_app.config.get('COMPONENT_NAME', 'dev'))
+            pod_namespace = current_app.config.get('POD_NAMESPACE', current_app.config.get('FLASK_ENV', 'dev'))
+            worker_id = str(os.getpid())
+
+            grouping_key = {
+                'instance': pod_name,
+                'namespace': pod_namespace,
+                'process': worker_id,
+            }
+
             push_to_gateway(
                 current_app.config['PUSH_GATEWAY_URL'],
                 job=current_app.config['COMPONENT_NAME'],
-                registry=REGISTRY
+                registry=REGISTRY,
+                grouping_key=grouping_key,
             )
         except Exception as e:
             current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
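For illustration, a rough sketch of how the grouping key shapes the Pushgateway push URL; prometheus_client builds this path internally, and the job/instance/namespace values below are hypothetical examples, not taken from this diff:

```python
# Illustrative only: the Pushgateway groups pushed metrics by the URL path
# /metrics/job/<job>/<label>/<value>/..., so distinct instance/namespace/process
# values create distinct groups instead of overwriting each other.
def push_path(job: str, grouping_key: dict) -> str:
    parts = [f"/metrics/job/{job}"]
    for label, value in grouping_key.items():
        parts.append(f"/{label}/{value}")
    return "".join(parts)

print(push_path("eveai-api", {"instance": "eveai-api-6d9f", "namespace": "eveai-staging", "process": "42"}))
# -> /metrics/job/eveai-api/instance/eveai-api-6d9f/namespace/eveai-staging/process/42
```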
@@ -110,28 +110,39 @@ def get_pagination_html(pagination, endpoint, **kwargs):
 def asset_url(logical_path: str):
     """
-    Resolve an asset logical path to a hashed URL using Parcel manifest when available.
-    Fallback to the original logical path under /static/ if manifest is missing.
+    Return a URL that respects STATIC_URL (CDN) when configured; otherwise serve from /static/.
     Examples:
-    - asset_url('dist/chat-client.js') -> '/static/dist/chat-client.abc123.js'
-    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css'
+    - asset_url('dist/chat-client.js') -> 'https://cdn/.../dist/chat-client.abc123.js' (when STATIC_URL set)
+    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css' (when STATIC_URL not set)
     """
     if not logical_path:
         return logical_path
     try:
         from common.utils.asset_manifest import resolve_asset
-        resolved = resolve_asset(logical_path)
-        if not resolved:
-            return f"/static/{logical_path.lstrip('/')}"
-        # If resolved is already an absolute URL starting with /static or http(s), return as is
-        if resolved.startswith('/static/') or resolved.startswith('http://') or resolved.startswith('https://'):
+        # Resolve logical to possibly hashed path
+        resolved = resolve_asset(logical_path) or logical_path
+
+        # If manifest returns an absolute URL, return as-is
+        if resolved.startswith('http://') or resolved.startswith('https://'):
             return resolved
-        # If it starts with 'dist/', prefix /static/
-        if resolved.startswith('dist/'):
-            return '/static/' + resolved
-        # Otherwise, best effort: ensure it lives under /static/
-        return '/static/' + resolved.lstrip('/')
+
+        # Normalize: strip any leading '/static/' and leading '/'
+        if resolved.startswith('/static/'):
+            rel = resolved[len('/static/'):]
+        else:
+            rel = resolved.lstrip('/')
+
+        # Build with STATIC_URL if configured
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        if static_base:
+            return f"{static_base}/{rel}"
+        # Fallback to app static
+        return f"/static/{rel}"
     except Exception:
-        return f"/static/{logical_path.lstrip('/')}"
+        # Conservative fallback also respecting STATIC_URL
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        rel = logical_path.lstrip('/')
+        return f"{static_base}/{rel}" if static_base else f"/static/{rel}"
 
 
 def register_filters(app):
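For context, a minimal sketch of what the manifest lookup behind resolve_asset might look like; the real common/utils/asset_manifest implementation is not part of this change, and the manifest file name/location below is assumed for illustration (the manifest contents appear in the hunk that follows):

```python
import json
from pathlib import Path
from typing import Optional

# Hypothetical sketch: map a logical asset path to its hashed build artifact.
MANIFEST_PATH = Path("static/parcel-manifest.json")  # assumed location, not taken from this diff

def resolve_asset(logical_path: str) -> Optional[str]:
    try:
        manifest = json.loads(MANIFEST_PATH.read_text())
    except OSError:
        return None
    # e.g. "dist/chat-client.js" -> "dist/chat-client.6bfbd765.js"
    return manifest.get(logical_path)
```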
@@ -439,6 +439,10 @@ class StagingConfig(Config):
     MINIO_SECRET_KEY = environ.get('MINIO_SECRET_KEY')
     MINIO_USE_HTTPS = True
 
+    # Push gateway grouping elements
+    pod_name = os.getenv('POD_NAME')
+    pod_namespace = os.getenv('POD_NAMESPACE')
+
 
 class ProdConfig(Config):
     DEVELOPMENT = False
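Side note as a sketch (not part of the diff): _push_to_gateway above reads these values via current_app.config.get('POD_NAME', ...), and Flask's from_object() only copies uppercase attributes into app.config, so values defined on a config class need uppercase names (or another loading mechanism) to be visible there. A minimal illustration, assuming the config class is loaded with app.config.from_object:

```python
import os
from flask import Flask

class StagingConfig:
    POD_NAME = os.getenv('POD_NAME')            # uppercase: picked up by from_object()
    POD_NAMESPACE = os.getenv('POD_NAMESPACE')  # uppercase: picked up by from_object()
    pod_name = os.getenv('POD_NAME')            # lowercase: ignored by from_object()

app = Flask(__name__)
app.config.from_object(StagingConfig)
print(app.config.get('POD_NAME'), 'pod_name' in app.config)  # value (or None), False
```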
@@ -1,6 +1,6 @@
 {
-  "dist/chat-client.js": "dist/chat-client.25888758.js",
-  "dist/chat-client.css": "dist/chat-client.eef0ef31.css",
+  "dist/chat-client.js": "dist/chat-client.6bfbd765.js",
+  "dist/chat-client.css": "dist/chat-client.33f904ba.css",
   "dist/main.js": "dist/main.f3dde0f6.js",
   "dist/main.css": "dist/main.c40e57ad.css"
 }
documentation/PUSHGATEWAY_GROUPING.md (new file, 79 lines)
@@ -0,0 +1,79 @@
# Pushgateway Grouping Keys (instance, namespace, process)

Goal: prevent metrics pushed by different Pods or worker processes from overwriting each other, while keeping Prometheus/Grafana queries simple.

Summary of decisions
- WORKER_ID source = OS process ID (PID)
- Always include namespace in grouping labels

What this changes
- Every push to Prometheus Pushgateway now includes a grouping_key with:
  - instance = POD_NAME (fallback to HOSTNAME, then "dev")
  - namespace = POD_NAMESPACE (fallback to ENVIRONMENT, then "dev")
  - process = WORKER_ID (fallback to current PID)
- Prometheus will expose these as exported_instance, exported_namespace, and exported_process on the scraped series.
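For illustration, a minimal sketch of the fallback order described above; the actual implementation in common/utils/business_event.py reads these values from the Flask app config rather than directly from the environment:

```python
import os

def build_grouping_key():
    """Illustrative sketch only: grouping labels with the documented fallbacks."""
    return {
        'instance': os.getenv('POD_NAME') or os.getenv('HOSTNAME') or 'dev',
        'namespace': os.getenv('POD_NAMESPACE') or os.getenv('ENVIRONMENT') or 'dev',
        'process': os.getenv('WORKER_ID') or str(os.getpid()),
    }
```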
Code changes (already implemented)
- common/utils/business_event.py
  - push_to_gateway(..., grouping_key={instance, namespace, process})
  - Safe fallbacks ensure dev/test (Podman) keeps working with no K8s-specific env vars.

Kubernetes manifests (already implemented)
- All Deployments that push metrics set env vars via the Downward API:
  - POD_NAME from metadata.name
  - POD_NAMESPACE from metadata.namespace
- Files updated:
  - scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
  - scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
  - scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml

No changes needed to secrets
- PUSH_GATEWAY_HOST/PORT remain provided via eveai-secrets; code composes PUSH_GATEWAY_URL internally.
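The exact URL composition lives in the app config and is not part of this diff; a hypothetical sketch of what it might look like, assuming plain HTTP and the in-cluster service name used in the verification steps below:

```python
import os

def compose_push_gateway_url():
    # Hypothetical sketch only: the real composition is internal to the app config.
    host = os.getenv('PUSH_GATEWAY_HOST', 'monitoring-pushgateway-prometheus-pushgateway.monitoring.svc')
    port = os.getenv('PUSH_GATEWAY_PORT', '9091')
    return f"http://{host}:{port}"
```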
How to verify
1) Pushgateway contains per-pod/process groups
   - Port-forward Pushgateway (namespace monitoring):
     - kubectl -n monitoring port-forward svc/monitoring-pushgateway-prometheus-pushgateway 9091:9091
   - Inspect:
     - curl -s http://127.0.0.1:9091/api/v1/metrics | jq '.data[].labels'
   - You should see labels including job (your service), instance (pod), namespace, process (pid); a scripted alternative is sketched after this list.

2) Prometheus shows the labels as exported_*
   - Port-forward Prometheus (namespace monitoring):
     - kubectl -n monitoring port-forward svc/monitoring-prometheus 9090:9090
   - Queries (label_values works as a Grafana variable query; in the Prometheus UI use e.g. count by (exported_instance) (eveai_llm_calls_total)):
     - label_values(eveai_llm_calls_total, exported_instance)
     - label_values(eveai_llm_calls_total, exported_namespace)
     - label_values(eveai_llm_calls_total, exported_process)
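A scripted equivalent of verification step 1, assuming the Pushgateway port-forward above is active; requests is used purely for illustration and is not implied to be a project dependency:

```python
import requests

# List Pushgateway groups and print their grouping labels via the forwarded port.
resp = requests.get("http://127.0.0.1:9091/api/v1/metrics", timeout=5)
resp.raise_for_status()
for group in resp.json().get("data", []):
    labels = group.get("labels", {})
    print(labels.get("job"), labels.get("instance"), labels.get("namespace"), labels.get("process"))
```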
PromQL query patterns
- Hide per-process detail by aggregating away exported_process:
  - sum without(exported_process) (rate(eveai_llm_calls_total[5m]))
- Service-level totals (hide instance and process):
  - sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
- Histogram example (p95 per service):
  - histogram_quantile(0.95, sum by (le, exported_job, exported_namespace) (rate(eveai_llm_duration_seconds_bucket[5m])))

Dev/Test (Podman) behavior
- No Kubernetes Downward API: POD_NAME/POD_NAMESPACE are not set.
- Fallbacks used by the code:
  - instance = HOSTNAME if available, else "dev"
  - namespace = ENVIRONMENT if available, else "dev"
  - process = current PID
- This guarantees no crashes and still avoids process-level overwrites.

Operational notes
- Cardinality: adding process creates more series (one per worker). This is required to avoid data loss when multiple workers push concurrently. Dashboards should aggregate away exported_process unless you need per-worker detail.
- Batch jobs (future): use the same grouping and consider delete_from_gateway on successful completion to remove stale groups for that job/instance/process (see the sketch below).
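A minimal sketch of such a cleanup step, assuming the same grouping labels are used as when pushing (delete_from_gateway is provided by prometheus_client):

```python
import os
from prometheus_client import delete_from_gateway

def cleanup_push_group(gateway_url: str, job: str):
    # Remove this job/instance/namespace/process group from the Pushgateway after a batch run.
    grouping_key = {
        'instance': os.getenv('POD_NAME') or os.getenv('HOSTNAME') or 'dev',
        'namespace': os.getenv('POD_NAMESPACE') or os.getenv('ENVIRONMENT') or 'dev',
        'process': str(os.getpid()),
    }
    delete_from_gateway(gateway_url, job=job, grouping_key=grouping_key)
```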
Troubleshooting
- If you still see overwriting:
  - Confirm that instance, namespace, and process all appear in Pushgateway JSON labels for each group.
  - Ensure that all pods set POD_NAME and POD_NAMESPACE (kubectl -n eveai-staging exec <pod> -- env | egrep "POD_NAME|POD_NAMESPACE").
  - Verify that your app processes run push_to_gateway through the shared business_event wrapper.

Change log reference
- Implemented on 2025-09-26 by adding grouping_key in the business_event push and env vars in the Deployments.
@@ -119,7 +119,7 @@ helm search repo prometheus-community/kube-prometheus-stack
 
 #### Create Monitoring Values File
 
-Create `scaleway/manifests/base/monitoring/prometheus-values.yaml`:
+Create `scaleway/manifests/base/monitoring/values-monitoring.yaml`:
 
 #### Deploy Monitoring Stack
 
@@ -133,7 +133,8 @@ helm install monitoring prometheus-community/kube-prometheus-stack \
 # Install pushgateway
 helm install monitoring-pushgateway prometheus-community/prometheus-pushgateway \
   -n monitoring --create-namespace \
-  --set serviceMonitor.enabled=true
+  --set serviceMonitor.enabled=true \
+  --set serviceMonitor.additionalLabels.release=monitoring
 
 # Monitor deployment progress
 kubectl get pods -n monitoring -w
scaleway/clean-monitoring.sh (new executable file, 25 lines)
@@ -0,0 +1,25 @@
# 1. Remove all ClusterRoles from the previous installation
kubectl delete clusterrole monitoring-grafana-clusterrole --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-operator --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-prometheus --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-state-metrics --ignore-not-found=true

# 2. Remove ClusterRoleBindings
kubectl delete clusterrolebinding monitoring-grafana-clusterrolebinding --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-operator --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-prometheus --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-state-metrics --ignore-not-found=true

# 3. Remove any leftover webhook configurations
kubectl delete mutatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete validatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true

# 4. Check for other monitoring resources
kubectl get clusterroles | grep monitoring
kubectl get clusterrolebindings | grep monitoring

# 5. If any resources remain, delete them:
kubectl get clusterroles | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrole
kubectl get clusterrolebindings | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrolebinding
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           volumeMounts:
           - name: logs-volume
             mountPath: /app/logs
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "100m"
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "200m"
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
@@ -1,71 +0,0 @@
# prometheus-values.yaml
# Global settings
fullnameOverride: "monitoring"

# Prometheus configuration
prometheus:
  prometheusSpec:
    retention: 15d
    resources:
      limits:
        cpu: 500m
        memory: 2Gi
      requests:
        cpu: 100m
        memory: 512Mi
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi

# Grafana configuration
grafana:
  enabled: true
  adminPassword: "admin123"  # Change this for production
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 50m
      memory: 128Mi
  persistence:
    enabled: true
    size: 2Gi

# AlertManager configuration
alertmanager:
  alertmanagerSpec:
    resources:
      limits:
        cpu: 100m
        memory: 256Mi
      requests:
        cpu: 10m
        memory: 64Mi
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 2Gi

# Node Exporter
nodeExporter:
  enabled: true

# Kube State Metrics
kubeStateMetrics:
  enabled: true

# Disable components you might not need in staging
kubeEtcd:
  enabled: false
kubeScheduler:
  enabled: false
kubeControllerManager:
  enabled: false
@@ -9,7 +9,7 @@ global:
 # Prometheus configuration
 prometheus:
   prometheusSpec:
-    retention: 30d
+    retention: 7d
     storageSpec:
       volumeClaimTemplate:
         spec:
@@ -17,21 +17,7 @@ prometheus:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 50Gi
-
-    # External services monitoring (Scaleway managed services)
-    additionalScrapeConfigs:
-      - job_name: 'scaleway-redis'
-        static_configs:
-          - targets: ['redis-endpoint:6379']
-        metrics_path: /metrics
-        scrape_interval: 30s
-
-      - job_name: 'scaleway-postgresql'
-        static_configs:
-          - targets: ['postgres-endpoint:5432']
-        metrics_path: /metrics
-        scrape_interval: 30s
+              storage: 5Gi
 
     # Resource limits
     resources:
@@ -48,7 +34,7 @@ grafana:
   persistence:
     enabled: true
     storageClassName: scw-bssd
-    size: 10Gi
+    size: 2Gi
 
   # Resource limits
   resources:
@@ -97,7 +83,7 @@ alertmanager:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 10Gi
+              storage: 1Gi
     resources:
       requests:
         memory: 128Mi