From fa452e4934297649921b4a7330328d7827c447de Mon Sep 17 00:00:00 2001
From: Josako
Date: Tue, 30 Sep 2025 14:56:08 +0200
Subject: [PATCH 1/2] - Change manifests for Prometheus installation
 - Change instructions for deploying the Prometheus stack and Pushgateway
 - Add grouping to Pushgateway pushes to avoid overwriting of metrics across
   different pods / processes
 - Bugfix to ensure correct retrieval of CSS and JS files in eveai_app
---
 common/utils/business_event.py                 |  16 +++-
 common/utils/template_filters.py               |  39 +++++---
 config/config.py                               |   4 +
 config/static-manifest/manifest.json           |   4 +-
 documentation/PUSHGATEWAY_GROUPING.md          | 120 +++++++++++++++++++
 .../Production Setup/cluster-install.md        |   5 +-
 .../eveai-chat-workers/deployment.yaml         |   8 ++
 .../eveai-entitlements/deployment.yaml         |   8 ++
 .../backend/eveai-workers/deployment.yaml      |   8 ++
 .../frontend/eveai-api/deployment.yaml         |   8 ++
 .../frontend/eveai-app/deployment.yaml         |   8 ++
 .../eveai-chat-client/deployment.yaml          |   8 ++
 .../base/monitoring/prometheus-values.yaml     |  71 ------------
 .../base/monitoring/values-monitoring.yaml     |  22 +---
 14 files changed, 220 insertions(+), 109 deletions(-)
 create mode 100644 documentation/PUSHGATEWAY_GROUPING.md
 delete mode 100644 scaleway/manifests/base/monitoring/prometheus-values.yaml

diff --git a/common/utils/business_event.py b/common/utils/business_event.py
index a505a89..2d42329 100644
--- a/common/utils/business_event.py
+++ b/common/utils/business_event.py
@@ -559,12 +559,24 @@ class BusinessEvent:
         self._log_buffer = []
 
     def _push_to_gateway(self):
-        # Push metrics to the gateway
+        # Push metrics to the gateway with a grouping key to avoid overwrites across pods/processes
         try:
+            # Determine the grouping labels
+            pod_name = current_app.config.get('POD_NAME', current_app.config.get('COMPONENT_NAME', 'dev'))
+            pod_namespace = current_app.config.get('POD_NAMESPACE', current_app.config.get('FLASK_ENV', 'dev'))
+            worker_id = str(os.getpid())
+
+            grouping_key = {
+                'instance': pod_name,
+                'namespace': pod_namespace,
+                'process': worker_id,
+            }
+
             push_to_gateway(
                 current_app.config['PUSH_GATEWAY_URL'],
                 job=current_app.config['COMPONENT_NAME'],
-                registry=REGISTRY
+                registry=REGISTRY,
+                grouping_key=grouping_key,
             )
         except Exception as e:
             current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
diff --git a/common/utils/template_filters.py b/common/utils/template_filters.py
index 8eff017..ca7106a 100644
--- a/common/utils/template_filters.py
+++ b/common/utils/template_filters.py
@@ -110,28 +110,39 @@ def get_pagination_html(pagination, endpoint, **kwargs):
 def asset_url(logical_path: str):
     """
     Resolve an asset logical path to a hashed URL using Parcel manifest when available.
-    Fallback to the original logical path under /static/ if manifest is missing.
+    Return a URL that respects STATIC_URL (CDN) when configured; otherwise serve from /static/.
 
     Examples:
-    - asset_url('dist/chat-client.js') -> '/static/dist/chat-client.abc123.js'
-    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css'
+    - asset_url('dist/chat-client.js') -> 'https://cdn/.../dist/chat-client.abc123.js' (when STATIC_URL is set)
+    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css' (when STATIC_URL is not set)
     """
     if not logical_path:
         return logical_path
     try:
         from common.utils.asset_manifest import resolve_asset
-        resolved = resolve_asset(logical_path)
-        if not resolved:
-            return f"/static/{logical_path.lstrip('/')}"
-        # If resolved is already an absolute URL starting with /static or http(s), return as is
-        if resolved.startswith('/static/') or resolved.startswith('http://') or resolved.startswith('https://'):
+        # Resolve the logical path to a (possibly hashed) asset path
+        resolved = resolve_asset(logical_path) or logical_path
+
+        # If the manifest returns an absolute URL, return it as-is
+        if resolved.startswith('http://') or resolved.startswith('https://'):
             return resolved
-        # If it starts with 'dist/', prefix /static/
-        if resolved.startswith('dist/'):
-            return '/static/' + resolved
-        # Otherwise, best effort: ensure it lives under /static/
-        return '/static/' + resolved.lstrip('/')
+
+        # Normalize: strip any leading '/static/' and any leading '/'
+        if resolved.startswith('/static/'):
+            rel = resolved[len('/static/'):]
+        else:
+            rel = resolved.lstrip('/')
+
+        # Build against STATIC_URL when configured
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        if static_base:
+            return f"{static_base}/{rel}"
+        # Fall back to the app's own /static/ route
+        return f"/static/{rel}"
     except Exception:
-        return f"/static/{logical_path.lstrip('/')}"
+        # Conservative fallback that also respects STATIC_URL
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        rel = logical_path.lstrip('/')
+        return f"{static_base}/{rel}" if static_base else f"/static/{rel}"
 
 
 def register_filters(app):
diff --git a/config/config.py b/config/config.py
index a384a96..f3d9737 100644
--- a/config/config.py
+++ b/config/config.py
@@ -439,6 +439,10 @@ class StagingConfig(Config):
     MINIO_SECRET_KEY = environ.get('MINIO_SECRET_KEY')
     MINIO_USE_HTTPS = True
 
+    # Pushgateway grouping labels (injected via the Kubernetes Downward API)
+    POD_NAME = environ.get('POD_NAME')
+    POD_NAMESPACE = environ.get('POD_NAMESPACE')
+
 
 class ProdConfig(Config):
     DEVELOPMENT = False
diff --git a/config/static-manifest/manifest.json b/config/static-manifest/manifest.json
index a28e9ca..e401916 100644
--- a/config/static-manifest/manifest.json
+++ b/config/static-manifest/manifest.json
@@ -1,6 +1,6 @@
 {
-  "dist/chat-client.js": "dist/chat-client.25888758.js",
-  "dist/chat-client.css": "dist/chat-client.eef0ef31.css",
+  "dist/chat-client.js": "dist/chat-client.6bfbd765.js",
+  "dist/chat-client.css": "dist/chat-client.33f904ba.css",
   "dist/main.js": "dist/main.f3dde0f6.js",
   "dist/main.css": "dist/main.c40e57ad.css"
 }
\ No newline at end of file
diff --git a/documentation/PUSHGATEWAY_GROUPING.md b/documentation/PUSHGATEWAY_GROUPING.md
new file mode 100644
index 0000000..515f2df
--- /dev/null
+++ b/documentation/PUSHGATEWAY_GROUPING.md
@@ -0,0 +1,120 @@
+# Pushgateway Grouping Keys (instance, namespace, process)
+
+Goal: prevent metrics pushed by different Pods or worker processes from overwriting each other, while keeping Prometheus/Grafana queries simple.
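+
+A minimal sketch of such a push (assuming the prometheus_client package and a
+reachable gateway; the metric, job, and env-var defaults here are illustrative):
+
+```python
+import os
+
+from prometheus_client import CollectorRegistry, Counter, push_to_gateway
+
+# Use a dedicated registry so we only push what this process owns.
+registry = CollectorRegistry()
+# prometheus_client appends "_total" on exposition -> eveai_llm_calls_total.
+calls = Counter('eveai_llm_calls', 'Number of LLM calls', registry=registry)
+calls.inc()
+
+# Each distinct (job, grouping_key) pair is a separate group in the
+# Pushgateway, so two pods (or two worker processes inside one pod) can
+# push the same metric names without overwriting each other.
+push_to_gateway(
+    os.getenv('PUSH_GATEWAY_URL', 'http://localhost:9091'),
+    job='eveai-app',  # illustrative job name
+    registry=registry,
+    grouping_key={
+        'instance': os.getenv('POD_NAME', 'dev'),
+        'namespace': os.getenv('POD_NAMESPACE', 'dev'),
+        'process': str(os.getpid()),
+    },
+)
+```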
+
+Summary of decisions
+- WORKER_ID source = OS process ID (PID)
+- Always include namespace in grouping labels
+
+What this changes
+- Every push to the Prometheus Pushgateway now includes a grouping_key with:
+  - instance = POD_NAME (falls back to COMPONENT_NAME, then "dev")
+  - namespace = POD_NAMESPACE (falls back to FLASK_ENV, then "dev")
+  - process = WORKER_ID (the current OS PID)
+- Prometheus will expose these as exported_instance, exported_namespace, and exported_process on the scraped series.
+
+Code changes (already implemented)
+- common/utils/business_event.py
+  - push_to_gateway(..., grouping_key={instance, namespace, process})
+  - Safe fallbacks ensure dev/test (Podman) keeps working without K8s-specific env vars.
+
+Kubernetes manifests (already implemented)
+- All Deployments that push metrics set env vars via the Downward API:
+  - POD_NAME from metadata.name
+  - POD_NAMESPACE from metadata.namespace
+- Files updated:
+  - scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
+  - scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
+  - scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
+
+No changes needed to secrets
+- PUSH_GATEWAY_HOST/PORT remain provided via eveai-secrets; the code composes PUSH_GATEWAY_URL internally.
+
+How to verify
+1) Pushgateway contains per-pod/process groups
+   - Port-forward the Pushgateway (namespace monitoring):
+     - kubectl -n monitoring port-forward svc/monitoring-pushgateway-prometheus-pushgateway 9091:9091
+   - Inspect (the API wraps the groups in a "data" array):
+     - curl -s http://127.0.0.1:9091/api/v1/metrics | jq '.data[].labels'
+   - You should see labels including job (your service), instance (pod), namespace, and process (pid).
+
+2) Prometheus shows the labels as exported_*
+   - Port-forward Prometheus (namespace monitoring):
+     - kubectl -n monitoring port-forward svc/monitoring-prometheus 9090:9090
+   - Example queries (PromQL; label_values(...) is the equivalent Grafana variable query):
+     - count by (exported_instance) (eveai_llm_calls_total)
+     - count by (exported_namespace) (eveai_llm_calls_total)
+     - count by (exported_process) (eveai_llm_calls_total)
+
+PromQL query patterns
+- Hide per-process detail by aggregating away exported_process:
+  - sum by (exported_job, exported_instance, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+- Service-level totals (hide instance and process):
+  - sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+- Histogram example (p95 per service):
+  - histogram_quantile(0.95, sum by (le, exported_job, exported_namespace) (rate(eveai_llm_duration_seconds_bucket[5m])))
+
+Dev/Test (Podman) behavior
+- No Kubernetes Downward API: POD_NAME/POD_NAMESPACE are not set.
+- Fallbacks used by the code:
+  - instance = COMPONENT_NAME if available, else "dev"
+  - namespace = FLASK_ENV if available, else "dev"
+  - process = current PID
+- This guarantees no crashes and still avoids process-level overwrites.
+
+Operational notes
+- Cardinality: adding process creates more series (one per worker). This is required to avoid data loss when multiple workers push concurrently. Dashboards should aggregate away exported_process unless you need per-worker detail; see the recording-rule sketch below.
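+  As a sketch, that aggregation can be captured once in a Prometheus recording
+  rule instead of being repeated in every dashboard (rule-file YAML; group and
+  rule names here are illustrative):
+
+  ```yaml
+  groups:
+    - name: eveai-pushgateway-aggregation
+      rules:
+        # Service-level rate with per-instance/per-process detail aggregated away.
+        - record: job_namespace:eveai_llm_calls:rate5m
+          expr: sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+  ```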
+- Batch jobs (future): use the same grouping and consider delete_from_gateway on successful completion to remove stale groups for that job/instance/process.
+
+Troubleshooting
+- If you still see overwriting:
+  - Confirm that instance, namespace, and process all appear in the Pushgateway JSON labels for each group.
+  - Ensure that all pods set POD_NAME and POD_NAMESPACE (kubectl -n eveai-staging exec <pod> -- env | egrep "POD_NAME|POD_NAMESPACE").
+  - Verify that your app processes run push_to_gateway through the shared business_event wrapper.
+
+Change log reference
+- Implemented on 2025-09-26 by adding a grouping_key to the business_event push and env vars to the Deployments.
diff --git a/documentation/Production Setup/cluster-install.md b/documentation/Production Setup/cluster-install.md
index 97974ad..7186662 100644
--- a/documentation/Production Setup/cluster-install.md
+++ b/documentation/Production Setup/cluster-install.md
@@ -119,7 +119,7 @@ helm search repo prometheus-community/kube-prometheus-stack
 
 #### Create Monitoring Values File
 
-Create `scaleway/manifests/base/monitoring/prometheus-values.yaml`:
+Create `scaleway/manifests/base/monitoring/values-monitoring.yaml`:
 
 #### Deploy Monitoring Stack
 
@@ -133,7 +133,8 @@ helm install monitoring prometheus-community/kube-prometheus-stack \
 # Install pushgateway
 helm install monitoring-pushgateway prometheus-community/prometheus-pushgateway \
   -n monitoring --create-namespace \
-  --set serviceMonitor.enabled=true
+  --set serviceMonitor.enabled=true \
+  --set serviceMonitor.additionalLabels.release=monitoring
 
 # Monitor deployment progress
 kubectl get pods -n monitoring -w
diff --git a/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
index 26b9bcf..73a2c4e 100644
--- a/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           volumeMounts:
             - name: logs-volume
               mountPath: /app/logs
diff --git a/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
index 1b6a6cc..93cea7b 100644
--- a/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "100m"
diff --git a/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
index 7c220c1..016e00e 100644
--- a/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
            requests:
              cpu: "200m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
index 2b785ad..431bfe7 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
index 7c0e414..e35e13a 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
index 7103b92..26c64eb 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/monitoring/prometheus-values.yaml b/scaleway/manifests/base/monitoring/prometheus-values.yaml
deleted file mode 100644
index 03d393f..0000000
--- a/scaleway/manifests/base/monitoring/prometheus-values.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-# prometheus-values.yaml
-# Global settings
-fullnameOverride: "monitoring"
-
-# Prometheus configuration
-prometheus:
-  prometheusSpec:
-    retention: 15d
-    resources:
-      limits:
-        cpu: 500m
-        memory: 2Gi
-      requests:
-        cpu: 100m
-        memory: 512Mi
-    storageSpec:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 10Gi
-
-# Grafana configuration
-grafana:
-  enabled: true
-  adminPassword: "admin123"  # Change this for production
-  resources:
-    limits:
-      cpu: 200m
-      memory: 256Mi
-    requests:
-      cpu: 50m
-      memory: 128Mi
-  persistence:
-    enabled: true
-    size: 2Gi
-
-# AlertManager configuration
-alertmanager:
-  alertmanagerSpec:
-    resources:
-      limits:
-        cpu: 100m
-        memory: 256Mi
-      requests:
-        cpu: 10m
-        memory: 64Mi
-    storage:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 2Gi
-
-# Node Exporter
-nodeExporter:
-  enabled: true
-
-# Kube State Metrics
-kubeStateMetrics:
-  enabled: true
-
-# Disable components you might not need in staging
-kubeEtcd:
-  enabled: false
-kubeScheduler:
-  enabled: false
-kubeControllerManager:
-  enabled: false
\ No newline at end of file
diff --git a/scaleway/manifests/base/monitoring/values-monitoring.yaml b/scaleway/manifests/base/monitoring/values-monitoring.yaml
index 2bbd408..9e3c704 100644
--- 
a/scaleway/manifests/base/monitoring/values-monitoring.yaml
+++ b/scaleway/manifests/base/monitoring/values-monitoring.yaml
@@ -9,7 +9,7 @@ global:
 # Prometheus configuration
 prometheus:
   prometheusSpec:
-    retention: 30d
+    retention: 7d
     storageSpec:
       volumeClaimTemplate:
         spec:
@@ -17,21 +17,7 @@ prometheus:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 50Gi
-
-  # External services monitoring (Scaleway managed services)
-  additionalScrapeConfigs:
-    - job_name: 'scaleway-redis'
-      static_configs:
-        - targets: ['redis-endpoint:6379']
-      metrics_path: /metrics
-      scrape_interval: 30s
-
-    - job_name: 'scaleway-postgresql'
-      static_configs:
-        - targets: ['postgres-endpoint:5432']
-      metrics_path: /metrics
-      scrape_interval: 30s
+              storage: 5Gi
 
     # Resource limits
     resources:
@@ -48,7 +34,7 @@ grafana:
   persistence:
     enabled: true
     storageClassName: scw-bssd
-    size: 10Gi
+    size: 2Gi
 
   # Resource limits
   resources:
@@ -97,7 +83,7 @@ alertmanager:
         accessModes: ["ReadWriteOnce"]
         resources:
           requests:
-            storage: 10Gi
+            storage: 1Gi
     resources:
       requests:
         memory: 128Mi

From 030d1b0e9078b0b3a80097117e7cceed4468c21c Mon Sep 17 00:00:00 2001
From: Josako
Date: Tue, 30 Sep 2025 14:58:08 +0200
Subject: [PATCH 2/2] - Cleanup script for leftover monitoring resources
---
 scaleway/clean-monitoring.sh | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100755 scaleway/clean-monitoring.sh

diff --git a/scaleway/clean-monitoring.sh b/scaleway/clean-monitoring.sh
new file mode 100755
index 0000000..2546621
--- /dev/null
+++ b/scaleway/clean-monitoring.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# 1. Remove all ClusterRoles from the previous installation
+kubectl delete clusterrole monitoring-grafana-clusterrole --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-operator --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-prometheus --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-state-metrics --ignore-not-found=true
+
+# 2. Remove the ClusterRoleBindings
+kubectl delete clusterrolebinding monitoring-grafana-clusterrolebinding --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-operator --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-prometheus --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-state-metrics --ignore-not-found=true
+
+# 3. Remove any leftover webhook configurations
+kubectl delete mutatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete validatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
+
+# 4. Check for other monitoring resources
+kubectl get clusterroles | grep monitoring
+kubectl get clusterrolebindings | grep monitoring
+
+# 5. If any resources remain, delete them:
+kubectl get clusterroles | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrole
+kubectl get clusterrolebindings | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrolebinding
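+
+# 6. Sanity check (illustrative): confirm no monitoring RBAC remains before reinstalling
+kubectl get clusterroles,clusterrolebindings -o name | grep monitoring || echo "monitoring RBAC clean"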