- Change manifests for Prometheus installation

- Change instructions for deploying the Prometheus stack and Pushgateway
- Additional grouping key for Pushgateway pushes, to avoid metrics from different pods/processes overwriting each other
- Bugfix to ensure correct retrieval of CSS and JS files in eveai_app
This commit is contained in:
Josako
2025-09-30 14:56:08 +02:00
parent a76f87ba75
commit fa452e4934
14 changed files with 179 additions and 109 deletions

View File

@@ -559,12 +559,24 @@ class BusinessEvent:
         self._log_buffer = []

     def _push_to_gateway(self):
-        # Push metrics to the gateway
+        # Push metrics to the gateway with grouping key to avoid overwrites across pods/processes
         try:
+            # Determine grouping labels
+            pod_name = current_app.config.get('POD_NAME', current_app.config.get('COMPONENT_NAME', 'dev'))
+            pod_namespace = current_app.config.get('POD_NAMESPACE', current_app.config.get('FLASK_ENV', 'dev'))
+            worker_id = str(os.getpid())
+            grouping_key = {
+                'instance': pod_name,
+                'namespace': pod_namespace,
+                'process': worker_id,
+            }
             push_to_gateway(
                 current_app.config['PUSH_GATEWAY_URL'],
                 job=current_app.config['COMPONENT_NAME'],
-                registry=REGISTRY
+                registry=REGISTRY,
+                grouping_key=grouping_key,
             )
         except Exception as e:
             current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
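
For reference, a minimal standalone sketch of the same grouping-key pattern with prometheus_client (the gateway address and job name below are illustrative placeholders, not values from the repo config):

```python
# Sketch: push with a grouping key so concurrent pods/workers each get
# their own Pushgateway group instead of overwriting a shared one.
import os

from prometheus_client import CollectorRegistry, Counter, push_to_gateway

registry = CollectorRegistry()
calls = Counter('eveai_llm_calls_total', 'Total LLM calls', registry=registry)
calls.inc()

# Without grouping_key, every push to the same job replaces the previous
# group; with it, each (job, instance, namespace, process) is kept apart.
push_to_gateway(
    'pushgateway.monitoring:9091',  # illustrative gateway address
    job='eveai-app',                # illustrative job name
    registry=registry,
    grouping_key={
        'instance': os.getenv('POD_NAME', 'dev'),
        'namespace': os.getenv('POD_NAMESPACE', 'dev'),
        'process': str(os.getpid()),
    },
)
```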

View File

@@ -110,28 +110,39 @@ def get_pagination_html(pagination, endpoint, **kwargs):

 def asset_url(logical_path: str):
     """
     Resolve an asset logical path to a hashed URL using Parcel manifest when available.
-    Fallback to the original logical path under /static/ if manifest is missing.
+    Return a URL that respects STATIC_URL (CDN) when configured; otherwise serve from /static/.
     Examples:
-    - asset_url('dist/chat-client.js') -> '/static/dist/chat-client.abc123.js'
-    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css'
+    - asset_url('dist/chat-client.js') -> 'https://cdn/.../dist/chat-client.abc123.js' (when STATIC_URL set)
+    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css' (when STATIC_URL not set)
     """
     if not logical_path:
         return logical_path
     try:
         from common.utils.asset_manifest import resolve_asset
-        resolved = resolve_asset(logical_path)
-        if not resolved:
-            return f"/static/{logical_path.lstrip('/')}"
-        # If resolved is already an absolute URL starting with /static or http(s), return as is
-        if resolved.startswith('/static/') or resolved.startswith('http://') or resolved.startswith('https://'):
+        # Resolve logical to possibly hashed path
+        resolved = resolve_asset(logical_path) or logical_path
+        # If manifest returns an absolute URL, return as-is
+        if resolved.startswith('http://') or resolved.startswith('https://'):
             return resolved
-        # If it starts with 'dist/', prefix /static/
-        if resolved.startswith('dist/'):
-            return '/static/' + resolved
-        # Otherwise, best effort: ensure it lives under /static/
-        return '/static/' + resolved.lstrip('/')
+        # Normalize: strip any leading '/static/' and leading '/'
+        if resolved.startswith('/static/'):
+            rel = resolved[len('/static/'):]
+        else:
+            rel = resolved.lstrip('/')
+        # Build with STATIC_URL if configured
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        if static_base:
+            return f"{static_base}/{rel}"
+        # Fallback to app static
+        return f"/static/{rel}"
     except Exception:
-        return f"/static/{logical_path.lstrip('/')}"
+        # Conservative fallback also respecting STATIC_URL
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        rel = logical_path.lstrip('/')
+        return f"{static_base}/{rel}" if static_base else f"/static/{rel}"

 def register_filters(app):
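
A quick standalone illustration of the URL-building rule above (the CDN base is a made-up example; the hashed filenames come from the manifest in this commit):

```python
# Standalone sketch of the normalization step in asset_url(); the CDN base
# here is a made-up example, not a repo value.
def build_url(resolved: str, static_base: str = '') -> str:
    if resolved.startswith(('http://', 'https://')):
        return resolved
    rel = resolved[len('/static/'):] if resolved.startswith('/static/') else resolved.lstrip('/')
    static_base = static_base.rstrip('/')
    return f"{static_base}/{rel}" if static_base else f"/static/{rel}"

assert build_url('dist/chat-client.6bfbd765.js') == '/static/dist/chat-client.6bfbd765.js'
assert build_url('/static/dist/main.c40e57ad.css') == '/static/dist/main.c40e57ad.css'
assert build_url('dist/main.f3dde0f6.js', 'https://cdn.example.com/assets/') == \
    'https://cdn.example.com/assets/dist/main.f3dde0f6.js'
```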

View File

@@ -439,6 +439,10 @@ class StagingConfig(Config):
     MINIO_SECRET_KEY = environ.get('MINIO_SECRET_KEY')
     MINIO_USE_HTTPS = True

+    # Push gateway grouping elements
+    POD_NAME = environ.get('POD_NAME')
+    POD_NAMESPACE = environ.get('POD_NAMESPACE')

 class ProdConfig(Config):
     DEVELOPMENT = False
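
Note: Flask's config.from_object() only imports uppercase attributes, so these names must be uppercase for the current_app.config.get('POD_NAME') lookup in business_event to work; a lowercase pod_name would be silently ignored. A minimal check, assuming the app loads its config via from_object():

```python
# Demonstrates why the grouping attributes must be uppercase:
# Flask's config.from_object() skips lowercase attributes entirely.
from flask import Flask

class Cfg:
    POD_NAME = 'pod-a'    # imported into app.config
    pod_name = 'ignored'  # silently skipped by from_object()

app = Flask(__name__)
app.config.from_object(Cfg)
assert app.config['POD_NAME'] == 'pod-a'
assert 'pod_name' not in app.config
```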

View File

@@ -1,6 +1,6 @@
 {
-  "dist/chat-client.js": "dist/chat-client.25888758.js",
-  "dist/chat-client.css": "dist/chat-client.eef0ef31.css",
+  "dist/chat-client.js": "dist/chat-client.6bfbd765.js",
+  "dist/chat-client.css": "dist/chat-client.33f904ba.css",
   "dist/main.js": "dist/main.f3dde0f6.js",
   "dist/main.css": "dist/main.c40e57ad.css"
 }
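
This manifest is what resolve_asset() consults when mapping a logical path to its hashed build artifact. A minimal sketch of such a lookup (hypothetical; the actual helper in common/utils/asset_manifest.py and the manifest location may differ):

```python
# Hypothetical sketch of a Parcel-manifest lookup like resolve_asset();
# the real implementation and manifest path may differ.
import json
from pathlib import Path

def resolve_asset(logical_path: str,
                  manifest_file: str = 'static/dist/manifest.json') -> str | None:
    try:
        manifest = json.loads(Path(manifest_file).read_text())
    except (OSError, ValueError):
        return None  # missing/corrupt manifest -> caller falls back to logical path
    return manifest.get(logical_path)  # e.g. 'dist/chat-client.js' -> hashed file
```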

View File

@@ -0,0 +1,79 @@
# Pushgateway Grouping Keys (instance, namespace, process)

Goal: prevent metrics pushed by different Pods or worker processes from overwriting each other, while keeping Prometheus/Grafana queries simple.

## Summary of decisions

- WORKER_ID source = OS process ID (PID)
- Always include namespace in grouping labels

## What this changes
- Every push to the Prometheus Pushgateway now includes a grouping_key with:
  - instance = POD_NAME (fallback to HOSTNAME, then "dev")
  - namespace = POD_NAMESPACE (fallback to ENVIRONMENT, then "dev")
  - process = WORKER_ID (fallback to current PID)
- Prometheus will expose these as exported_instance, exported_namespace, and exported_process on the scraped series.

## Code changes (already implemented)

- common/utils/business_event.py
  - push_to_gateway(..., grouping_key={instance, namespace, process})
  - Safe fallbacks ensure dev/test (Podman) keeps working with no K8s-specific env vars.

## Kubernetes manifests (already implemented)

- All Deployments that push metrics set env vars via the Downward API:
  - POD_NAME from metadata.name
  - POD_NAMESPACE from metadata.namespace
- Files updated:
  - scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
  - scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
  - scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
  - scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml

## No changes needed to secrets

- PUSH_GATEWAY_HOST/PORT remain provided via eveai-secrets; the code composes PUSH_GATEWAY_URL internally.

## How to verify

1) Pushgateway contains per-pod/process groups
- Port-forward the Pushgateway (namespace monitoring):
  - kubectl -n monitoring port-forward svc/monitoring-pushgateway-prometheus-pushgateway 9091:9091
- Inspect the groups (the API wraps results in a data field):
  - curl -s http://127.0.0.1:9091/api/v1/metrics | jq '.data[].labels'
- You should see labels including job (your service), instance (pod), namespace, and process (pid).

2) Prometheus shows the labels as exported_*
- Port-forward Prometheus (namespace monitoring):
  - kubectl -n monitoring port-forward svc/monitoring-prometheus 9090:9090
- Queries (label_values is Grafana templating syntax, for dashboard variables):
  - label_values(eveai_llm_calls_total, exported_instance)
  - label_values(eveai_llm_calls_total, exported_namespace)
  - label_values(eveai_llm_calls_total, exported_process)

## PromQL query patterns

- Hide per-process detail by aggregating away exported_process:
  - sum by (exported_job, exported_instance, exported_namespace) (rate(eveai_llm_calls_total[5m]))
- Service-level totals (hide instance and process):
  - sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
- Histogram example (p95 per service):
  - histogram_quantile(0.95, sum by (le, exported_job, exported_namespace) (rate(eveai_llm_duration_seconds_bucket[5m])))

## Dev/Test (Podman) behavior

- No Kubernetes Downward API: POD_NAME/POD_NAMESPACE are not set.
- Fallbacks used by the code:
  - instance = HOSTNAME if available, else "dev"
  - namespace = ENVIRONMENT if available, else "dev"
  - process = current PID
- This guarantees no crashes and still avoids process-level overwrites.

## Operational notes

- Cardinality: adding process creates more series (one per worker). This is required to avoid data loss when multiple workers push concurrently. Dashboards should aggregate away exported_process unless you need per-worker detail.
- Batch jobs (future): use the same grouping and consider delete_from_gateway on successful completion to remove stale groups for that job/instance/process; see the sketch below.
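
A minimal sketch of that cleanup step with prometheus_client (the gateway address and job name are illustrative placeholders):

```python
# Sketch: drop a batch job's Pushgateway group after a successful run so
# its last-pushed values do not linger as stale series.
import os

from prometheus_client import delete_from_gateway

delete_from_gateway(
    'pushgateway.monitoring:9091',  # illustrative gateway address
    job='eveai-batch',              # illustrative job name
    grouping_key={
        'instance': os.getenv('POD_NAME', 'dev'),
        'namespace': os.getenv('POD_NAMESPACE', 'dev'),
        'process': str(os.getpid()),
    },
)
```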

## Troubleshooting

If you still see overwriting:
- Confirm that instance, namespace, and process all appear in the Pushgateway JSON labels for each group.
- Ensure that all pods set POD_NAME and POD_NAMESPACE (kubectl -n eveai-staging exec <pod> -- env | egrep "POD_NAME|POD_NAMESPACE").
- Verify that your app processes run push_to_gateway through the shared business_event wrapper.

## Change log reference

- Implemented on 2025-09-26 by adding grouping_key in the business_event push and env vars in the Deployments.

View File

@@ -119,7 +119,7 @@ helm search repo prometheus-community/kube-prometheus-stack

 #### Create Monitoring Values File

-Create `scaleway/manifests/base/monitoring/prometheus-values.yaml`:
+Create `scaleway/manifests/base/monitoring/values-monitoring.yaml`:

 #### Deploy Monitoring Stack

@@ -133,7 +133,8 @@ helm install monitoring prometheus-community/kube-prometheus-stack \

 # Install pushgateway
 helm install monitoring-pushgateway prometheus-community/prometheus-pushgateway \
   -n monitoring --create-namespace \
-  --set serviceMonitor.enabled=true
+  --set serviceMonitor.enabled=true \
+  --set serviceMonitor.additionalLabels.release=monitoring

 # Monitor deployment progress
 kubectl get pods -n monitoring -w

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         volumeMounts:
         - name: logs-volume
           mountPath: /app/logs

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "100m"

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "200m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -1,71 +0,0 @@
-# prometheus-values.yaml
-
-# Global settings
-fullnameOverride: "monitoring"
-
-# Prometheus configuration
-prometheus:
-  prometheusSpec:
-    retention: 15d
-    resources:
-      limits:
-        cpu: 500m
-        memory: 2Gi
-      requests:
-        cpu: 100m
-        memory: 512Mi
-    storageSpec:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 10Gi
-
-# Grafana configuration
-grafana:
-  enabled: true
-  adminPassword: "admin123"  # Change this for production
-  resources:
-    limits:
-      cpu: 200m
-      memory: 256Mi
-    requests:
-      cpu: 50m
-      memory: 128Mi
-  persistence:
-    enabled: true
-    size: 2Gi
-
-# AlertManager configuration
-alertmanager:
-  alertmanagerSpec:
-    resources:
-      limits:
-        cpu: 100m
-        memory: 256Mi
-      requests:
-        cpu: 10m
-        memory: 64Mi
-    storage:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 2Gi
-
-# Node Exporter
-nodeExporter:
-  enabled: true
-
-# Kube State Metrics
-kubeStateMetrics:
-  enabled: true
-
-# Disable components you might not need in staging
-kubeEtcd:
-  enabled: false
-kubeScheduler:
-  enabled: false
-kubeControllerManager:
-  enabled: false

View File

@@ -9,7 +9,7 @@ global:

 # Prometheus configuration
 prometheus:
   prometheusSpec:
-    retention: 30d
+    retention: 7d
     storageSpec:
       volumeClaimTemplate:
         spec:
@@ -17,21 +17,7 @@ prometheus:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 50Gi
-
-    # External services monitoring (Scaleway managed services)
-    additionalScrapeConfigs:
-      - job_name: 'scaleway-redis'
-        static_configs:
-          - targets: ['redis-endpoint:6379']
-        metrics_path: /metrics
-        scrape_interval: 30s
-      - job_name: 'scaleway-postgresql'
-        static_configs:
-          - targets: ['postgres-endpoint:5432']
-        metrics_path: /metrics
-        scrape_interval: 30s
+              storage: 5Gi

     # Resource limits
     resources:
@@ -48,7 +34,7 @@ grafana:
   persistence:
     enabled: true
     storageClassName: scw-bssd
-    size: 10Gi
+    size: 2Gi

   # Resource limits
   resources:
@@ -97,7 +83,7 @@ alertmanager:
       accessModes: ["ReadWriteOnce"]
       resources:
         requests:
-          storage: 10Gi
+          storage: 1Gi

   resources:
     requests:
       memory: 128Mi