Merge branch 'feature/Activate_Pushgateway_Scraping' into develop

Josako
2025-09-30 15:23:59 +02:00
15 changed files with 204 additions and 109 deletions

View File

@@ -559,12 +559,24 @@ class BusinessEvent:
         self._log_buffer = []
 
     def _push_to_gateway(self):
-        # Push metrics to the gateway
+        # Push metrics to the gateway with grouping key to avoid overwrites across pods/processes
         try:
+            # Determine grouping labels
+            pod_name = current_app.config.get('POD_NAME', current_app.config.get('COMPONENT_NAME', 'dev'))
+            pod_namespace = current_app.config.get('POD_NAMESPACE', current_app.config.get('FLASK_ENV', 'dev'))
+            worker_id = str(os.getpid())
+            grouping_key = {
+                'instance': pod_name,
+                'namespace': pod_namespace,
+                'process': worker_id,
+            }
             push_to_gateway(
                 current_app.config['PUSH_GATEWAY_URL'],
                 job=current_app.config['COMPONENT_NAME'],
-                registry=REGISTRY
+                registry=REGISTRY,
+                grouping_key=grouping_key,
             )
         except Exception as e:
             current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
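The grouping_key argument used above is part of prometheus_client's push API. A minimal standalone sketch for checking the behaviour, assuming a Pushgateway reachable at localhost:9091 (the job name and test metric are made up for the check):

import os
import socket

from prometheus_client import CollectorRegistry, Counter, push_to_gateway

# Standalone registry so the test metric does not leak into the app's REGISTRY
registry = CollectorRegistry()
test_counter = Counter('eveai_push_test_total', 'Test counter for grouping-key pushes', registry=registry)
test_counter.inc()

# Same grouping labels as the application code: pod/host name, namespace, worker PID
grouping_key = {
    'instance': os.environ.get('POD_NAME', socket.gethostname()),
    'namespace': os.environ.get('POD_NAMESPACE', 'dev'),
    'process': str(os.getpid()),
}

# Each distinct grouping_key gets its own group in the Pushgateway,
# so concurrent pushes from other pods/workers are not overwritten.
push_to_gateway('localhost:9091', job='eveai-push-test', registry=registry, grouping_key=grouping_key)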

View File

@@ -110,28 +110,39 @@ def get_pagination_html(pagination, endpoint, **kwargs):
 def asset_url(logical_path: str):
     """
     Resolve an asset logical path to a hashed URL using Parcel manifest when available.
-    Fallback to the original logical path under /static/ if manifest is missing.
+    Return a URL that respects STATIC_URL (CDN) when configured; otherwise serve from /static/.
     Examples:
-      - asset_url('dist/chat-client.js') -> '/static/dist/chat-client.abc123.js'
-      - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css'
+      - asset_url('dist/chat-client.js') -> 'https://cdn/.../dist/chat-client.abc123.js' (when STATIC_URL set)
+      - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css' (when STATIC_URL not set)
     """
     if not logical_path:
         return logical_path
     try:
         from common.utils.asset_manifest import resolve_asset
-        resolved = resolve_asset(logical_path)
-        if not resolved:
-            return f"/static/{logical_path.lstrip('/')}"
-        # If resolved is already an absolute URL starting with /static or http(s), return as is
-        if resolved.startswith('/static/') or resolved.startswith('http://') or resolved.startswith('https://'):
+        # Resolve logical to possibly hashed path
+        resolved = resolve_asset(logical_path) or logical_path
+        # If manifest returns an absolute URL, return as-is
+        if resolved.startswith('http://') or resolved.startswith('https://'):
             return resolved
-        # If it starts with 'dist/', prefix /static/
-        if resolved.startswith('dist/'):
-            return '/static/' + resolved
-        # Otherwise, best effort: ensure it lives under /static/
-        return '/static/' + resolved.lstrip('/')
+        # Normalize: strip any leading '/static/' and leading '/'
+        if resolved.startswith('/static/'):
+            rel = resolved[len('/static/'):]
+        else:
+            rel = resolved.lstrip('/')
+        # Build with STATIC_URL if configured
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        if static_base:
+            return f"{static_base}/{rel}"
+        # Fallback to app static
+        return f"/static/{rel}"
     except Exception:
-        return f"/static/{logical_path.lstrip('/')}"
+        # Conservative fallback also respecting STATIC_URL
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        rel = logical_path.lstrip('/')
+        return f"{static_base}/{rel}" if static_base else f"/static/{rel}"
 
 def register_filters(app):
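The URL-building rules above are easier to follow in isolation. A minimal sketch of the same STATIC_URL normalization, without the Flask and manifest plumbing (the CDN base in the usage lines is a made-up example):

def build_asset_url(resolved: str, static_url: str = '') -> str:
    # Absolute URLs from the manifest pass through untouched
    if resolved.startswith(('http://', 'https://')):
        return resolved
    # Strip a leading '/static/' or '/' so we always work with a relative path
    rel = resolved[len('/static/'):] if resolved.startswith('/static/') else resolved.lstrip('/')
    # Prefer the configured CDN base (STATIC_URL), otherwise fall back to Flask's /static/
    base = static_url.rstrip('/')
    return f"{base}/{rel}" if base else f"/static/{rel}"

print(build_asset_url('dist/chat-client.6bfbd765.js'))
# -> /static/dist/chat-client.6bfbd765.js
print(build_asset_url('/static/dist/chat-client.6bfbd765.js', 'https://cdn.example.com/assets'))
# -> https://cdn.example.com/assets/dist/chat-client.6bfbd765.js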

View File

@@ -439,6 +439,10 @@ class StagingConfig(Config):
     MINIO_SECRET_KEY = environ.get('MINIO_SECRET_KEY')
     MINIO_USE_HTTPS = True
 
+    # Push gateway grouping elements
+    POD_NAME = environ.get('POD_NAME')
+    POD_NAMESPACE = environ.get('POD_NAMESPACE')
+
 
 class ProdConfig(Config):
     DEVELOPMENT = False
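The uppercase names matter here: Flask's app.config.from_object() only copies attributes written in all caps, so lowercase names would never show up in current_app.config. A minimal sketch of that behaviour (class name and values are illustrative):

from flask import Flask

class DemoConfig:
    POD_NAME = 'frontend-abc123'      # uppercase: picked up by from_object
    pod_namespace = 'eveai-staging'   # lowercase: silently ignored

app = Flask(__name__)
app.config.from_object(DemoConfig)

print(app.config.get('POD_NAME'))       # 'frontend-abc123'
print(app.config.get('pod_namespace'))  # None -- lowercase attributes are not loaded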

View File

@@ -1,6 +1,6 @@
 {
-  "dist/chat-client.js": "dist/chat-client.25888758.js",
-  "dist/chat-client.css": "dist/chat-client.eef0ef31.css",
+  "dist/chat-client.js": "dist/chat-client.6bfbd765.js",
+  "dist/chat-client.css": "dist/chat-client.33f904ba.css",
   "dist/main.js": "dist/main.f3dde0f6.js",
   "dist/main.css": "dist/main.c40e57ad.css"
 }

View File

@@ -0,0 +1,79 @@
# Pushgateway Grouping Keys (instance, namespace, process)
Goal: prevent metrics pushed by different Pods or worker processes from overwriting each other, while keeping Prometheus/Grafana queries simple.
Summary of decisions
- WORKER_ID source = OS process ID (PID)
- Always include namespace in grouping labels
What this changes
- Every push to Prometheus Pushgateway now includes a grouping_key with:
- instance = POD_NAME (fallback to HOSTNAME, then "dev")
- namespace = POD_NAMESPACE (fallback to ENVIRONMENT, then "dev")
- process = WORKER_ID (fallback to current PID)
- Prometheus will expose these as exported_instance, exported_namespace, and exported_process on the scraped series.
Code changes (already implemented)
- common/utils/business_event.py
- push_to_gateway(..., grouping_key={instance, namespace, process})
- Safe fallbacks ensure dev/test (Podman) keeps working with no K8s-specific env vars.
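A standalone sketch of the fallback chain described above; the environment variable names follow this document, and outside Kubernetes none of them are set, so the 'dev'/PID defaults apply:

import os

def grouping_labels() -> dict:
    # instance: Pod name from the Downward API, else the container hostname, else 'dev'
    instance = os.environ.get('POD_NAME') or os.environ.get('HOSTNAME') or 'dev'
    # namespace: Pod namespace from the Downward API, else the ENVIRONMENT setting, else 'dev'
    namespace = os.environ.get('POD_NAMESPACE') or os.environ.get('ENVIRONMENT') or 'dev'
    # process: current worker PID, so concurrent workers never share a group
    return {'instance': instance, 'namespace': namespace, 'process': str(os.getpid())}

print(grouping_labels())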
Kubernetes manifests (already implemented)
- All Deployments that push metrics set env vars via Downward API:
- POD_NAME from metadata.name
- POD_NAMESPACE from metadata.namespace
- Files updated:
- scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
- scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
- scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
- scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
- scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
- scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
No changes needed to secrets
- PUSH_GATEWAY_HOST/PORT remain provided via eveai-secrets; code composes PUSH_GATEWAY_URL internally.
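For reference, composing the URL from those two values amounts to something like the sketch below. This is an illustration only; the actual helper lives in the application config and may differ:

import os

def push_gateway_url() -> str:
    # Host and port come from eveai-secrets via the Deployment env vars
    host = os.environ.get('PUSH_GATEWAY_HOST', 'localhost')
    port = os.environ.get('PUSH_GATEWAY_PORT', '9091')
    # A full http:// URL works directly with prometheus_client's push_to_gateway
    return f"http://{host}:{port}"

print(push_gateway_url())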
How to verify
1) Pushgateway contains per-pod/process groups
- Port-forward Pushgateway (namespace monitoring):
- kubectl -n monitoring port-forward svc/monitoring-pushgateway-prometheus-pushgateway 9091:9091
- Inspect:
- curl -s http://127.0.0.1:9091/api/v1/metrics | jq '.[].labels'
- You should see labels including job (your service), instance (pod), namespace, and process (pid); a short script below automates this check.
2) Prometheus shows the labels as exported_*
- Port-forward Prometheus (namespace monitoring):
- kubectl -n monitoring port-forward svc/monitoring-prometheus 9090:9090
- Label-value checks (run as Grafana variable queries, or via the Prometheus API endpoint /api/v1/label/<label>/values):
- label_values(eveai_llm_calls_total, exported_instance)
- label_values(eveai_llm_calls_total, exported_namespace)
- label_values(eveai_llm_calls_total, exported_process)
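The same two checks can be scripted. A minimal sketch using only the standard library, assuming both port-forwards above are running:

import json
import urllib.request

# Step 1: list the grouping labels of every group currently held by the Pushgateway
with urllib.request.urlopen('http://127.0.0.1:9091/api/v1/metrics') as resp:
    payload = json.load(resp)
# The API wraps results in a status/data envelope; be lenient about the exact shape
groups = payload.get('data', []) if isinstance(payload, dict) else payload
for group in groups:
    print('pushgateway group:', group.get('labels'))

# Step 2: ask Prometheus which label values it has scraped for the pushed series
for label in ('exported_instance', 'exported_namespace', 'exported_process'):
    url = f'http://127.0.0.1:9090/api/v1/label/{label}/values'
    with urllib.request.urlopen(url) as resp:
        print(label, '=', json.load(resp).get('data'))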
PromQL query patterns
- Hide per-process detail by aggregating away exported_process:
- sum without (exported_process) (rate(eveai_llm_calls_total[5m]))
- Service-level totals (hide instance and process):
- sum without (exported_instance, exported_process) (rate(eveai_llm_calls_total[5m]))
- Histogram example (p95 per service):
- histogram_quantile(0.95, sum by (le, exported_job, exported_namespace) (rate(eveai_llm_duration_seconds_bucket[5m])))
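These patterns can also be executed against the Prometheus HTTP API, which is handy for smoke tests. A minimal sketch running the first query above against the port-forwarded endpoint:

import json
import urllib.parse
import urllib.request

# Run the first query pattern against the port-forwarded Prometheus instance
query = 'sum without (exported_process) (rate(eveai_llm_calls_total[5m]))'
url = 'http://127.0.0.1:9090/api/v1/query?' + urllib.parse.urlencode({'query': query})
with urllib.request.urlopen(url) as resp:
    result = json.load(resp)
# Each series carries the remaining labels (job/instance/namespace) and the current value
for series in result['data']['result']:
    print(series['metric'], series['value'])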
Dev/Test (Podman) behavior
- No Kubernetes Downward API: POD_NAME/POD_NAMESPACE are not set.
- Fallbacks used by the code:
- instance = HOSTNAME if available, else "dev"
- namespace = ENVIRONMENT if available, else "dev"
- process = current PID
- This guarantees no crashes and still avoids process-level overwrites.
Operational notes
- Cardinality: adding process creates more series (one per worker). This is required to avoid data loss when multiple workers push concurrently. Dashboards should aggregate away exported_process unless you need per-worker detail.
- Batch jobs (future): use the same grouping and consider delete_from_gateway on successful completion to remove stale groups for that job/instance/process.
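A sketch of that cleanup idea using prometheus_client's delete_from_gateway with the same grouping key that was used to push; the gateway address and job name are illustrative:

import os

from prometheus_client import delete_from_gateway

# Remove the group for this job/instance/process once the batch has finished,
# so stale last-push values do not linger in the Pushgateway.
grouping_key = {
    'instance': os.environ.get('POD_NAME', 'dev'),
    'namespace': os.environ.get('POD_NAMESPACE', 'dev'),
    'process': str(os.getpid()),
}
delete_from_gateway('localhost:9091', job='eveai-batch-job', grouping_key=grouping_key)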
Troubleshooting
- If you still see overwriting:
- Confirm that instance, namespace, and process all appear in Pushgateway JSON labels for each group.
- Ensure that all pods set POD_NAME and POD_NAMESPACE (kubectl -n eveai-staging exec <pod> -- env | egrep "POD_NAME|POD_NAMESPACE").
- Verify that your app processes run push_to_gateway through the shared business_event wrapper.
Change log reference
- Implemented on 2025-09-26 by adding grouping_key in business_event push and env vars in Deployments.

View File

@@ -119,7 +119,7 @@ helm search repo prometheus-community/kube-prometheus-stack
 #### Create Monitoring Values File
 
-Create `scaleway/manifests/base/monitoring/prometheus-values.yaml`:
+Create `scaleway/manifests/base/monitoring/values-monitoring.yaml`:
 
 #### Deploy Monitoring Stack
@@ -133,7 +133,8 @@ helm install monitoring prometheus-community/kube-prometheus-stack \
 # Install pushgateway
 helm install monitoring-pushgateway prometheus-community/prometheus-pushgateway \
   -n monitoring --create-namespace \
-  --set serviceMonitor.enabled=true
+  --set serviceMonitor.enabled=true \
+  --set serviceMonitor.additionalLabels.release=monitoring
 
 # Monitor deployment progress
 kubectl get pods -n monitoring -w

scaleway/clean-monitoring.sh Executable file
View File

@@ -0,0 +1,25 @@
# 1. Remove all ClusterRoles from the previous installation
kubectl delete clusterrole monitoring-grafana-clusterrole --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-operator --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-prometheus-prometheus --ignore-not-found=true
kubectl delete clusterrole monitoring-kube-state-metrics --ignore-not-found=true
# 2. Remove the ClusterRoleBindings
kubectl delete clusterrolebinding monitoring-grafana-clusterrolebinding --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-operator --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-prometheus-prometheus --ignore-not-found=true
kubectl delete clusterrolebinding monitoring-kube-state-metrics --ignore-not-found=true
# 3. Remove any leftover webhook configurations
kubectl delete mutatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
kubectl delete validatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
# 4. Check for other monitoring resources
kubectl get clusterroles | grep monitoring
kubectl get clusterrolebindings | grep monitoring
# 5. If any resources remain, delete them:
kubectl get clusterroles | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrole
kubectl get clusterrolebindings | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrolebinding

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         volumeMounts:
         - name: logs-volume
           mountPath: /app/logs

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "100m"

View File

@@ -54,6 +54,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "200m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -49,6 +49,14 @@ spec:
               name: eveai-secrets
               key: PUSH_GATEWAY_PORT
               optional: true
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
         resources:
           requests:
             cpu: "150m"

View File

@@ -1,71 +0,0 @@
# prometheus-values.yaml
# Global settings
fullnameOverride: "monitoring"

# Prometheus configuration
prometheus:
  prometheusSpec:
    retention: 15d
    resources:
      limits:
        cpu: 500m
        memory: 2Gi
      requests:
        cpu: 100m
        memory: 512Mi
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi

# Grafana configuration
grafana:
  enabled: true
  adminPassword: "admin123"  # Change this for production
  resources:
    limits:
      cpu: 200m
      memory: 256Mi
    requests:
      cpu: 50m
      memory: 128Mi
  persistence:
    enabled: true
    size: 2Gi

# AlertManager configuration
alertmanager:
  alertmanagerSpec:
    resources:
      limits:
        cpu: 100m
        memory: 256Mi
      requests:
        cpu: 10m
        memory: 64Mi
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 2Gi

# Node Exporter
nodeExporter:
  enabled: true

# Kube State Metrics
kubeStateMetrics:
  enabled: true

# Disable components you might not need in staging
kubeEtcd:
  enabled: false
kubeScheduler:
  enabled: false
kubeControllerManager:
  enabled: false

View File

@@ -9,7 +9,7 @@ global:
 # Prometheus configuration
 prometheus:
   prometheusSpec:
-    retention: 30d
+    retention: 7d
     storageSpec:
       volumeClaimTemplate:
         spec:
@@ -17,21 +17,7 @@ prometheus:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 50Gi
-
-    # External services monitoring (Scaleway managed services)
-    additionalScrapeConfigs:
-      - job_name: 'scaleway-redis'
-        static_configs:
-          - targets: ['redis-endpoint:6379']
-        metrics_path: /metrics
-        scrape_interval: 30s
-      - job_name: 'scaleway-postgresql'
-        static_configs:
-          - targets: ['postgres-endpoint:5432']
-        metrics_path: /metrics
-        scrape_interval: 30s
+              storage: 5Gi
 
     # Resource limits
     resources:
@@ -48,7 +34,7 @@ grafana:
   persistence:
     enabled: true
     storageClassName: scw-bssd
-    size: 10Gi
+    size: 2Gi
 
   # Resource limits
   resources:
@@ -97,7 +83,7 @@ alertmanager:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 10Gi
+              storage: 1Gi
     resources:
       requests:
         memory: 128Mi