From fa452e4934297649921b4a7330328d7827c447de Mon Sep 17 00:00:00 2001
From: Josako
Date: Tue, 30 Sep 2025 14:56:08 +0200
Subject: [PATCH 1/2] - Change manifests for Prometheus installation
 - Change instructions for deploying the Prometheus stack and Pushgateway
 - Add grouping to Pushgateway pushes to avoid overwriting of metrics across
   different pods / processes
 - Bugfix to ensure correct retrieval of CSS and JS files in eveai_app
---
 common/utils/business_event.py                 |  16 +++-
 common/utils/template_filters.py               |  39 +++++---
 config/config.py                               |   4 +
 config/static-manifest/manifest.json           |   4 +-
 documentation/PUSHGATEWAY_GROUPING.md          | 120 +++++++++++++++++++
 .../Production Setup/cluster-install.md        |   5 +-
 .../eveai-chat-workers/deployment.yaml         |   8 ++
 .../eveai-entitlements/deployment.yaml         |   8 ++
 .../backend/eveai-workers/deployment.yaml      |   8 ++
 .../frontend/eveai-api/deployment.yaml         |   8 ++
 .../frontend/eveai-app/deployment.yaml         |   8 ++
 .../eveai-chat-client/deployment.yaml          |   8 ++
 .../base/monitoring/prometheus-values.yaml     |  71 ------------
 .../base/monitoring/values-monitoring.yaml     |  22 +---
 14 files changed, 220 insertions(+), 109 deletions(-)
 create mode 100644 documentation/PUSHGATEWAY_GROUPING.md
 delete mode 100644 scaleway/manifests/base/monitoring/prometheus-values.yaml

diff --git a/common/utils/business_event.py b/common/utils/business_event.py
index a505a89..2d42329 100644
--- a/common/utils/business_event.py
+++ b/common/utils/business_event.py
@@ -559,12 +559,24 @@ class BusinessEvent:
         self._log_buffer = []
 
     def _push_to_gateway(self):
-        # Push metrics to the gateway
+        # Push metrics to the gateway with a grouping key to avoid overwrites across pods/processes
         try:
+            # Determine the grouping labels
+            pod_name = current_app.config.get('POD_NAME', current_app.config.get('COMPONENT_NAME', 'dev'))
+            pod_namespace = current_app.config.get('POD_NAMESPACE', current_app.config.get('FLASK_ENV', 'dev'))
+            worker_id = str(os.getpid())
+
+            grouping_key = {
+                'instance': pod_name,
+                'namespace': pod_namespace,
+                'process': worker_id,
+            }
+
             push_to_gateway(
                 current_app.config['PUSH_GATEWAY_URL'],
                 job=current_app.config['COMPONENT_NAME'],
-                registry=REGISTRY
+                registry=REGISTRY,
+                grouping_key=grouping_key,
             )
         except Exception as e:
             current_app.logger.error(f"Failed to push metrics to Prometheus Push Gateway: {e}")
diff --git a/common/utils/template_filters.py b/common/utils/template_filters.py
index 8eff017..ca7106a 100644
--- a/common/utils/template_filters.py
+++ b/common/utils/template_filters.py
@@ -110,28 +110,39 @@ def get_pagination_html(pagination, endpoint, **kwargs):
 def asset_url(logical_path: str):
     """
     Resolve an asset logical path to a hashed URL using Parcel manifest when available.
-    Fallback to the original logical path under /static/ if manifest is missing.
+    Return a URL that respects STATIC_URL (CDN) when configured; otherwise serve from /static/.
 
     Examples:
-    - asset_url('dist/chat-client.js') -> '/static/dist/chat-client.abc123.js'
-    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css'
+    - asset_url('dist/chat-client.js') -> 'https://cdn/.../dist/chat-client.abc123.js' (when STATIC_URL is set)
+    - asset_url('dist/chat-client.css') -> '/static/dist/chat-client.def456.css' (when STATIC_URL is not set)
     """
     if not logical_path:
         return logical_path
     try:
         from common.utils.asset_manifest import resolve_asset
-        resolved = resolve_asset(logical_path)
-        if not resolved:
-            return f"/static/{logical_path.lstrip('/')}"
-        # If resolved is already an absolute URL starting with /static or http(s), return as is
-        if resolved.startswith('/static/') or resolved.startswith('http://') or resolved.startswith('https://'):
+        # Resolve the logical path to a (possibly hashed) asset path
+        resolved = resolve_asset(logical_path) or logical_path
+
+        # If the manifest returns an absolute URL, return it as-is
+        if resolved.startswith('http://') or resolved.startswith('https://'):
             return resolved
-        # If it starts with 'dist/', prefix /static/
-        if resolved.startswith('dist/'):
-            return '/static/' + resolved
-        # Otherwise, best effort: ensure it lives under /static/
-        return '/static/' + resolved.lstrip('/')
+
+        # Normalize: strip any leading '/static/' and any leading '/'
+        if resolved.startswith('/static/'):
+            rel = resolved[len('/static/'):]
+        else:
+            rel = resolved.lstrip('/')
+
+        # Build against STATIC_URL when configured
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        if static_base:
+            return f"{static_base}/{rel}"
+        # Fall back to the app's own /static/ route
+        return f"/static/{rel}"
     except Exception:
-        return f"/static/{logical_path.lstrip('/')}"
+        # Conservative fallback that also respects STATIC_URL
+        static_base = (current_app.config.get('STATIC_URL') or '').rstrip('/')
+        rel = logical_path.lstrip('/')
+        return f"{static_base}/{rel}" if static_base else f"/static/{rel}"
 
 
 def register_filters(app):
diff --git a/config/config.py b/config/config.py
index a384a96..f3d9737 100644
--- a/config/config.py
+++ b/config/config.py
@@ -439,6 +439,10 @@ class StagingConfig(Config):
     MINIO_SECRET_KEY = environ.get('MINIO_SECRET_KEY')
     MINIO_USE_HTTPS = True
 
+    # Pushgateway grouping labels (injected via the Kubernetes Downward API)
+    POD_NAME = environ.get('POD_NAME')
+    POD_NAMESPACE = environ.get('POD_NAMESPACE')
+
 
 class ProdConfig(Config):
     DEVELOPMENT = False
diff --git a/config/static-manifest/manifest.json b/config/static-manifest/manifest.json
index a28e9ca..e401916 100644
--- a/config/static-manifest/manifest.json
+++ b/config/static-manifest/manifest.json
@@ -1,6 +1,6 @@
 {
-  "dist/chat-client.js": "dist/chat-client.25888758.js",
-  "dist/chat-client.css": "dist/chat-client.eef0ef31.css",
+  "dist/chat-client.js": "dist/chat-client.6bfbd765.js",
+  "dist/chat-client.css": "dist/chat-client.33f904ba.css",
   "dist/main.js": "dist/main.f3dde0f6.js",
   "dist/main.css": "dist/main.c40e57ad.css"
 }
\ No newline at end of file
diff --git a/documentation/PUSHGATEWAY_GROUPING.md b/documentation/PUSHGATEWAY_GROUPING.md
new file mode 100644
index 0000000..515f2df
--- /dev/null
+++ b/documentation/PUSHGATEWAY_GROUPING.md
@@ -0,0 +1,120 @@
+# Pushgateway Grouping Keys (instance, namespace, process)
+
+Goal: prevent metrics pushed by different Pods or worker processes from overwriting each other, while keeping Prometheus/Grafana queries simple.
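+
+A minimal sketch of such a push (assuming the prometheus_client package and a
+reachable gateway; the metric, job, and env-var defaults here are illustrative):
+
+```python
+import os
+
+from prometheus_client import CollectorRegistry, Counter, push_to_gateway
+
+# Use a dedicated registry so we only push what this process owns.
+registry = CollectorRegistry()
+# prometheus_client appends "_total" on exposition -> eveai_llm_calls_total.
+calls = Counter('eveai_llm_calls', 'Number of LLM calls', registry=registry)
+calls.inc()
+
+# Each distinct (job, grouping_key) pair is a separate group in the
+# Pushgateway, so two pods (or two worker processes inside one pod) can
+# push the same metric names without overwriting each other.
+push_to_gateway(
+    os.getenv('PUSH_GATEWAY_URL', 'http://localhost:9091'),
+    job='eveai-app',  # illustrative job name
+    registry=registry,
+    grouping_key={
+        'instance': os.getenv('POD_NAME', 'dev'),
+        'namespace': os.getenv('POD_NAMESPACE', 'dev'),
+        'process': str(os.getpid()),
+    },
+)
+```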
+
+Summary of decisions
+- WORKER_ID source = OS process ID (PID)
+- Always include namespace in grouping labels
+
+What this changes
+- Every push to the Prometheus Pushgateway now includes a grouping_key with:
+  - instance = POD_NAME (falls back to COMPONENT_NAME, then "dev")
+  - namespace = POD_NAMESPACE (falls back to FLASK_ENV, then "dev")
+  - process = WORKER_ID (the current OS PID)
+- Prometheus will expose these as exported_instance, exported_namespace, and exported_process on the scraped series.
+
+Code changes (already implemented)
+- common/utils/business_event.py
+  - push_to_gateway(..., grouping_key={instance, namespace, process})
+  - Safe fallbacks ensure dev/test (Podman) keeps working without K8s-specific env vars.
+
+Kubernetes manifests (already implemented)
+- All Deployments that push metrics set env vars via the Downward API:
+  - POD_NAME from metadata.name
+  - POD_NAMESPACE from metadata.namespace
+- Files updated:
+  - scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
+  - scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
+  - scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
+  - scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
+
+No changes needed to secrets
+- PUSH_GATEWAY_HOST/PORT remain provided via eveai-secrets; the code composes PUSH_GATEWAY_URL internally.
+
+How to verify
+1) Pushgateway contains per-pod/process groups
+   - Port-forward the Pushgateway (namespace monitoring):
+     - kubectl -n monitoring port-forward svc/monitoring-pushgateway-prometheus-pushgateway 9091:9091
+   - Inspect (the API wraps the groups in a "data" array):
+     - curl -s http://127.0.0.1:9091/api/v1/metrics | jq '.data[].labels'
+   - You should see labels including job (your service), instance (pod), namespace, and process (pid).
+
+2) Prometheus shows the labels as exported_*
+   - Port-forward Prometheus (namespace monitoring):
+     - kubectl -n monitoring port-forward svc/monitoring-prometheus 9090:9090
+   - Example queries (PromQL; label_values(...) is the equivalent Grafana variable query):
+     - count by (exported_instance) (eveai_llm_calls_total)
+     - count by (exported_namespace) (eveai_llm_calls_total)
+     - count by (exported_process) (eveai_llm_calls_total)
+
+PromQL query patterns
+- Hide per-process detail by aggregating away exported_process:
+  - sum by (exported_job, exported_instance, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+- Service-level totals (hide instance and process):
+  - sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+- Histogram example (p95 per service):
+  - histogram_quantile(0.95, sum by (le, exported_job, exported_namespace) (rate(eveai_llm_duration_seconds_bucket[5m])))
+
+Dev/Test (Podman) behavior
+- No Kubernetes Downward API: POD_NAME/POD_NAMESPACE are not set.
+- Fallbacks used by the code:
+  - instance = COMPONENT_NAME if available, else "dev"
+  - namespace = FLASK_ENV if available, else "dev"
+  - process = current PID
+- This guarantees no crashes and still avoids process-level overwrites.
+
+Operational notes
+- Cardinality: adding process creates more series (one per worker). This is required to avoid data loss when multiple workers push concurrently. Dashboards should aggregate away exported_process unless you need per-worker detail; see the recording-rule sketch below.
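+  As a sketch, that aggregation can be captured once in a Prometheus recording
+  rule instead of being repeated in every dashboard (rule-file YAML; group and
+  rule names here are illustrative):
+
+  ```yaml
+  groups:
+    - name: eveai-pushgateway-aggregation
+      rules:
+        # Service-level rate with per-instance/per-process detail aggregated away.
+        - record: job_namespace:eveai_llm_calls:rate5m
+          expr: sum by (exported_job, exported_namespace) (rate(eveai_llm_calls_total[5m]))
+  ```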
+- Batch jobs (future): use the same grouping and consider delete_from_gateway on successful completion to remove stale groups for that job/instance/process.
+
+Troubleshooting
+- If you still see overwriting:
+  - Confirm that instance, namespace, and process all appear in the Pushgateway JSON labels for each group.
+  - Ensure that all pods set POD_NAME and POD_NAMESPACE (kubectl -n eveai-staging exec <pod> -- env | egrep "POD_NAME|POD_NAMESPACE").
+  - Verify that your app processes run push_to_gateway through the shared business_event wrapper.
+
+Change log reference
+- Implemented on 2025-09-26 by adding a grouping_key to the business_event push and env vars to the Deployments.
diff --git a/documentation/Production Setup/cluster-install.md b/documentation/Production Setup/cluster-install.md
index 97974ad..7186662 100644
--- a/documentation/Production Setup/cluster-install.md
+++ b/documentation/Production Setup/cluster-install.md
@@ -119,7 +119,7 @@ helm search repo prometheus-community/kube-prometheus-stack
 
 #### Create Monitoring Values File
 
-Create `scaleway/manifests/base/monitoring/prometheus-values.yaml`:
+Create `scaleway/manifests/base/monitoring/values-monitoring.yaml`:
 
 #### Deploy Monitoring Stack
 
@@ -133,7 +133,8 @@ helm install monitoring prometheus-community/kube-prometheus-stack \
 # Install pushgateway
 helm install monitoring-pushgateway prometheus-community/prometheus-pushgateway \
   -n monitoring --create-namespace \
-  --set serviceMonitor.enabled=true
+  --set serviceMonitor.enabled=true \
+  --set serviceMonitor.additionalLabels.release=monitoring
 
 # Monitor deployment progress
 kubectl get pods -n monitoring -w
diff --git a/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
index 26b9bcf..73a2c4e 100644
--- a/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-chat-workers/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           volumeMounts:
             - name: logs-volume
               mountPath: /app/logs
diff --git a/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
index 1b6a6cc..93cea7b 100644
--- a/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-entitlements/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "100m"
diff --git a/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml b/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
index 7c220c1..016e00e 100644
--- a/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
+++ b/scaleway/manifests/base/applications/backend/eveai-workers/deployment.yaml
@@ -54,6 +54,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
            requests:
              cpu: "200m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
index 2b785ad..431bfe7 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-api/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
index 7c0e414..e35e13a 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-app/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml b/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
index 7103b92..26c64eb 100644
--- a/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
+++ b/scaleway/manifests/base/applications/frontend/eveai-chat-client/deployment.yaml
@@ -49,6 +49,14 @@ spec:
                 name: eveai-secrets
                 key: PUSH_GATEWAY_PORT
                 optional: true
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
           resources:
             requests:
               cpu: "150m"
diff --git a/scaleway/manifests/base/monitoring/prometheus-values.yaml b/scaleway/manifests/base/monitoring/prometheus-values.yaml
deleted file mode 100644
index 03d393f..0000000
--- a/scaleway/manifests/base/monitoring/prometheus-values.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-# prometheus-values.yaml
-# Global settings
-fullnameOverride: "monitoring"
-
-# Prometheus configuration
-prometheus:
-  prometheusSpec:
-    retention: 15d
-    resources:
-      limits:
-        cpu: 500m
-        memory: 2Gi
-      requests:
-        cpu: 100m
-        memory: 512Mi
-    storageSpec:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 10Gi
-
-# Grafana configuration
-grafana:
-  enabled: true
-  adminPassword: "admin123"  # Change this for production
-  resources:
-    limits:
-      cpu: 200m
-      memory: 256Mi
-    requests:
-      cpu: 50m
-      memory: 128Mi
-  persistence:
-    enabled: true
-    size: 2Gi
-
-# AlertManager configuration
-alertmanager:
-  alertmanagerSpec:
-    resources:
-      limits:
-        cpu: 100m
-        memory: 256Mi
-      requests:
-        cpu: 10m
-        memory: 64Mi
-    storage:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 2Gi
-
-# Node Exporter
-nodeExporter:
-  enabled: true
-
-# Kube State Metrics
-kubeStateMetrics:
-  enabled: true
-
-# Disable components you might not need in staging
-kubeEtcd:
-  enabled: false
-kubeScheduler:
-  enabled: false
-kubeControllerManager:
-  enabled: false
\ No newline at end of file
diff --git a/scaleway/manifests/base/monitoring/values-monitoring.yaml b/scaleway/manifests/base/monitoring/values-monitoring.yaml
index 2bbd408..9e3c704 100644
--- 
a/scaleway/manifests/base/monitoring/values-monitoring.yaml
+++ b/scaleway/manifests/base/monitoring/values-monitoring.yaml
@@ -9,7 +9,7 @@ global:
 # Prometheus configuration
 prometheus:
   prometheusSpec:
-    retention: 30d
+    retention: 7d
     storageSpec:
       volumeClaimTemplate:
         spec:
@@ -17,21 +17,7 @@ prometheus:
           accessModes: ["ReadWriteOnce"]
           resources:
             requests:
-              storage: 50Gi
-
-  # External services monitoring (Scaleway managed services)
-  additionalScrapeConfigs:
-    - job_name: 'scaleway-redis'
-      static_configs:
-        - targets: ['redis-endpoint:6379']
-      metrics_path: /metrics
-      scrape_interval: 30s
-
-    - job_name: 'scaleway-postgresql'
-      static_configs:
-        - targets: ['postgres-endpoint:5432']
-      metrics_path: /metrics
-      scrape_interval: 30s
+              storage: 5Gi
 
     # Resource limits
     resources:
@@ -48,7 +34,7 @@ grafana:
   persistence:
     enabled: true
     storageClassName: scw-bssd
-    size: 10Gi
+    size: 2Gi
 
   # Resource limits
   resources:
@@ -97,7 +83,7 @@ alertmanager:
         accessModes: ["ReadWriteOnce"]
         resources:
           requests:
-            storage: 10Gi
+            storage: 1Gi
     resources:
       requests:
         memory: 128Mi

From 030d1b0e9078b0b3a80097117e7cceed4468c21c Mon Sep 17 00:00:00 2001
From: Josako
Date: Tue, 30 Sep 2025 14:58:08 +0200
Subject: [PATCH 2/2] - Cleanup script for leftover monitoring resources
---
 scaleway/clean-monitoring.sh | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100755 scaleway/clean-monitoring.sh

diff --git a/scaleway/clean-monitoring.sh b/scaleway/clean-monitoring.sh
new file mode 100755
index 0000000..2546621
--- /dev/null
+++ b/scaleway/clean-monitoring.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# 1. Remove all ClusterRoles from the previous installation
+kubectl delete clusterrole monitoring-grafana-clusterrole --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-operator --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-prometheus-prometheus --ignore-not-found=true
+kubectl delete clusterrole monitoring-kube-state-metrics --ignore-not-found=true
+
+# 2. Remove the ClusterRoleBindings
+kubectl delete clusterrolebinding monitoring-grafana-clusterrolebinding --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-operator --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-prometheus-prometheus --ignore-not-found=true
+kubectl delete clusterrolebinding monitoring-kube-state-metrics --ignore-not-found=true
+
+# 3. Remove any leftover webhook configurations
+kubectl delete mutatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
+kubectl delete validatingwebhookconfiguration monitoring-kube-prometheus-admission --ignore-not-found=true
+
+# 4. Check for other monitoring resources
+kubectl get clusterroles | grep monitoring
+kubectl get clusterrolebindings | grep monitoring
+
+# 5. If any resources remain, delete them:
+kubectl get clusterroles | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrole
+kubectl get clusterrolebindings | grep monitoring | awk '{print $1}' | xargs -r kubectl delete clusterrolebinding
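+
+# 6. Sanity check (illustrative): confirm no monitoring RBAC remains before reinstalling
+kubectl get clusterroles,clusterrolebindings -o name | grep monitoring || echo "monitoring RBAC clean"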