Files
eveAI/k8s/dev/setup-dev-cluster.sh
Josako 9c63ecb17f - Added metrics service
- Application services start up, except eveai_chat_client
- Connectivity to admin / eveai_app is not functional
2025-08-20 11:49:19 +02:00

495 lines
17 KiB
Bash
Executable File

#!/bin/bash
# Setup script for the EveAI dev Kind cluster.
# File: setup-dev-cluster.sh
# Abort on the first failing command.
set -e

echo "🚀 Setting up EveAI Dev Kind Cluster..."

# Name used for `kind create/delete/get` and for the "<name>-control-plane"
# container/node. readonly: nothing in this script may reassign it.
readonly CLUSTER_NAME="eveai-dev-cluster"

# ANSI color codes for the print_* helpers below (readonly constants).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
# Helpers for colored console output.
# Blue [INFO] line on stdout.
print_status() {
  local message=$1
  echo -e "${BLUE}[INFO]${NC} ${message}"
}
# Green [SUCCESS] line on stdout.
print_success() {
  local message=$1
  echo -e "${GREEN}[SUCCESS]${NC} ${message}"
}
# Yellow [WARNING] line on stdout.
print_warning() {
  local message=$1
  echo -e "${YELLOW}[WARNING]${NC} ${message}"
}
# Red [ERROR] line on stdout.
print_error() {
  local message=$1
  echo -e "${RED}[ERROR]${NC} ${message}"
}
# Verify that every external tool this script needs is on PATH.
# Exits 1 on the first missing tool.
check_prerequisites() {
  print_status "Checking prerequisites..."
  if ! command -v kind &> /dev/null; then
    print_error "kind is not installed. Please install kind first."
    echo "Install via: go install sigs.k8s.io/kind@latest"
    exit 1
  fi
  if ! command -v kubectl &> /dev/null; then
    print_error "kubectl is not installed. Please install kubectl first."
    exit 1
  fi
  if ! command -v podman &> /dev/null; then
    print_error "podman is not installed. Please install podman first."
    exit 1
  fi
  if ! command -v envsubst &> /dev/null; then
    print_error "envsubst is not installed. Please install envsubst first"
    # Bug fix: previously this branch fell through without exiting, so the
    # script continued and only failed later at the envsubst call in
    # create_cluster.
    exit 1
  fi
  print_success "All prerequisites are installed"
}
# Prepare the host-side directories that back the cluster's persistent volumes.
create_host_directories() {
  print_status "Creating host directories for persistent storage..."
  BASE_DIR="$HOME/k8s-data/dev"
  local subdir dir
  for subdir in minio logs prometheus grafana certs; do
    dir="$BASE_DIR/$subdir"
    if [ -d "$dir" ]; then
      print_status "Directory already exists: $dir"
    else
      mkdir -p "$dir"
      print_status "Created directory: $dir"
    fi
  done
  # Set proper permissions
  # chmod -R 755 "$BASE_DIR"
  print_success "Host directories created and configured"
}
# Create the Kind cluster (or reuse an existing one after confirmation).
# The Kind config is first expanded with envsubst so it can reference
# environment variables.
create_cluster() {
  print_status "Creating Kind cluster..."
  # -x: exact-line match, so a similarly named cluster (e.g. with a suffix)
  # is not mistaken for ours.
  if kind get clusters | grep -qx "${CLUSTER_NAME}"; then
    print_warning "Cluster '${CLUSTER_NAME}' already exists"
    echo -n "Do you want to delete and recreate it? (y/N): "
    read -r response
    if [[ "$response" =~ ^[Yy]$ ]]; then
      print_status "Deleting existing cluster..."
      kind delete cluster --name "${CLUSTER_NAME}"
    else
      print_status "Using existing cluster"
      return 0
    fi
  fi
  KIND_CONFIG="kind-dev-cluster.yaml"
  if [ ! -f "${KIND_CONFIG}" ]; then
    print_error "Config '${KIND_CONFIG}' not found in $(pwd)"
    exit 1
  fi
  print_status "Creating new Kind cluster with configuration..."
  # Expand environment variables in the config (note: --suffix is GNU mktemp).
  EXPANDED_CONFIG="$(mktemp --suffix=.yaml)"
  # Clean the temp file up on ANY exit path: with `set -e` a failing
  # `kind create` previously skipped the `rm` and leaked the file.
  trap 'rm -f "${EXPANDED_CONFIG}"' EXIT
  envsubst < "${KIND_CONFIG}" > "${EXPANDED_CONFIG}"
  # Preferred: run inside a user systemd scope with explicit cgroup delegation.
  if command -v systemd-run >/dev/null 2>&1; then
    systemd-run --scope --user -p "Delegate=yes" \
      env KIND_EXPERIMENTAL_PROVIDER=podman \
      kind create cluster --name "${CLUSTER_NAME}" --config "${EXPANDED_CONFIG}"
  else
    # Fallback without a delegated scope; may fail when delegation is missing.
    print_warning "Start zonder systemd-run scope; kan mislukken bij ontbrekende delegatie."
    kind create cluster --name "${CLUSTER_NAME}" --config "${EXPANDED_CONFIG}"
  fi
  # Cluster is up: remove the expanded config and drop the cleanup trap.
  rm -f "${EXPANDED_CONFIG}"
  trap - EXIT
  print_status "Waiting for cluster to be ready..."
  kubectl wait --for=condition=Ready nodes --all --timeout=300s
  # Make the node trust the local registry CA, then restart containerd so it
  # picks the certificates up. Node name derived from ${CLUSTER_NAME} instead
  # of being hardcoded.
  if command -v podman &> /dev/null; then
    podman exec "${CLUSTER_NAME}-control-plane" update-ca-certificates
    podman exec "${CLUSTER_NAME}-control-plane" systemctl restart containerd
  else
    docker exec "${CLUSTER_NAME}-control-plane" update-ca-certificates
    docker exec "${CLUSTER_NAME}-control-plane" systemctl restart containerd
  fi
  print_success "Kind cluster created successfully"
}
# Verify the container runtime (CRI) inside the Kind node is healthy and
# wait (up to ~5 minutes) for the node to report Ready.
verify_cri_status() {
  print_status "Verifying CRI status..."
  # Give containerd/kubelet a moment to settle after the restart in create_cluster.
  sleep 15
  # Test CRI connectivity via crictl on the node.
  if podman exec "${CLUSTER_NAME}-control-plane" crictl version &>/dev/null; then
    print_success "CRI is functional"
    print_status "CRI version information:"
    podman exec "${CLUSTER_NAME}-control-plane" crictl version
  else
    print_error "CRI is not responding - checking containerd logs"
    podman exec "${CLUSTER_NAME}-control-plane" journalctl -u containerd --no-pager -n 20
    print_error "Checking kubelet logs"
    podman exec "${CLUSTER_NAME}-control-plane" journalctl -u kubelet --no-pager -n 10
    return 1
  fi
  print_status "Waiting for node to become Ready..."
  local max_attempts=30
  local attempt=0
  while [ $attempt -lt $max_attempts ]; do
    # -w: match "Ready" only as a whole word. The previous plain
    # `grep -q "Ready"` also matched a "NotReady" node, falsely
    # reporting readiness.
    if kubectl get nodes | grep -qw "Ready"; then
      print_success "Node is Ready"
      return 0
    fi
    attempt=$((attempt + 1))
    print_status "Attempt $attempt/$max_attempts - waiting for node readiness..."
    sleep 10
  done
  print_error "Node failed to become Ready within timeout"
  kubectl get nodes -o wide
  return 1
}
# Install the NGINX Ingress Controller (Kind flavor) and wait until it is
# ready, falling back to labeling the node when scheduling is blocked.
install_ingress_controller() {
  print_status "Installing NGINX Ingress Controller..."
  kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.8.1/deploy/static/provider/kind/deploy.yaml
  print_status "Waiting for Ingress Controller to be ready..."
  # Test the wait directly in the `if`: with `set -e` a bare failing
  # `kubectl wait` aborted the whole script before `$?` could be inspected,
  # so the node-labeling fallback below was unreachable dead code.
  if kubectl wait --namespace ingress-nginx \
    --for=condition=ready pod \
    --selector=app.kubernetes.io/component=controller \
    --timeout=300s; then
    print_success "NGINX Ingress Controller installed and ready"
  else
    print_warning "Ingress Controller not ready, trying to label node..."
    # Label the node for ingress (fallback for scheduling issues).
    kubectl label node "${CLUSTER_NAME}-control-plane" ingress-ready=true --overwrite
    print_status "Waiting for Ingress Controller after node labeling..."
    if kubectl wait --namespace ingress-nginx \
      --for=condition=ready pod \
      --selector=app.kubernetes.io/component=controller \
      --timeout=300s; then
      print_success "NGINX Ingress Controller ready after node labeling"
    else
      print_error "Failed to install or start Ingress Controller even after node labeling"
      exit 1
    fi
  fi
  # Show final controller status for the operator.
  print_status "Ingress Controller status:"
  kubectl get pods -n ingress-nginx
  kubectl get services -n ingress-nginx
}
# Raise resource limits on the ingress-nginx controller using a local patch
# manifest, retrying the patch to ride out API-server races.
patch_ingress_resources() {
  print_status "Patching Ingress Controller resources..."
  # Give the deployment a moment to be fully created before patching.
  sleep 5
  local patch_file="ingress-nginx-resources-patch.yaml"
  if [[ ! -f "$patch_file" ]]; then
    print_error "Patch file not found: $patch_file"
    return 1
  fi
  print_status "Updating resource limits for ingress-nginx-controller using manifest file..."
  # Apply patch with retry logic.
  local max_attempts=5
  local attempt=1
  local success=false
  while [ $attempt -le $max_attempts ] && [ "$success" = false ]; do
    print_status "Attempt $attempt/$max_attempts - patching ingress controller resources..."
    if kubectl patch deployment ingress-nginx-controller -n ingress-nginx --patch-file "$patch_file"; then
      print_success "Successfully patched ingress-nginx-controller resources"
      success=true
    else
      if [ $attempt -lt $max_attempts ]; then
        print_warning "Patch attempt $attempt failed, retrying in 5 seconds..."
        sleep 5
        attempt=$((attempt + 1))
      else
        print_error "Failed to patch ingress-nginx-controller resources after $max_attempts attempts"
        return 1
      fi
    fi
  done
  print_status "Waiting for ingress controller rollout to complete..."
  # Test the rollout directly in the `if`: with `set -e` a bare failing
  # `kubectl rollout status` aborted the script before `$?` was read,
  # making the error branch below unreachable.
  if kubectl rollout status deployment/ingress-nginx-controller -n ingress-nginx --timeout=300s; then
    print_success "Ingress Controller resource patch completed successfully"
    # Verify the new resource settings.
    print_status "Verifying new resource settings..."
    kubectl describe deployment ingress-nginx-controller -n ingress-nginx | grep -A 10 "Limits:\|Requests:" || true
  else
    print_error "Ingress Controller rollout failed"
    return 1
  fi
}
# Install metrics-server and patch it for Kind compatibility (the patch
# manifest is expected to disable kubelet TLS verification).
install_metrics_server() {
  print_status "Installing Metrics Server..."
  kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
  local patch_file="metrics-server-patch.yaml"
  if [[ ! -f "$patch_file" ]]; then
    print_error "Patch file not found: $patch_file"
    return 1
  fi
  print_status "Patching Metrics Server for Kind compatibility using manifest file..."
  # Wait (up to ~60s) for the metrics-server deployment object to exist.
  local max_wait=30
  local wait_count=0
  while ! kubectl get deployment metrics-server -n kube-system &> /dev/null; do
    if [ $wait_count -ge $max_wait ]; then
      print_error "Metrics server deployment not found after waiting"
      return 1
    fi
    sleep 2
    wait_count=$((wait_count + 1))
  done
  # Apply the patch; failure is tolerated (best effort).
  if kubectl patch deployment metrics-server -n kube-system --patch-file "$patch_file"; then
    print_success "Successfully patched metrics-server configuration"
  else
    print_warning "Failed to patch metrics-server, but continuing..."
  fi
  print_status "Waiting for Metrics Server to be ready..."
  # Test the wait directly in the `if`: with `set -e` a bare failing
  # `kubectl wait` aborted the whole script before `$?` could be inspected,
  # so the intended warning-and-continue branch below never ran.
  if kubectl wait --for=condition=available deployment/metrics-server -n kube-system --timeout=300s; then
    print_success "Metrics Server installed and ready"
    print_status "Testing metrics server..."
    sleep 10 # Give metrics server time to collect initial metrics
    if kubectl top nodes &> /dev/null; then
      print_success "Metrics Server is working correctly"
    else
      print_warning "Metrics Server installed but may need more time to collect metrics"
    fi
  else
    print_warning "Metrics Server installation completed but readiness check failed"
  fi
}
# Apply the base Kubernetes manifests, retrying each one a few times to
# ride out transient races (e.g. the namespace not yet being registered).
apply_manifests() {
  print_status "Applying Kubernetes manifests..."
  # Order matters: the namespace must exist before namespaced resources.
  manifests=(
    "namespace.yaml"
    "persistent-volumes.yaml"
    "config-secrets.yaml"
    "network-policies.yaml"
  )
  local manifest attempt
  for manifest in "${manifests[@]}"; do
    if [ ! -f "$manifest" ]; then
      print_warning "Manifest $manifest not found, skipping..."
      continue
    fi
    print_status "Applying $manifest..."
    for attempt in 1 2 3; do
      if kubectl apply -f "$manifest"; then
        print_success "Successfully applied: $manifest"
        break
      elif [ "$attempt" -lt 3 ]; then
        print_warning "Attempt $attempt failed for $manifest, retrying in 3 seconds..."
        sleep 3
      else
        print_error "Failed to apply $manifest after 3 attempts"
        return 1
      fi
    done
  done
  print_success "Base manifests applied successfully"
}
# Trust the local registry CA inside the Kind node and point containerd at
# it via a hosts.toml. Each step uses `kubectl debug node/... -- sh -c` with
# chroot /host to run commands directly on the node; every step is
# best-effort (a failure only prints a warning).
configure_registry_certificates() {
print_status "Configuring registry certificates and containerd..."
# Re-run update-ca-certificates on the node; errors are tolerated because
# the CA bundle may already be installed.
print_status "Updating CA certificates..."
kubectl debug node/eveai-dev-cluster-control-plane -it --image=busybox -- sh -c "
chroot /host update-ca-certificates 2>/dev/null || true
" 2>/dev/null || print_warning "Certificate update may have failed"
# containerd looks up per-registry config under /etc/containerd/certs.d/<host>/.
print_status "Creating containerd registry configuration..."
kubectl debug node/eveai-dev-cluster-control-plane -it --image=busybox -- sh -c "
chroot /host mkdir -p /etc/containerd/certs.d/registry.ask-eve-ai-local.com
" 2>/dev/null || print_warning "Failed to create containerd config directory"
# Write hosts.toml: the registry endpoint, its allowed capabilities, and the
# mkcert CA to trust. The inner (escaped) heredoc is expanded on the node,
# inside the chroot'ed shell — do not re-indent these lines.
print_status "Configuring registry hosts.toml..."
kubectl debug node/eveai-dev-cluster-control-plane -it --image=busybox -- sh -c "
chroot /host sh -c 'cat > /etc/containerd/certs.d/registry.ask-eve-ai-local.com/hosts.toml << EOF
server = \"https://registry.ask-eve-ai-local.com\"
[host.\"https://registry.ask-eve-ai-local.com\"]
capabilities = [\"pull\", \"resolve\"]
ca = [\"/usr/local/share/ca-certificates/mkcert-ca.crt\"]
EOF'
" 2>/dev/null || print_warning "Failed to create hosts.toml"
# containerd only reads certs.d configuration on startup, so restart it.
print_status "Restarting containerd..."
kubectl debug node/eveai-dev-cluster-control-plane -it --image=busybox -- sh -c "
chroot /host systemctl restart containerd
" 2>/dev/null || print_warning "Failed to restart containerd"
print_success "Registry certificates and containerd configured"
}
# Print a summary of cluster state and smoke-test access to the private
# registry from inside the cluster.
verify_cluster() {
  print_status "Verifying cluster status..."
  # Dump the main resource overviews, one labeled section at a time.
  local section
  for section in "Cluster nodes:|get nodes" "Namespaces:|get namespaces" "Persistent volumes:|get pv"; do
    print_status "${section%%|*}"
    kubectl ${section#*|}
  done
  # A server-side dry run forces the API server to resolve the image, which
  # exercises registry connectivity without actually starting a pod.
  print_status "Testing registry connectivity..."
  if kubectl run test-registry --image=registry.ask-eve-ai-local.com/josakola/nginx:latest --dry-run=server &> /dev/null; then
    print_success "Registry is accessible from cluster"
    kubectl delete pod test-registry --ignore-not-found=true &> /dev/null || true
  else
    print_warning "Registry connectivity test failed - this might be expected if images aren't pushed yet"
  fi
}
# Orchestrate the full cluster setup, then print follow-up instructions.
main() {
  echo "=================================================="
  echo "🏗️ EveAI Dev Kind Cluster Setup"
  echo "=================================================="
  check_prerequisites
  create_host_directories
  create_cluster
  verify_cri_status
  install_ingress_controller
  patch_ingress_resources
  install_metrics_server
  apply_manifests
  configure_registry_certificates
  verify_cluster
  echo ""
  echo "=================================================="
  print_success "EveAI Dev Kind Cluster setup completed!"
  echo "=================================================="
  # Usage hints; quoted heredoc so nothing inside is expanded.
  cat <<'EOF'

📋 Next steps:
1. Deploy your application services using: ./deploy-all-services.sh
2. Access services via Ingress: http://minty.ask-eve-ai-local.com:3080

🔧 Useful commands:
 kubectl config current-context # Verify you're using the right cluster
 kubectl get all -n eveai-dev # Check all resources in dev namespace
 kubectl get ingress -n eveai-dev # Check Ingress resources
 kind delete cluster --name eveai-dev-cluster # Delete cluster when done

📊 Service Access (via Ingress):
 - Main App: http://minty.ask-eve-ai-local.com:3080/admin/
 - API: http://minty.ask-eve-ai-local.com:3080/api/
 - Chat Client: http://minty.ask-eve-ai-local.com:3080/chat-client/
 - Static Files: http://minty.ask-eve-ai-local.com:3080/static/
EOF
}
# Run main function
# Forward all CLI arguments to main (none are currently consumed).
main "$@"