Health Checks Runbook
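Operational procedures for verifying system health. Run these checks after deployments or when investigating issues. The commands assume kubectl is pointed at the target cluster and that KUBE_INGRESS_BASE_DOMAIN is exported in the current shell.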
Attic Cache
# HTTP health check
# -f makes curl exit non-zero on HTTP errors and -S keeps error messages
# visible despite -s. The ":?" expansion aborts the command instead of
# requesting "https://nix-cache./..." when KUBE_INGRESS_BASE_DOMAIN is unset
# or empty.
curl -fsS "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?export KUBE_INGRESS_BASE_DOMAIN first}/nix-cache-info"
# Expected: StoreDir: /nix/store, WantMassQuery: 1, Priority: 40
# Pod status
kubectl get pods -n nix-cache -l app.kubernetes.io/name=attic
# API server logs (last 50 lines; --prefix tags each line with its source pod
# so output from several matching pods can be told apart)
kubectl logs -n nix-cache -l app.kubernetes.io/name=attic --tail=50 --prefix
# GC worker status
kubectl get pods -n nix-cache -l app.kubernetes.io/component=gc
Bazel Remote Cache
# gRPC health (from within cluster)
# The fullstorydev/grpcurl image already uses grpcurl as its entrypoint, and
# "kubectl run ... -- ARGS" passes ARGS as container arguments, so only the
# grpcurl flags and operands belong after "--". --restart=Never runs the check
# as a one-shot pod, which --rm deletes once it finishes.
kubectl run --rm -it grpc-check --image=fullstorydev/grpcurl --restart=Never \
  -- -plaintext bazel-cache.nix-cache.svc.cluster.local:9092 \
  grpc.health.v1.Health/Check
# Status endpoint through a bounded local port-forward
# The tunnel runs in the background; its PID is kept so it can be stopped
# as soon as the request is done instead of lingering on port 18080.
kubectl port-forward -n nix-cache svc/bazel-cache 18080:8080 >/dev/null &
pf_pid=$!
# Retry briefly because the tunnel may not be listening yet when curl starts.
curl -fsS --retry 5 --retry-delay 1 --retry-connrefused http://localhost:18080/status
kill "$pf_pid"
# Confirm S3-backed mode in recent logs
# NOTE(review): if this message is only logged at startup it may fall outside
# the last 20 lines on a long-running pod; widen --tail if grep prints nothing.
kubectl logs -n nix-cache deploy/bazel-cache --tail=20 | grep 'Using S3 backend'
Runners
# GitHub ARC: runner pods (with node/IP columns) and their autoscaling runner sets
kubectl --namespace arc-runners get pods --output wide
kubectl --namespace arc-runners get autoscalingrunnersets
# GitLab: runner pods (with node/IP columns) and their HPAs
kubectl --namespace gitlab-runners get pods --output wide
kubectl --namespace gitlab-runners get hpa
# Scripted runner health check (path is relative, so run it from the
# directory that contains scripts/)
./scripts/runner-health-check.sh
PostgreSQL (CNPG)
# Cluster status
kubectl get cluster -n nix-cache
# Expected: status.phase = "Cluster in healthy state"
# The queries below read the primary pod name from PRIMARY_POD. A literal
# <primary-pod> placeholder cannot be pasted as-is because the shell parses
# "<" and ">" as redirections. CloudNativePG labels the primary instance, so
# the name can usually be resolved with (TODO confirm the label on this cluster):
#   PRIMARY_POD=$(kubectl get pods -n nix-cache -l cnpg.io/instanceRole=primary -o name)
# -it is omitted because "psql -c" runs a single statement non-interactively.
# Replication lag
kubectl exec -n nix-cache "${PRIMARY_POD:?set PRIMARY_POD to the CNPG primary pod}" -- psql -c "SELECT * FROM pg_stat_replication;"
# Connection count
kubectl exec -n nix-cache "${PRIMARY_POD:?set PRIMARY_POD to the CNPG primary pod}" -- psql -c "SELECT count(*) FROM pg_stat_activity;"
RustFS (S3-Compatible Storage)
# Deployments, pods and services in nix-cache whose listing mentions rustfs
# (grep also drops the column header lines)
kubectl --namespace nix-cache get deploy,pod,svc | grep rustfs
# Storage usage, queried with mc from inside the attic-rustfs-openebs deployment
# NOTE(review): assumes the container ships mc with an alias named "local" — confirm
kubectl --namespace nix-cache exec deploy/attic-rustfs-openebs -- mc admin info local
Dashboard
# Pod status
kubectl get pods -n runner-dashboard
# HTTP health
# -f turns HTTP error responses into a non-zero exit and -S keeps curl's
# error message visible despite -s. The ":?" expansion refuses to run against
# "https://dashboard./" when KUBE_INGRESS_BASE_DOMAIN is unset or empty.
curl -fsS "https://dashboard.${KUBE_INGRESS_BASE_DOMAIN:?export KUBE_INGRESS_BASE_DOMAIN first}/"
Full Stack Health Check
# Run the automated health check script
# The URL is quoted so it always reaches the script as a single argument, and
# the ":?" expansion stops the command when KUBE_INGRESS_BASE_DOMAIN is unset
# or empty instead of checking "https://nix-cache.".
./scripts/health-check.sh \
  -u "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?export KUBE_INGRESS_BASE_DOMAIN first}" \
  -n nix-cache -v