Health Checks Runbook
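Operational procedures for verifying system health. Run these checks after deployments or when investigating issues. The commands assume kubectl access to the cluster, that the KUBE_INGRESS_BASE_DOMAIN environment variable is set to the cluster's ingress base domain, and that they are run from the repository root (for the ./scripts/ helpers).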
Attic Cache
# HTTP health check
# -f turns an HTTP 4xx/5xx response into a non-zero exit status and -S keeps
# the error message visible even though -s silences the progress meter.
# The :? guard aborts with a clear message instead of silently requesting
# "https://nix-cache./nix-cache-info" when the domain variable is unset.
curl -fsS "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?set KUBE_INGRESS_BASE_DOMAIN first}/nix-cache-info"
# Expected: StoreDir: /nix/store, WantMassQuery: 1, Priority: 40
# Pod status
kubectl get pods -n attic-cache -l app.kubernetes.io/name=attic
# API server logs (last 50 lines)
kubectl logs -n attic-cache -l app.kubernetes.io/name=attic --tail=50
# GC worker status
kubectl get pods -n attic-cache -l app.kubernetes.io/component=gc
Bazel Remote Cache
# gRPC health (from within cluster)
# The fullstorydev/grpcurl image already uses grpcurl as its ENTRYPOINT, and
# everything after `--` is passed as container *args* (not as the command),
# so the binary name must not be repeated: doing so would execute
# `grpcurl grpcurl -plaintext ...` and fail on argument parsing.
# --restart=Never makes this a one-shot pod that --rm removes once it exits.
kubectl run --rm -it grpc-check --restart=Never --image=fullstorydev/grpcurl \
  -- -plaintext bazel-cache.attic-cache.svc.cluster.local:9092 \
  grpc.health.v1.Health/Check
# Metrics endpoint
# The port-forward runs in the background; remember its PID so it can be
# stopped afterwards instead of leaving local port 8080 occupied.
kubectl port-forward -n attic-cache svc/bazel-cache 8080:8080 &
pf_pid=$!
# The tunnel needs a moment to come up, so retry refused connections rather
# than racing it. -f/-S surface HTTP and connection errors.
curl -fsS --retry 5 --retry-delay 1 --retry-connrefused localhost:8080/metrics | grep bazel_remote
kill "$pf_pid"
Runners
# GitHub ARC: runner pods (wide output) and the autoscaling runner sets
kubectl --namespace arc-runners get pods --output wide
kubectl --namespace arc-runners get autoscalingrunnersets
# GitLab: runner pods (wide output) and their HPAs
kubectl --namespace gitlab-runners get pods --output wide
kubectl --namespace gitlab-runners get hpa
# Scripted runner health check (path is relative to the current directory)
./scripts/runner-health-check.sh
PostgreSQL (CNPG)
# Cluster status
kubectl get cluster -n attic-cache
# Expected: status.phase = "Cluster in healthy state"
# The queries below must run on the primary instance. Export PRIMARY_POD with
# that pod's name first (NOTE(review): the cluster listing above presumably
# names the current primary -- confirm). A variable is used rather than an
# angle-bracket placeholder because the shell would parse a pasted
# `<primary-pod>` as input/output redirections; the :? guard aborts with a
# clear message when the variable is unset.
# Replication lag
kubectl exec -n attic-cache -it "${PRIMARY_POD:?set PRIMARY_POD to the primary pod name}" -- psql -c "SELECT * FROM pg_stat_replication;"
# Connection count
kubectl exec -n attic-cache -it "${PRIMARY_POD:?set PRIMARY_POD to the primary pod name}" -- psql -c "SELECT count(*) FROM pg_stat_activity;"
MinIO (S3 Storage)
# Tenant status
kubectl get tenants -n attic-cache
# Storage usage
# Export MINIO_POD with the name of a MinIO pod in the namespace first. A
# variable is used rather than an angle-bracket placeholder because the shell
# would parse a pasted `<minio-pod>` as input/output redirections; the :?
# guard aborts with a clear message when the variable is unset.
# NOTE(review): relies on an mc alias named "local" being configured inside
# the pod -- confirm.
kubectl exec -n attic-cache -it "${MINIO_POD:?set MINIO_POD to a MinIO pod name}" -- mc admin info local
Dashboard
# Pod status
kubectl get pods -n runner-dashboard
# HTTP health
# -f makes an HTTP 4xx/5xx response fail the command and -S keeps the error
# message visible despite -s; the :? guard stops an unset domain from
# silently producing the URL "https://dashboard./".
curl -fsS "https://dashboard.${KUBE_INGRESS_BASE_DOMAIN:?set KUBE_INGRESS_BASE_DOMAIN first}/"
Full Stack Health Check
# Run the automated health check script
# -u receives the cache base URL and -n the namespace; -v is passed through
# unchanged (NOTE(review): presumably verbose output -- confirm in the script).
# The URL is quoted and guarded so that an unset domain aborts with a clear
# message instead of handing the script the truncated "https://nix-cache.".
./scripts/health-check.sh \
  -u "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?set KUBE_INGRESS_BASE_DOMAIN first}" \
  -n attic-cache -v