Health Checks Runbook

Health Checks Runbook

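Operational procedures for verifying system health. Run these checks after deployments or when investigating issues. The commands assume kubectl is pointed at the target cluster and that KUBE_INGRESS_BASE_DOMAIN is set in your shell; replace angle-bracket placeholders (for example, the primary pod name) with real values before running.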

Attic Cache

# HTTP health check
curl -s https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN}/nix-cache-info
# Expected: StoreDir: /nix/store, WantMassQuery: 1, Priority: 40

# Pod status
kubectl get pods -n attic-cache -l app.kubernetes.io/name=attic

# API server logs (last 50 lines per matching pod)
kubectl logs -n attic-cache -l app.kubernetes.io/name=attic --tail=50

# GC worker status
kubectl get pods -n attic-cache -l app.kubernetes.io/component=gc

Bazel Remote Cache

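# gRPC health (from within cluster)
# The fullstorydev/grpcurl image already uses grpcurl as its entrypoint, so only
# the arguments are passed after "--"; --restart=Never makes this a one-shot pod.
kubectl run --rm -it grpc-check --image=fullstorydev/grpcurl --restart=Never \
  -- -plaintext bazel-cache.attic-cache.svc.cluster.local:9092 \
  grpc.health.v1.Health/Check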

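# Metrics endpoint
kubectl port-forward -n attic-cache svc/bazel-cache 8080:8080 &
sleep 2  # give the port-forward a moment to start listening
curl -s localhost:8080/metrics | grep bazel_remote
kill $!  # stop the background port-forward when done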

Runners

# GitHub ARC runners
kubectl get pods -n arc-runners -o wide
kubectl get autoscalingrunnersets -n arc-runners

# GitLab runners
kubectl get pods -n gitlab-runners -o wide
kubectl get hpa -n gitlab-runners

# Runner health check script
./scripts/runner-health-check.sh

PostgreSQL (CNPG)

# Cluster status
kubectl get cluster -n attic-cache
# Expected: status.phase = "Cluster in healthy state"

# Replication lag
kubectl exec -n attic-cache -it <primary-pod> -- psql -c "SELECT * FROM pg_stat_replication;"

# Connection count
kubectl exec -n attic-cache -it <primary-pod> -- psql -c "SELECT count(*) FROM pg_stat_activity;"

MinIO (S3 Storage)

# Tenant status
kubectl get tenants -n attic-cache

# Storage usage
kubectl exec -n attic-cache -it <minio-pod> -- mc admin info local

Dashboard

# Pod status
kubectl get pods -n runner-dashboard

# HTTP health
curl -s https://dashboard.${KUBE_INGRESS_BASE_DOMAIN}/

Full Stack Health Check

# Run the automated health check script
./scripts/health-check.sh -u https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN} \
  -n attic-cache -v

GloriousFlywheel