GloriousFlywheel

Health Checks Runbook

Operational procedures for verifying system health. Run these checks after deployments or when investigating issues.

Attic Cache

# HTTP health check
# -f makes curl exit non-zero on HTTP 4xx/5xx and -S keeps the error visible,
# so a failing cache is not mistaken for an empty-but-healthy response.
# ${VAR:?} aborts the command when the ingress domain is not set instead of
# silently probing "https://nix-cache./nix-cache-info".
curl -fsS "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?KUBE_INGRESS_BASE_DOMAIN must be set}/nix-cache-info"
# Expected: StoreDir: /nix/store, WantMassQuery: 1, Priority: 40

# Pod status
kubectl get pods -n nix-cache -l app.kubernetes.io/name=attic

# API server logs (last 50 lines)
kubectl logs -n nix-cache -l app.kubernetes.io/name=attic --tail=50

# GC worker status
kubectl get pods -n nix-cache -l app.kubernetes.io/component=gc

Bazel Remote Cache

# gRPC health (from within cluster)
# The fullstorydev/grpcurl image already uses grpcurl as its entrypoint, and
# `kubectl run ... -- <args>` only supplies arguments to that entrypoint, so
# the binary name must not be repeated after `--`.
# --restart=Never creates a one-shot pod that --rm deletes once the check exits.
kubectl run --rm -it grpc-check --restart=Never --image=fullstorydev/grpcurl \
  -- -plaintext bazel-cache.nix-cache.svc.cluster.local:9092 \
  grpc.health.v1.Health/Check

# Status endpoint through a bounded local port-forward
# Remember the forwarder's PID so it can be stopped afterwards rather than
# left running and holding local port 18080.
kubectl port-forward -n nix-cache svc/bazel-cache 18080:8080 &
pf_pid=$!
# The tunnel needs a moment to come up; retry on "connection refused" instead
# of racing it. -f/-S surface HTTP and transport errors.
curl -fsS --retry 5 --retry-delay 1 --retry-connrefused localhost:18080/status
kill "$pf_pid"

# Confirm S3-backed mode in recent logs
kubectl logs -n nix-cache deploy/bazel-cache --tail=20 | grep 'Using S3 backend'

Runners

# GitHub ARC runners: pods (wide output), then the autoscaling runner sets
kubectl --namespace=arc-runners get pods --output=wide
kubectl --namespace=arc-runners get autoscalingrunnersets

# GitLab runners: pods (wide output), then the HPAs
kubectl --namespace=gitlab-runners get pods --output=wide
kubectl --namespace=gitlab-runners get hpa

# Runner health check script shipped under ./scripts
./scripts/runner-health-check.sh

PostgreSQL (CNPG)

# Cluster status
kubectl get cluster -n nix-cache
# Expected: status.phase = "Cluster in healthy state"

# Resolve the primary pod once instead of pasting a "<primary-pod>"
# placeholder, which the shell would parse as redirections.
# Set PRIMARY_POD beforehand to override the lookup.
# NOTE(review): assumes a single CNPG Cluster in nix-cache whose
# status.currentPrimary names the primary pod -- confirm for this deployment.
PRIMARY_POD=${PRIMARY_POD:-$(kubectl get cluster -n nix-cache \
  -o jsonpath='{.items[0].status.currentPrimary}')}

# Replication lag
# No -it: `psql -c` is non-interactive, so no TTY or stdin needs attaching.
kubectl exec -n nix-cache "${PRIMARY_POD:?could not determine primary pod}" -- \
  psql -c "SELECT * FROM pg_stat_replication;"

# Connection count
kubectl exec -n nix-cache "${PRIMARY_POD:?could not determine primary pod}" -- \
  psql -c "SELECT count(*) FROM pg_stat_activity;"

RustFS (S3-Compatible Storage)

# Deployments, pods and services in nix-cache, filtered to lines naming rustfs
kubectl --namespace=nix-cache get deploy,pod,svc | grep rustfs

# Storage usage via `mc admin info` run inside the RustFS deployment
# NOTE(review): presumes `mc` and a "local" alias exist in that container -- confirm.
kubectl --namespace=nix-cache exec deploy/attic-rustfs-openebs -- mc admin info local

Dashboard

# Pod status
kubectl get pods -n runner-dashboard

# HTTP health
# -f exits non-zero on HTTP 4xx/5xx and -S keeps the error message visible,
# so an error page is not read as "healthy". The URL is quoted and the
# ingress domain must be set, otherwise the command aborts before any request.
curl -fsS "https://dashboard.${KUBE_INGRESS_BASE_DOMAIN:?KUBE_INGRESS_BASE_DOMAIN must be set}/"

Full Stack Health Check

# Run the automated health check script
# The URL is quoted and the ingress domain is required, so the script is
# never handed a malformed "https://nix-cache." target when the variable
# is unset.
./scripts/health-check.sh -u "https://nix-cache.${KUBE_INGRESS_BASE_DOMAIN:?KUBE_INGRESS_BASE_DOMAIN must be set}" \
  -n nix-cache -v