diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8b08219..57e7c94 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -176,6 +176,15 @@ jobs: # 167 legitimate tests for no observable behavior change. The # Test__ form remains the # recommended pattern for parameterized scenarios, but is not gated. + # Phase 4 DEPL-* prerequisite (2026-05-14): helm-templates-lint.sh + # needs the `helm` CLI on PATH to run helm lint + helm template + # against the chart. The official azure/setup-helm action installs + # a SHA-pinned helm binary into the runner. + - name: Install Helm (for helm-templates-lint guard) + uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 + with: + version: v3.16.0 + - name: Regression guards (extracted to scripts/ci-guards/) # All named regression guards live at scripts/ci-guards/.sh per # ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally: diff --git a/cmd/server/main.go b/cmd/server/main.go index 8eb4b67..197d0e4 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -55,6 +55,26 @@ import ( ) func main() { + // Phase 4 DEPL-M1 closure (2026-05-14): --migrate-only flag for + // the Helm pre-install/pre-upgrade hook (see + // deploy/helm/certctl/templates/migration-job.yaml). When set, the + // server loads config, opens the DB pool, runs migrations + seed, + // and exits — no HTTP listener, no scheduler, no signing work. + // Same migration code path as boot-time RunMigrations; only the + // surrounding lifecycle differs. + // + // Hand-parsed (instead of pulling in flag.Parse) because the rest + // of the server's config surface is env-var driven via + // config.Load(); adding a flag.Parse() with global state risks + // conflicting with other binaries that import cmd/server later. 
+ migrateOnly := false + for _, arg := range os.Args[1:] { + if arg == "--migrate-only" { + migrateOnly = true + break + } + } + // Load configuration cfg, err := config.Load() if err != nil { @@ -146,13 +166,37 @@ func main() { defer db.Close() logger.Info("connected to database") - // Run migrations - logger.Info("running migrations", "path", cfg.Database.MigrationsPath) - if err := postgres.RunMigrations(db, cfg.Database.MigrationsPath); err != nil { - logger.Error("failed to run migrations", "error", err) - os.Exit(1) + // Phase 4 DEPL-M1 closure (2026-05-14): migration-via-hook posture. + // + // Three lifecycles to support: + // (a) Compose / VM / bare-metal: server runs migrations at boot. + // Default behavior — preserved unchanged. + // (b) Helm with pre-install/pre-upgrade hook: the migration Job + // runs `certctl-server --migrate-only`, does its work, and + // exits. The server Deployment's pods then start with + // CERTCTL_MIGRATIONS_VIA_HOOK=true set; they see the env + // var and skip their boot-time RunMigrations call so the + // Job's work isn't duplicated. + // (c) Bare `certctl-server --migrate-only` invocation (e.g. + // operator running a one-shot migration from the CLI): + // runs migrations + seed and exits cleanly. No HTTP + // listener, no scheduler, no signing work. + // + // migrateOnly captures case (c); CERTCTL_MIGRATIONS_VIA_HOOK + // captures case (b). Both paths converge on the same RunMigrations + // + RunSeed code below. 
+ migrationsViaHook := strings.EqualFold(os.Getenv("CERTCTL_MIGRATIONS_VIA_HOOK"), "true") + + if migrateOnly || !migrationsViaHook { + logger.Info("running migrations", "path", cfg.Database.MigrationsPath) + if err := postgres.RunMigrations(db, cfg.Database.MigrationsPath); err != nil { + logger.Error("failed to run migrations", "error", err) + os.Exit(1) + } + logger.Info("migrations completed") + } else { + logger.Info("skipping migrations at boot (CERTCTL_MIGRATIONS_VIA_HOOK=true — Helm pre-install/pre-upgrade hook owns this work)") } - logger.Info("migrations completed") // Apply baseline seed data. // @@ -166,12 +210,28 @@ func main() { // server runs RunMigrations above, then this RunSeed call lands the // baseline data — all from a single source of truth (this binary). // See internal/repository/postgres/db.go::RunSeed for the contract. - logger.Info("applying baseline seed", "path", cfg.Database.MigrationsPath) - if err := postgres.RunSeed(db, cfg.Database.MigrationsPath); err != nil { - logger.Error("failed to apply seed data", "error", err) - os.Exit(1) + // + // Phase 4 DEPL-M1: same migration-via-hook gating as RunMigrations. + // When the hook owns migrations it also owns the seed pass. + if migrateOnly || !migrationsViaHook { + logger.Info("applying baseline seed", "path", cfg.Database.MigrationsPath) + if err := postgres.RunSeed(db, cfg.Database.MigrationsPath); err != nil { + logger.Error("failed to apply seed data", "error", err) + os.Exit(1) + } + logger.Info("seed completed") + } else { + logger.Info("skipping baseline seed at boot (CERTCTL_MIGRATIONS_VIA_HOOK=true — hook applies seed alongside migrations)") + } + + // Phase 4 DEPL-M1: --migrate-only early-exit. Migrations + seed are + // done; the operator only asked for the migration pass. Skip the + // HTTP listener, scheduler, signing setup, banner, etc. Exit 0 + // cleanly so Kubernetes Job lifecycle reports success. 
+ if migrateOnly { + logger.Info("--migrate-only: migrations + seed complete; exiting without starting server lifecycle") + os.Exit(0) } - logger.Info("seed completed") // Apply demo overlay seed when CERTCTL_DEMO_SEED=true. Pre-U-3 the demo // overlay (deploy/docker-compose.demo.yml) mounted seed_demo.sql into diff --git a/deploy/helm/certctl/templates/backup-cronjob.yaml b/deploy/helm/certctl/templates/backup-cronjob.yaml new file mode 100644 index 0000000..08ea9ce --- /dev/null +++ b/deploy/helm/certctl/templates/backup-cronjob.yaml @@ -0,0 +1,178 @@ +{{- /* +Phase 4 DEPL-H2 closure (2026-05-14): opt-in Helm CronJob for +PostgreSQL backups. + +OPERATOR OPT-IN. Default `backup.enabled: false`. Turning it on +requires: + - In-cluster Postgres (this CronJob does NOT cover managed DB + services — for AWS RDS / GCP CloudSQL / Azure DB rely on the + provider's PITR). + - A sink choice (PVC or S3) configured in values.yaml. + - For S3: a Secret holding AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY + (or use a service account with IRSA on EKS). + +The pg_dump invocation matches the canonical shape documented in +docs/operator/runbooks/postgres-backup.md so a manual run and a +CronJob run use the same dump format and options: + + pg_dump --format=custom --no-owner --no-acl --dbname=certctl + +For sink choices beyond PVC + S3 (GCS, Azure Blob, NFS, restic, etc.), +extend the `aws s3 cp` line below. The Job is intentionally minimal — +it does ONE thing (capture + ship), not orchestrate retention or +rotation. Off-host retention is the sink's responsibility (S3 lifecycle +rules, PVC snapshot retention on the storage class, etc.). +*/ -}} +{{- if .Values.backup.enabled }} +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "certctl.fullname" . }}-postgres-backup + labels: + {{- include "certctl.labels" . 
| nindent 4 }} + app.kubernetes.io/component: postgres-backup +spec: + schedule: {{ .Values.backup.schedule | quote }} + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: {{ .Values.backup.successfulJobsHistoryLimit | default 3 }} + failedJobsHistoryLimit: {{ .Values.backup.failedJobsHistoryLimit | default 1 }} + startingDeadlineSeconds: {{ .Values.backup.startingDeadlineSeconds | default 300 }} + jobTemplate: + spec: + backoffLimit: {{ .Values.backup.backoffLimit | default 1 }} + activeDeadlineSeconds: {{ .Values.backup.activeDeadlineSeconds | default 3600 }} + template: + metadata: + labels: + {{- include "certctl.labels" . | nindent 12 }} + app.kubernetes.io/component: postgres-backup + spec: + restartPolicy: Never + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 12 }} + {{- end }} + serviceAccountName: {{ include "certctl.serviceAccountName" . }} + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + runAsNonRoot: true + fsGroup: 1000 + containers: + - name: backup + image: {{ .Values.backup.image | default "postgres:16-alpine" | quote }} + imagePullPolicy: {{ .Values.backup.imagePullPolicy | default "IfNotPresent" | quote }} + env: + - name: PGHOST + value: {{ include "certctl.fullname" . }}-postgres + - name: PGPORT + value: {{ .Values.postgresql.service.port | default 5432 | quote }} + - name: PGUSER + valueFrom: + secretKeyRef: + name: {{ include "certctl.fullname" . }}-postgres + key: username + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: {{ include "certctl.fullname" . }}-postgres + key: password + - name: PGDATABASE + valueFrom: + secretKeyRef: + name: {{ include "certctl.fullname" . }}-postgres + key: database + {{- if eq (.Values.backup.sink | default "pvc") "s3" }} + # S3 sink — operator provides AWS credentials via the + # Secret referenced in backup.s3.credentialsSecret. 
The + # credentials need s3:PutObject + s3:ListBucket on the + # target bucket only; least-privilege per industry + # standard. + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ .Values.backup.s3.credentialsSecret.name | quote }} + key: {{ .Values.backup.s3.credentialsSecret.accessKeyIdKey | default "AWS_ACCESS_KEY_ID" }} + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.backup.s3.credentialsSecret.name | quote }} + key: {{ .Values.backup.s3.credentialsSecret.secretAccessKeyKey | default "AWS_SECRET_ACCESS_KEY" }} + {{- with .Values.backup.s3.region }} + - name: AWS_DEFAULT_REGION + value: {{ . | quote }} + {{- end }} + {{- end }} + command: + - /bin/sh + - -ceu + - | + # Phase 4 DEPL-H2: canonical pg_dump shape per + # docs/operator/runbooks/postgres-backup.md. + # Custom-format compressed dump, no ownership / + # ACL embedded — produces a portable artifact + # restorable into any Postgres ≥ source major + # via `pg_restore -d certctl `. + set -euo pipefail + TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)" + DUMP_FILE="/tmp/certctl-${TIMESTAMP}.dump" + + echo "[backup-cronjob] capturing dump at ${TIMESTAMP}" + pg_dump --format=custom --no-owner --no-acl --dbname="${PGDATABASE}" \ + > "${DUMP_FILE}" + + # Integrity check — pg_restore --list parses the + # dump's table-of-contents; a corrupt dump fails + # here without shipping garbage off-host. Same + # check the manual runbook performs. + echo "[backup-cronjob] verifying dump integrity" + pg_restore --list "${DUMP_FILE}" > /dev/null + + {{- if eq (.Values.backup.sink | default "pvc") "s3" }} + # S3 sink — requires aws-cli. The default + # postgres:16-alpine image does NOT include + # aws-cli; operators MUST set + # backup.image to an image that bundles both + # (e.g. ghcr.io/your-org/postgres-aws:16) OR + # override backup.command to install aws-cli at + # runtime. The line below assumes the image has + # `aws` on PATH. 
+ S3_PATH="{{ .Values.backup.s3.bucket }}/{{ .Values.backup.s3.prefix | default "certctl" }}/certctl-${TIMESTAMP}.dump" + echo "[backup-cronjob] uploading to s3://${S3_PATH}" + aws s3 cp "${DUMP_FILE}" "s3://${S3_PATH}" + rm -f "${DUMP_FILE}" + {{- else }} + # PVC sink — dump lands at /backups/certctl-${TIMESTAMP}.dump + # mounted from backup.pvc.claimName. Retention is the + # PVC's responsibility (storage-class snapshot lifecycle + # or a separate cleanup CronJob). The Job moves the + # file from /tmp to /backups atomically; never + # writes partial dumps into the durable mount. + FINAL_PATH="/backups/certctl-${TIMESTAMP}.dump" + echo "[backup-cronjob] persisting to ${FINAL_PATH}" + mv "${DUMP_FILE}" "${FINAL_PATH}" + {{- end }} + echo "[backup-cronjob] done" + {{- if ne (.Values.backup.sink | default "pvc") "s3" }} + volumeMounts: + - name: backups + mountPath: /backups + {{- end }} + resources: + {{- toYaml (.Values.backup.resources | default dict) | nindent 16 }} + {{- if ne (.Values.backup.sink | default "pvc") "s3" }} + volumes: + - name: backups + persistentVolumeClaim: + claimName: {{ .Values.backup.pvc.claimName | quote }} + {{- end }} + {{- with .Values.nodeAffinity }} + affinity: + nodeAffinity: + {{- toYaml . | nindent 14 }} + {{- end }} + {{- with .Values.backup.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/certctl/templates/migration-job.yaml b/deploy/helm/certctl/templates/migration-job.yaml new file mode 100644 index 0000000..3205fcf --- /dev/null +++ b/deploy/helm/certctl/templates/migration-job.yaml @@ -0,0 +1,89 @@ +{{- /* +Phase 4 DEPL-M1 closure (2026-05-14): Helm pre-install / pre-upgrade +hook that runs Postgres migrations before the server Deployment rolls. + +Pre-DEPL-M1, postgres.RunMigrations was invoked at server boot +(cmd/server/main.go:151) as the only migration path. 
That works for +Compose deployments but conflicts with Kubernetes rolling deploys: +when a new server image lands with a schema change, multiple replicas +race the migration during the rollout. The hook resolves the race by +running migrations OUT OF BAND, exactly once, before any new server +pod starts. + +How it works: + - The Job ships the same certctl-server image as the Deployment, so + the migration code path is binary-identical to the boot-time path. + - It runs `certctl-server --migrate-only` (a flag the cmd/server + main process must support — see cmd/server/main.go for the flag + parse + early-exit path). + - The CERTCTL_MIGRATIONS_VIA_HOOK=true env var is ALSO set on the + server Deployment (via values.yaml). When the server boots, it + sees this env var and skips its own RunMigrations call — the + hook already did the work. Compose deploys don't set the env + var, so they keep the boot-time path unchanged. + - hook-delete-policy hook-succeeded,before-hook-creation means the + Job is deleted automatically on success; a failed Job is kept for + operator diagnosis until the next install/upgrade recreates it. + - The hook-weight (-5) orders this Job ahead of higher-weight pre-install/pre-upgrade + hooks. Helm runs hooks before the release's regular resources are applied, + so a StatefulSet without a hook annotation is created only after the Job + completes. NOTE(review): confirm the database is reachable on first install. + +Operators on Compose: this hook is a no-op for you. The server still +runs migrations at boot per the existing path. +*/ -}} +{{- if .Values.migrations.viaHook }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "certctl.fullname" . }}-migrate + labels: + {{- include "certctl.labels" . 
| nindent 4 }} + app.kubernetes.io/component: migration + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation +spec: + backoffLimit: {{ .Values.migrations.backoffLimit | default 1 }} + activeDeadlineSeconds: {{ .Values.migrations.activeDeadlineSeconds | default 600 }} + template: + metadata: + labels: + {{- include "certctl.labels" . | nindent 8 }} + app.kubernetes.io/component: migration + spec: + restartPolicy: Never + serviceAccountName: {{ include "certctl.serviceAccountName" . }} + securityContext: + {{- include "certctl.podSecurityContext" .Values.server.securityContext | nindent 8 }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: migrate + image: {{ include "certctl.serverImage" . }} + imagePullPolicy: {{ .Values.server.image.pullPolicy }} + # Migration-only entrypoint. The server binary supports a + # --migrate-only flag that runs postgres.RunMigrations + + # postgres.RunSeed and exits cleanly (zero on success, + # non-zero on migration failure). See cmd/server/main.go + # for the implementation. The flag is hermetic — no HTTP + # listener starts, no scheduler ticks, no signing + # operations occur. Pure schema-mutation pass. + command: + - /app/server + - --migrate-only + env: + - name: CERTCTL_DATABASE_URL + value: {{ include "certctl.databaseURL" . 
| quote }} + - name: CERTCTL_LOG_LEVEL + value: {{ .Values.server.logging.level | default "info" | quote }} + - name: CERTCTL_LOG_FORMAT + value: {{ .Values.server.logging.format | default "json" | quote }} + resources: + {{- toYaml (.Values.migrations.resources | default .Values.server.resources) | nindent 12 }} + securityContext: + {{- include "certctl.containerSecurityContext" .Values.server.securityContext | nindent 12 }} +{{- end }} diff --git a/deploy/helm/certctl/templates/postgres-statefulset.yaml b/deploy/helm/certctl/templates/postgres-statefulset.yaml index 9ccda8e..a616dbb 100644 --- a/deploy/helm/certctl/templates/postgres-statefulset.yaml +++ b/deploy/helm/certctl/templates/postgres-statefulset.yaml @@ -9,6 +9,21 @@ metadata: spec: serviceName: {{ include "certctl.fullname" . }}-postgres replicas: 1 + # Phase 4 DEPL-M4 closure (2026-05-14): explicit StatefulSet update + + # pod-management strategies. Defaults make Postgres upgrades + # operator-controlled rather than automatic: + # updateStrategy.type: OnDelete — Postgres pods do NOT roll + # automatically when the StatefulSet spec changes. Operator + # deletes the pod explicitly after taking a backup + reviewing + # the change. Prevents an accidental Helm-template tweak from + # triggering a database restart at an awkward time. + # podManagementPolicy: OrderedReady — when scaling Postgres to + # a replica >1 (future HA work), pods come up one at a time + # and must reach Ready before the next pod is created. Aligns + # with the standard Postgres-on-Kubernetes pattern. + updateStrategy: + type: OnDelete + podManagementPolicy: OrderedReady selector: matchLabels: {{- include "certctl.postgresSelectorLabels" . 
| nindent 6 }} diff --git a/deploy/helm/certctl/templates/prometheusrules.yaml b/deploy/helm/certctl/templates/prometheusrules.yaml new file mode 100644 index 0000000..0732e19 --- /dev/null +++ b/deploy/helm/certctl/templates/prometheusrules.yaml @@ -0,0 +1,145 @@ +{{- /* +Phase 4 DEPL-L2 closure (2026-05-14): opt-in Prometheus AlertManager +rules covering the four operationally-actionable alerts every certctl +deployment wants out of the box. + +OPERATOR OPT-IN. Default `monitoring.prometheusRules.enabled: false`. +Turning it on requires Prometheus Operator CRDs (PrometheusRule kind) +to be installed in-cluster. Without them this template renders an +object Kubernetes will reject — keep the toggle off if you're scraping +with vanilla Prometheus + a Helm-installed AlertManager rules +ConfigMap instead. + +Metric names + thresholds verified against the actual +internal/api/handler/metrics.go exposition path: + - certctl_certificate_expiring_soon: server-side count of certs with + ExpiresAt in (now, now + 30d]. The 30-day window is computed in + internal/service/stats.go::GetDashboardSummary. + - certctl_agent_online: agents with heartbeat in the last 5 minutes. + A drop below certctl_agent_total signals offline agents. + - certctl_job_failed_total + certctl_job_completed_total: cumulative + counters; ratio gives the failure rate over the rate() window. + - certctl_issuance_failures_total: cumulative counter of failed + issuance attempts (renewal failures are issuance failures with a + specific error_class label). + +Adjust thresholds per fleet — the defaults below are tuned for the +demo dataset (15 certs / 1 agent) and may need raising for production +fleets with thousands of certs where a steady rate of expiring certs +is the normal operating state. +*/ -}} +{{- if and .Values.monitoring.enabled .Values.monitoring.prometheusRules.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "certctl.fullname" . 
}}-rules + labels: + {{- include "certctl.labels" . | nindent 4 }} + app.kubernetes.io/component: monitoring + {{- with .Values.monitoring.prometheusRules.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + - name: certctl.alerts + interval: {{ .Values.monitoring.prometheusRules.interval | default "60s" }} + rules: + # --------------------------------------------------------------- + # Alert: CertctlCertificateExpiringSoon + # Series: certctl_certificate_expiring_soon + # The certctl-server counts certs with ExpiresAt in + # (now, now + 30d] every metrics scrape. Fires whenever any cert + # crosses into that window — operator must triage or extend + # automation coverage. Rapid renewal infrastructure should keep + # this number small in steady state. + # --------------------------------------------------------------- + - alert: CertctlCertificateExpiringSoon + expr: certctl_certificate_expiring_soon > {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateCount | default 0 }} + for: {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateFor | default "5m" }} + labels: + severity: warning + component: certctl + annotations: + summary: "certctl: {{`{{ $value }}`}} certificate(s) expiring within 30 days" + description: >- + certctl_certificate_expiring_soon has been > {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateCount | default 0 }} + for 5+ minutes. Investigate via + /api/v1/certificates?status=expiring or the dashboard's + Expiring tab. If renewal automation should have covered + these, check the renewal scheduler logs for the cert IDs + + the per-issuer failure rate. + + # --------------------------------------------------------------- + # Alert: CertctlAgentOffline + # Series: certctl_agent_total - certctl_agent_online + # Agents flip from online → offline after 5 minutes without a + # heartbeat (internal/service/stats.go::GetDashboardSummary). 
+ # The 1h `for:` window prevents a flapping agent from paging the + # operator on every transient network blip. + # --------------------------------------------------------------- + - alert: CertctlAgentOffline + expr: (certctl_agent_total - certctl_agent_online) > {{ .Values.monitoring.prometheusRules.thresholds.offlineAgentCount | default 0 }} + for: {{ .Values.monitoring.prometheusRules.thresholds.offlineAgentFor | default "1h" }} + labels: + severity: warning + component: certctl-agent + annotations: + summary: "certctl: {{`{{ $value }}`}} agent(s) offline for >1h" + description: >- + One or more certctl-agent instances have been without a + heartbeat for over an hour. Check the agent logs on the + affected hosts. If the agent host is intentionally + decommissioned, retire the agent via the dashboard or + POST /api/v1/agents/{id}/retire to suppress this alert. + + # --------------------------------------------------------------- + # Alert: CertctlJobFailureRateHigh + # Series: certctl_job_failed_total / (certctl_job_failed_total + certctl_job_completed_total) + # Failure ratio over a 15-minute rate() window, so short bursts don't + # fire but a sustained issue does. An idle window yields 0/0 = NaN, + # which never satisfies the comparison. The 5% threshold is a + # conservative starter — adjust per fleet's baseline. + # --------------------------------------------------------------- + - alert: CertctlJobFailureRateHigh + expr: >- + ( + rate(certctl_job_failed_total[15m]) + / + (rate(certctl_job_failed_total[15m]) + rate(certctl_job_completed_total[15m])) + ) > {{ .Values.monitoring.prometheusRules.thresholds.jobFailureRate | default 0.05 }} + for: {{ .Values.monitoring.prometheusRules.thresholds.jobFailureRateFor | default "15m" }} + labels: + severity: warning + component: certctl + annotations: + summary: "certctl: job failure rate above 5% over 15m" + description: >- + The 15m rate of certctl_job_failed_total / total jobs + has been above 5% for 15+ minutes. 
Open + /api/v1/jobs?status=failed to see the failing job IDs + and root-cause the recurring error class. + + # --------------------------------------------------------------- + # Alert: CertctlIssuanceFailures + # Series: certctl_issuance_failures_total + # Any non-zero rate of issuance failures over a 15m window is + # operationally significant — a single CA outage or expired + # ACME account can cascade across the fleet. + # --------------------------------------------------------------- + - alert: CertctlIssuanceFailures + expr: rate(certctl_issuance_failures_total[15m]) > {{ .Values.monitoring.prometheusRules.thresholds.issuanceFailureRate | default 0 }} + for: {{ .Values.monitoring.prometheusRules.thresholds.issuanceFailureFor | default "15m" }} + labels: + severity: warning + component: certctl + annotations: + summary: "certctl: certificate issuance / renewal failures over 15m" + description: >- + certctl_issuance_failures_total has been incrementing + over the last 15 minutes. Check the per-issuer breakdown + via /api/v1/issuers + the failed-job log in + /api/v1/jobs?status=failed. Common causes: CA + outage, ACME account rate-limit, EAB credential + expiration, stepca provisioner key rotation without + certctl-side update. +{{- end }} diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml index 5c5f26c..5455838 100644 --- a/deploy/helm/certctl/values.yaml +++ b/deploy/helm/certctl/values.yaml @@ -31,6 +31,36 @@ server: port: 8443 # Resource requests and limits + # + # Phase 4 DEPL-M5 (2026-05-14): per-fleet-size tuning ladder. The + # default values below are validated against the demo dataset + # (15 certs / 1 agent) and the baselines in + # docs/operator/performance-baselines.md (single endpoint < 5s for + # 100 sequential requests = ~50ms p50; cursor-paginated 1000-cert + # inventory walk < 3s; renewal scan for 15 certs < 100ms). 
+ # + # Larger fleet recommendations (TBD pending Phase 8 load-test runs; + # operators tune empirically until then — capture readings in your + # own loadtest-baselines log): + # + # ≤ 500 certs / 100 agents: defaults below (100m / 128Mi req, 500m / 512Mi lim) + # 5K certs / 1K agents: tune up — TBD Phase 8 (suggested starter: 500m / 512Mi req, 2000m / 2Gi lim) + # 50K certs / 10K agents: tune up — TBD Phase 8 (suggested starter: 2000m / 2Gi req, 4000m / 4Gi lim) + # + # The "suggested starter" values above are operator-tuning starting + # points, NOT validated. Phase 8 (load test coverage expansion) will + # measure them against synthetic fleets and replace the suggestions + # with measured ceilings. Until then, treat them as a "raise CPU + # before raising memory; raise both before scaling out" mental + # model. Per docs/operator/performance-baselines.md, certctl-server + # is CPU-bound on issuance / renewal scan work and memory-bound on + # the inventory query path. + # + # Database scale (postgresql.* below) tracks server scale roughly + # 1:1 — at 50K certs the Postgres instance needs 4 CPU / 4Gi RAM + # and shared_buffers ≥ 1Gi. Postgres tuning is out of scope for + # this comment; see docs/operator/runbooks/postgres-backup.md + # for the production-tuning entry-point. resources: requests: cpu: 100m @@ -449,6 +479,26 @@ agent: replicas: 1 # Resource requests and limits + # + # Phase 4 DEPL-M5 (2026-05-14): per-fleet-size tuning ladder for the + # agent. Defaults are sized for the standard "one cert per host" + # operating pattern: the agent polls the server every 60s (default + # CERTCTL_AGENT_POLL_INTERVAL), generates ECDSA P-256 keys locally on + # issuance/renewal events, and is otherwise idle. CPU is bursty only + # during keygen + CSR submission. 
+ # + # Tuning ladder (TBD pending Phase 8 — measure on your fleet): + # + # 1 cert / host (typical): defaults below (50m / 64Mi req, 200m / 256Mi lim) + # 10 certs / host: stays at defaults — agent is poll-driven, not work-bound by cert count + # 100 certs / host (rare): raise lim to 500m / 512Mi if you see throttling on issuance bursts + # + # The agent does NOT cache certs in memory — issuance is one-shot + # generate-then-deploy. So per-host memory scales with whatever + # truststore PEM bundles the agent's connectors load (Apache / + # Postfix / similar), not with the cert count. Defaults are + # appropriate for any "agent terminates ≤ 100 certs on this host" + # deployment. resources: requests: cpu: 50m @@ -612,6 +662,149 @@ monitoring: # Optional relabeling for the scrape job. # relabelings: [] + # ---------------------------------------------------------------------- + # Phase 4 DEPL-L2 closure (2026-05-14): PrometheusRule (alert rules) + # + # Operator opt-in. Requires Prometheus Operator CRDs (the + # `monitoring.coreos.com/v1` PrometheusRule kind) installed in + # cluster. Without those CRDs the rendered object is rejected by + # `kubectl apply` — keep enabled: false if you scrape with vanilla + # Prometheus + AlertManager rules ConfigMap instead. + # + # Four starter rules ship out of the box (see + # templates/prometheusrules.yaml for the full PromQL): + # + # CertctlCertificateExpiringSoon — certs expiring within 30d + # CertctlAgentOffline — agent without heartbeat for >1h + # CertctlJobFailureRateHigh — job-failure rate over 5% (15m) + # CertctlIssuanceFailures — any issuance failures in last 15m + # + # All thresholds are operator-tunable via the `thresholds:` block + # below. The defaults are tuned for the demo dataset (15 certs / 1 + # agent); production fleets with sustained renewal volume MAY want + # to raise the expiringCertificateCount + jobFailureRate thresholds + # to suppress steady-state noise. 
+ prometheusRules: + enabled: false + # Evaluation interval for the rule group. + interval: 60s + # Additional labels applied to the PrometheusRule metadata. + # labels: {} + # Per-alert threshold / duration tunables. + thresholds: + # Fire when more than N certs are in the expiring-soon window. + expiringCertificateCount: 0 + expiringCertificateFor: 5m + # Fire when more than N agents are offline (total - online). + offlineAgentCount: 0 + offlineAgentFor: 1h + # Fire when job failure rate exceeds this fraction (15m window). + jobFailureRate: 0.05 + jobFailureRateFor: 15m + # Fire when issuance failure rate exceeds this value (15m window). + issuanceFailureRate: 0 + issuanceFailureFor: 15m + +# ============================================================================== +# Backup CronJob (Phase 4 DEPL-H2 closure, 2026-05-14) +# ============================================================================== +# Operator opt-in. Default OFF. The CronJob runs `pg_dump --format=custom +# --no-owner --no-acl --dbname=certctl` matching the canonical shape +# documented in docs/operator/runbooks/postgres-backup.md (so manual +# and automated dumps share one format and option set) and ships the result to a +# sink chosen below. +# +# DO NOT enable this for managed Postgres deployments (AWS RDS / GCP +# Cloud SQL / Azure DB) — those have built-in PITR backup that this +# CronJob cannot match. For in-cluster Postgres only. +backup: + enabled: false + # Cron expression (UTC). Default: 02:30 UTC daily. + schedule: "30 2 * * *" + # Sink: "pvc" (default — dump lands on a PersistentVolumeClaim) or + # "s3" (uploads via aws-cli — requires an image that bundles + # aws-cli, see backup.image below). + sink: pvc + # Container image. The default postgres:16-alpine has pg_dump but + # NOT aws-cli; for sink: s3 set this to an image that bundles both + # (e.g. ghcr.io/your-org/postgres-aws:16) or override the Job's + # command to install aws-cli at runtime. 
+ image: postgres:16-alpine + imagePullPolicy: IfNotPresent + # PVC sink config — used when sink: pvc. + pvc: + # Name of an existing PersistentVolumeClaim mounted at /backups + # in the Job's pod. The PVC's storage class controls durability + # and snapshot retention. Operator creates this PVC out of band + # via their own storage policy. + claimName: certctl-backups + # S3 sink config — used when sink: s3. + s3: + # Target bucket (without s3:// prefix). + bucket: "" + # Object key prefix inside the bucket. Dumps land at + # s3:////certctl-.dump. + prefix: certctl + # AWS region (sets AWS_DEFAULT_REGION). Optional if the image's + # AWS SDK can resolve the region another way (instance profile, + # IRSA, etc.). + region: "" + # Secret holding AWS credentials. The IAM principal needs + # s3:PutObject + s3:ListBucket on the target bucket only. + credentialsSecret: + name: certctl-backup-aws-creds + accessKeyIdKey: AWS_ACCESS_KEY_ID + secretAccessKeyKey: AWS_SECRET_ACCESS_KEY + # Job housekeeping. + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + startingDeadlineSeconds: 300 + backoffLimit: 1 + activeDeadlineSeconds: 3600 + # Resource budget for the backup container. pg_dump is generally + # memory-light; ~250MB RSS for fleets up to 100K certs is typical. + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + # Optional tolerations for the backup Job pod. + tolerations: [] + +# ============================================================================== +# Migrations via Helm hook (Phase 4 DEPL-M1 closure, 2026-05-14) +# ============================================================================== +# When viaHook: true, the chart deploys templates/migration-job.yaml as +# a pre-install + pre-upgrade hook that runs `certctl-server +# --migrate-only` (a hermetic schema-mutation pass) before the server +# Deployment rolls. 
+# +# Set CERTCTL_MIGRATIONS_VIA_HOOK=true in the server Deployment env to +# tell the server to skip its boot-time RunMigrations call (the hook +# already did the work; running again at boot would race across +# replicas during rollouts). +# +# Default OFF — when off, the server runs migrations at boot exactly +# as it always has (Compose deploys keep this path). +migrations: + viaHook: false + # Job housekeeping. + backoffLimit: 1 + activeDeadlineSeconds: 600 + # Resource budget for the migration Job pod. The migration pass is + # I/O-bound on Postgres; matches the server's resource budget by + # default. Override here if migrations on a large database need + # more headroom than the steady-state server. + # resources: + # requests: + # cpu: 100m + # memory: 128Mi + # limits: + # cpu: 500m + # memory: 512Mi + # ============================================================================== # Network Policy (Bundle 3 closure / D11) # ============================================================================== diff --git a/docs/operator/runbooks/prometheus-bearer-token.md b/docs/operator/runbooks/prometheus-bearer-token.md new file mode 100644 index 0000000..bb55c50 --- /dev/null +++ b/docs/operator/runbooks/prometheus-bearer-token.md @@ -0,0 +1,243 @@ +# Runbook: Prometheus bearer token for the metrics scrape endpoint + +> Last reviewed: 2026-05-14 + +Use this when: +- You're enabling Prometheus Operator scraping via the Helm chart's + `monitoring.serviceMonitor.enabled` toggle. +- Your Prometheus scrapes are returning 401 against + `/api/v1/metrics/prometheus`. +- An auditor asks "how is the metrics endpoint authenticated?" + +## The constraint + +The certctl server exposes Prometheus metrics at +`/api/v1/metrics/prometheus`. This endpoint is **RBAC-gated on the +`metrics.read` permission** (per `internal/api/router/router.go`). +Like every other gated handler, it requires an authenticated actor +holding that permission — there is no anonymous-scrape path. 
+ +The rationale: the metrics payload includes operational counters +(cert counts by status, agent counts, issuance failure rates) that +a public-facing observer should not see. Most certctl deployments +expose a reverse proxy / load balancer to the wider network; the +auth gate on `/api/v1/metrics/prometheus` prevents an external +observer from learning operational state via the metrics endpoint +even when the proxy itself is reachable. + +## What you need to set up + +Three pieces: + +1. **An API key with `metrics.read` permission** (and only that + permission — least-privilege). +2. **A Kubernetes Secret** holding that API key. +3. **`monitoring.serviceMonitor.bearerTokenSecret`** in the chart's + values pointing at the Secret. + +## Step 1: Create the metrics-read role + API key + +The chart's seed migration ships a `metrics-read` role-template, but +some operators want a dedicated identity per scrape source. Both +approaches work; the dedicated-identity path is below. + +```bash +# 1. Bootstrap or impersonate a session with auth.role.assign + +# auth.apikey.create permissions (admin actor is fine). + +# 2. Create a role with only metrics.read. +curl -sS --cacert ./ca.crt -X POST \ + -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -H "Content-Type: application/json" \ + https://certctl.your-org.example/api/v1/auth/roles \ + -d '{"id":"r-prometheus-scrape","name":"Prometheus scrape","permissions":["metrics.read"]}' + +# 3. Create an actor that holds the role. +curl -sS --cacert ./ca.crt -X POST \ + -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -H "Content-Type: application/json" \ + https://certctl.your-org.example/api/v1/auth/actors \ + -d '{"id":"actor-prometheus","name":"Prometheus scrape","roles":["r-prometheus-scrape"]}' + +# 4. Mint an API key for the actor. The response includes a +# `key_value` field that's only returned ONCE — capture it. 
+curl -sS --cacert ./ca.crt -X POST \ + -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -H "Content-Type: application/json" \ + https://certctl.your-org.example/api/v1/auth/apikeys \ + -d '{"actor_id":"actor-prometheus","name":"prometheus-scrape-token"}' \ + | tee /tmp/prom-key.json + +# Extract just the secret material: +jq -r '.key_value' /tmp/prom-key.json +``` + +The mint endpoint returns the API key plaintext exactly once. The +server stores only a constant-time-comparable hash; if you lose the +key value, mint a new one. + +## Step 2: Create the Kubernetes Secret + +```bash +NAMESPACE=certctl +API_KEY=$(jq -r '.key_value' /tmp/prom-key.json) + +kubectl create secret generic certctl-prometheus-key \ + -n "$NAMESPACE" \ + --from-literal=api-key="$API_KEY" +``` + +Now scrub the temporary file: + +```bash +shred -u /tmp/prom-key.json +``` + +## Step 3: Wire the Secret into the chart values + +In your `values.yaml` (or `--set` overrides): + +```yaml +monitoring: + enabled: true + serviceMonitor: + enabled: true + interval: 30s + scrapeTimeout: 10s + bearerTokenSecret: + name: certctl-prometheus-key + key: api-key +``` + +Re-apply the chart: + +```bash +helm upgrade certctl . -n "$NAMESPACE" --reuse-values +``` + +The rendered ServiceMonitor will now include the `bearerTokenSecret` +block. Prometheus Operator's reconciler picks it up and injects the +bearer token into the scrape request. + +## Verification + +```bash +# 1. Confirm the ServiceMonitor renders with the secret reference +kubectl get servicemonitor -n "$NAMESPACE" certctl-server -o yaml \ + | grep -A2 bearerTokenSecret + +# Expected: +# bearerTokenSecret: +# name: certctl-prometheus-key +# key: api-key + +# 2. Tail the certctl-server logs for the next ~60 seconds (one +# Prometheus scrape interval). Look for incoming GET /metrics/prometheus +# requests authenticated successfully — no 401s. 
+kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/component=server \ + --tail=100 -f | grep -E "GET /api/v1/metrics/prometheus|metrics-scrape" + +# 3. From the Prometheus UI's "Targets" page, the certctl-server +# target should be UP and last-scrape-error empty. If it's +# showing 401, the bearer token isn't reaching the request — see +# troubleshooting below. +``` + +## Troubleshooting + +### Prometheus target shows 401 + +Three possible causes: + +1. **Wrong Secret name / key.** Run + `kubectl get secret -n "$NAMESPACE" certctl-prometheus-key -o yaml` + and confirm the `data.api-key` field exists with a base64-encoded + non-empty value. The Secret's data field name must match the + `bearerTokenSecret.key` value in `monitoring.serviceMonitor`. +2. **API key doesn't have `metrics.read`.** Hit the gating endpoint + manually from inside the cluster with the same key: + ```bash + kubectl run --rm -it --image=curlimages/curl debug -- \ + curl -sS -H "Authorization: Bearer <api-key>" \ + https://certctl-server.certctl.svc.cluster.local:8443/api/v1/metrics/prometheus + ``` + By standard HTTP semantics, a 401 here means the key itself was + rejected (mis-copied, revoked, or never minted). A 403 means the + key authenticated but its actor does not hold `metrics.read` — + re-check the role assignment from Step 1. +3. **TLS verification failure (not a 401, but masquerading as one in + Prometheus's logs).** The default ServiceMonitor template sets + `insecureSkipVerify: true` to support demos — production deploys + should set `tlsConfig.caFile` or `tlsConfig.ca.secret` per the + ServiceMonitor docs. + +### Prometheus target shows TLS errors + +`monitoring.serviceMonitor.tlsConfig` overrides the default. 
Three +patterns: + +```yaml +# Pattern 1: trust the system CA bundle (production behind a real CA) +tlsConfig: + caFile: /etc/ssl/certs/ca-certificates.crt + serverName: certctl.your-org.example + +# Pattern 2: trust a CA from a Secret mounted by Prometheus Operator +tlsConfig: + ca: + secret: + name: certctl-ca + key: ca.crt + serverName: certctl.your-org.example + +# Pattern 3: skip verification (DEMO ONLY — DO NOT USE IN PRODUCTION) +tlsConfig: + insecureSkipVerify: true +``` + +The certctl server's self-signed bootstrap cert (default +`server.tls.existingSecret` from the chart) presents a CN of +`certctl-server`. If your `serverName` doesn't match, the scrape +fails with `x509: certificate is valid for certctl-server, not ...`. + +## Rotation + +API keys are constant-time-compared, stored hashed, and never +logged. Rotation: + +```bash +# 1. Mint a new key (same actor + role) +curl -sS --cacert ./ca.crt -X POST \ + -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -H "Content-Type: application/json" \ + https://certctl.your-org.example/api/v1/auth/apikeys \ + -d '{"actor_id":"actor-prometheus","name":"prometheus-scrape-token-v2"}' \ + | tee /tmp/prom-key-new.json + +# 2. Update the Secret in place +kubectl create secret generic certctl-prometheus-key \ + -n certctl \ + --from-literal=api-key="$(jq -r '.key_value' /tmp/prom-key-new.json)" \ + --dry-run=client -o yaml | kubectl apply -f - + +# 3. Wait one scrape interval; verify the next scrape uses the new key. + +# 4. Revoke the old key +curl -sS --cacert ./ca.crt -X DELETE \ + -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + https://certctl.your-org.example/api/v1/auth/apikeys/<old-key-id> + +# 5. Scrub the temp file +shred -u /tmp/prom-key-new.json +``` + +Prometheus Operator picks up Secret changes automatically — no +ServiceMonitor edit needed, no Prometheus restart. + +## Related reading + +- [`docs/operator/rbac.md`](../rbac.md) — the full RBAC primitive, + permission catalogue, and role-assignment workflow. 
+- [`docs/operator/security.md`](../security.md) — the broader auth + posture including the API key / OIDC / break-glass paths. +- [`docs/operator/auth-threat-model.md`](../auth-threat-model.md) — + why `/api/v1/metrics/prometheus` is gated, and what an + unauthenticated leak of metrics data would reveal. diff --git a/docs/operator/runbooks/rollback.md b/docs/operator/runbooks/rollback.md new file mode 100644 index 0000000..77308b3 --- /dev/null +++ b/docs/operator/runbooks/rollback.md @@ -0,0 +1,193 @@ +# Runbook: Helm rollback for certctl + +> Last reviewed: 2026-05-14 + +Use this when: +- A `helm upgrade` rolled out a bad release and the operator wants to + return to the previous working state. +- A schema migration shipped a change the operator wants to back out. +- An emergency change needs reverting and forward-fix isn't yet + available. + +This page covers `helm rollback` mechanics + the cases where +rollback is NOT enough on its own (schema migrations are the main +one). + +## What `helm rollback` does + +`helm rollback <release> [revision]` re-applies the manifests from a +previous Helm revision. It re-creates / updates Kubernetes objects to +match that revision's template output and is safe for: + +- **Deployment image bumps:** rolls the container image back to the + previous tag. Pods restart with the old image. +- **ConfigMap / Secret content changes:** old values land in the + config; pods that consume them via `envFrom` or volume mounts get + the prior values on the next restart. +- **Resource requests / limits / replica count:** the spec changes + back to the prior values. Kubernetes reschedules pods accordingly. +- **Service / Ingress / NetworkPolicy changes:** networking flips + back to the previous shape immediately. + +## What `helm rollback` does NOT do + +The Kubernetes layer is reversible; the **database schema is not**. +This is the single most common gap in a rollback plan. 
+ +### Schema migrations are forward-only by design + +certctl's migrations under `migrations/` are numbered up-migrations +(`NNNNNN_*.up.sql`) with paired down-migrations +(`NNNNNN_*.down.sql`) shipped alongside. The `postgres.RunMigrations` +path applied at server boot only runs the `*.up.sql` files. The +`*.down.sql` files exist for development reference + a hypothetical +"surgical revert" path but are **not invoked by `helm rollback`**. + +The implication: if `v2.1.0 → v2.2.0` ships migrations 000100, +000101, 000102 (adding columns, changing constraints, dropping +indexes), then `helm rollback` to v2.1.0 takes you back to the v2.1.0 +container image — but the database still has migrations 000100-102 +applied. The v2.1.0 server code doesn't know about those columns; it +either ignores them (best case) or fails to start (if the schema +diverged in a way the older code can't tolerate). + +### When is rollback safe without a schema revert? + +Migrations are **additive-only** in 90%+ of cases. The categories: + +| Migration class | Safe to roll back without schema revert? | Why | +|---|---|---| +| Add column with default | Yes | Old code ignores the new column | +| Add table | Yes | Old code doesn't reference the table | +| Add index | Yes | Old code doesn't depend on the index existing | +| Add CHECK / FOREIGN KEY constraint | Usually yes | The constraint stays in place after rollback; it only fails if the old code writes rows that violate it | +| Rename column / table | NO | Old code's queries reference the original name | +| Drop column / table | NO (data loss) | New code already stopped writing the column; old code expects it | +| Type change (`VARCHAR(40)` → `TEXT`) | Usually yes | Old code's column read still works | +| Backfill a column | Yes | Old code ignores the backfilled value | + +If your upgrade only added columns / tables / indexes, `helm +rollback` is sufficient. If it renamed or dropped anything, you need +a database-level revert. 
+ +## Procedure: standard rollback (additive-only migrations) + +```bash +# 1. Identify the target revision +helm history certctl -n <namespace> + +# 2. Take a backup BEFORE rolling back (defense in depth — if +# rollback exposes a data corruption issue, restore is the only +# path back) +# See docs/operator/runbooks/postgres-backup.md for the canonical +# pg_dump invocation. + +# 3. Roll back to the chosen revision +helm rollback certctl <revision> -n <namespace> --wait --timeout 5m + +# 4. Verify +kubectl get pods -n <namespace> -l app.kubernetes.io/instance=certctl +kubectl logs -n <namespace> -l app.kubernetes.io/component=server --tail=50 +``` + +Watch for migration-version mismatch warnings in the server logs. If +the older server code refuses to start because the schema is ahead +of what it knows about, escalate to "rollback with schema revert." + +## Procedure: rollback with schema revert + +This is the rare case. Use it when: +- A column / table was renamed or dropped in the release being rolled back. +- The older code refuses to start with the newer schema. + +```bash +# 1. Take a fresh backup right NOW (the current schema is what we're +# reverting from; if anything goes wrong we want a clean +# forward-recovery option) +kubectl exec -n <namespace> statefulset/certctl-postgres -- \ + pg_dump --format=custom --no-owner --no-acl --dbname=certctl \ + > "certctl-pre-rollback-$(date -u +%Y%m%dT%H%M%SZ).dump" + +# 2. Stop the server Deployment to prevent it from writing to the +# database during the revert +kubectl scale deploy/certctl-server -n <namespace> --replicas=0 + +# 3. Apply the relevant *.down.sql files manually, one at a time, in +# reverse migration-number order. Example for reverting three +# migrations: +NEW=000102 # newest migration on the running schema +OLD=000100 # oldest migration to revert (inclusive) +for MIG in 000102 000101 000100; do + kubectl exec -i -n <namespace> statefulset/certctl-postgres -- \ + psql --user=certctl --dbname=certctl \ + < migrations/${MIG}_*.down.sql +done + +# 4. 
Manually update the schema_migrations table to reflect the +# reverted state (the migration runner's bookkeeping). +# `10#` forces base 10: bash arithmetic reads a leading-zero value such as 000100 as octal. +kubectl exec -n <namespace> statefulset/certctl-postgres -- \ + psql --user=certctl --dbname=certctl -c \ + "DELETE FROM schema_migrations WHERE version > $((10#$OLD - 1));" + +# 5. NOW run helm rollback. The server pod will start with a schema +# that matches its code. +helm rollback certctl <revision> -n <namespace> --wait --timeout 5m +``` + +The `*.down.sql` files are tested but only against pristine schemas — +they may not handle every data shape a production database +accumulates. ALWAYS take a backup first; the down-migrations are +a recovery tool, not a transactional contract. + +## Procedure: full restore (when revert isn't tractable) + +When a down-migration would lose data (drop columns / tables that +hold rows the older code can't read but the newer code populated), a +full restore is the only safe path. This is the procedure described +in +[`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md#postgres-restore). +The summary: + +1. Stop certctl. +2. Take a backup of the CURRENT schema (defense in depth). +3. Restore the LAST backup taken BEFORE the bad upgrade. +4. Roll the Helm release back to the matching code version. +5. Restart certctl. +6. Re-run any audited writes that happened in the window between the + backup and the bad upgrade (read the audit log; the API surface + is recoverable). + +The DR runbook owns the canonical commands. + +## Common pitfalls + +- **Forgetting the backup before rollback.** A schema-revert path is + not safe without a fresh backup. If something goes wrong mid-revert + and your most recent backup is from last night, you've lost any + cert-issuance history between then and now. +- **Rolling back the chart without rolling back the database state** + on a release that included a destructive migration (drop column, + drop table). Symptoms: old code starts, queries fail with + "column does not exist," server crashes in a loop. 
Recovery + requires schema revert OR full restore. +- **Letting the agents drift.** `helm rollback` updates the agent + DaemonSet's image too — agents on different versions than the + server may produce incompatible CSR payloads. After rollback, + confirm agent images are at the matching version via + `kubectl get daemonset certctl-agent -o jsonpath='{.spec.template.spec.containers[0].image}'`. +- **GHCR images pinned by digest:** the rollback restores the prior + `image:` value from the Helm template. If your operator workflow + uses `image.digest` pinning, the digest comes back too — make + sure that digest still exists on ghcr.io. They do persist; old + tags are never deleted, but a private mirror may have garbage-collected. + +## Related reading + +- [`docs/operator/runbooks/postgres-backup.md`](postgres-backup.md) — + the backup procedure that's the precondition for any + schema-revert path. +- [`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md) — + the full restore procedure when rollback isn't tractable. +- [`docs/migration/api-keys-to-rbac.md`](../../migration/api-keys-to-rbac.md) — + example of a migration that the runtime supports rolling back via + feature flag (rare). diff --git a/scripts/ci-guards/helm-templates-lint.sh b/scripts/ci-guards/helm-templates-lint.sh new file mode 100755 index 0000000..37d3f7a --- /dev/null +++ b/scripts/ci-guards/helm-templates-lint.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# scripts/ci-guards/helm-templates-lint.sh +# +# Phase 4 closure (2026-05-14): Helm chart lint + template-render gate. 
+# +# Runs `helm lint` against the chart and `helm template` against four +# representative value combinations to catch: +# - Syntax errors in any chart template +# - Schema-violation in values.yaml +# - Missing required values uncovered by the opt-in toggles +# (backup, monitoring.prometheusRules, migrations.viaHook) +# - Render errors when new templates are added without updating +# this guard's coverage matrix +# +# The opt-in templates added in Phase 4 (backup-cronjob.yaml, +# prometheusrules.yaml, migration-job.yaml) default OFF; without +# explicit coverage in the guard's matrix they would never render in +# CI and silent breakage could ship. + +set -euo pipefail + +CHART_DIR="deploy/helm/certctl" + +if [ ! -d "$CHART_DIR" ]; then + echo "helm-templates-lint: skipped — $CHART_DIR not found (running outside repo root?)" + exit 0 +fi + +if ! command -v helm >/dev/null 2>&1; then + echo "helm-templates-lint: skipped — helm not on PATH." + echo " Install: https://helm.sh/docs/intro/install/" + exit 0 +fi + +echo "helm-templates-lint: running helm lint" +helm lint "$CHART_DIR" >/dev/null + +# Minimal valid value set to satisfy chart preflight validators +# (server.tls.existingSecret, server.auth.apiKey, postgresql.auth.password). +# These are NOT real secrets — they're just non-empty strings to +# make the chart render in lint mode. +BASE_VALUES=( + --set "server.tls.existingSecret=lint-test-tls" + --set "server.auth.apiKey=lint-test-apikey" + --set "postgresql.auth.password=lint-test-pgpass" +) + +render_and_check() { + local label="$1" + shift + local out + out="$(helm template "$CHART_DIR" "${BASE_VALUES[@]}" "$@" 2>&1)" || { + echo "helm-templates-lint: FAIL — template render error for '$label'" + echo "$out" | tail -20 + return 1 + } + echo "helm-templates-lint: OK — '$label'" +} + +# Matrix: +# 1. Defaults (no Phase 4 opt-ins) — confirms the chart still +# renders cleanly when every Phase 4 feature is off. +# 2. 
backup.enabled=true (PVC sink) — confirms backup-cronjob renders. +# 3. backup.enabled=true + sink=s3 — confirms S3 sink branch renders. +# 4. monitoring.prometheusRules.enabled=true — confirms PrometheusRule renders. +# 5. migrations.viaHook=true — confirms migration-job hook renders. +# 6. All Phase 4 opt-ins on simultaneously — confirms no template +# interaction breaks the others. +render_and_check "defaults" +render_and_check "backup.enabled (pvc)" \ + --set "backup.enabled=true" +render_and_check "backup.enabled (s3)" \ + --set "backup.enabled=true" \ + --set "backup.sink=s3" \ + --set "backup.s3.bucket=lint-test-bucket" +render_and_check "monitoring.prometheusRules.enabled" \ + --set "monitoring.enabled=true" \ + --set "monitoring.prometheusRules.enabled=true" +render_and_check "migrations.viaHook" \ + --set "migrations.viaHook=true" +render_and_check "all phase 4 opt-ins" \ + --set "backup.enabled=true" \ + --set "monitoring.enabled=true" \ + --set "monitoring.prometheusRules.enabled=true" \ + --set "migrations.viaHook=true" + +echo "helm-templates-lint: all matrix combinations rendered cleanly"