diff --git a/deploy/helm/certctl/templates/NOTES.txt b/deploy/helm/certctl/templates/NOTES.txt index c0b296c..630c35c 100644 --- a/deploy/helm/certctl/templates/NOTES.txt +++ b/deploy/helm/certctl/templates/NOTES.txt @@ -72,3 +72,28 @@ IMPORTANT NOTES FOR PRODUCTION: - All containers run as non-root - Implement network policies to restrict traffic between components - Consider pod security policies or security standards for your cluster +{{- /* + DEPL-006 closure (Sprint 3, 2026-05-16). Loud notice when the + operator runs a multi-replica deploy without crossing the two + required HA toggles. Per-pod rate-limit buckets and round-robin + load balancing both silently break correctness above replicas:1. +*/}} +{{- if gt (int .Values.server.replicas) 1 }} + +⚠️ HA MISCONFIGURATION WARNINGS (replicas={{ .Values.server.replicas }}): +{{- $backend := .Values.server.rateLimiting.backend | default "memory" }} +{{- if eq $backend "memory" }} + - server.rateLimiting.backend = "memory" with replicas > 1 gives each + pod its own bucket map, so the configured cap is effectively + multiplied by the replica count. Set + `--set server.rateLimiting.backend=postgres` (see DEPL-006 / + docs/operator/runbooks/ha.md). +{{- end }} +{{- if not .Values.server.service.sessionAffinity }} + - server.service.sessionAffinity is empty. Round-robin Service load + balancing routes login → /api/v1/auth/login → /api/v1/auth/csrf + across different pods, breaking the CSRF token + session cookie + handshake. Set + `--set server.service.sessionAffinity=ClientIP`. +{{- end }} +{{- end }} diff --git a/deploy/helm/certctl/templates/server-deployment.yaml b/deploy/helm/certctl/templates/server-deployment.yaml index e32eb62..36246c4 100644 --- a/deploy/helm/certctl/templates/server-deployment.yaml +++ b/deploy/helm/certctl/templates/server-deployment.yaml @@ -51,6 +51,20 @@ spec: containerPort: {{ .Values.server.port }} protocol: TCP env: + # DEPL-003 closure (Sprint 3, 2026-05-16). Pre-fix the + # CERTCTL_MIGRATIONS_VIA_HOOK env var was documented in + # values.yaml (L797-810) and migration-job.yaml comments + # but was never rendered into the server Deployment env + # block. With migrations.viaHook=true the operator's + # intent is "the pre-install/pre-upgrade Helm Job owns + # migrations" — but the server pods, missing the env, + # ran their own boot-time RunMigrations alongside the + # hook Job, racing on the schema lock. cmd/server/migrations.go + # only short-circuits when this env is "true" (line 144). + {{- if .Values.migrations.viaHook }} + - name: CERTCTL_MIGRATIONS_VIA_HOOK + value: "true" + {{- end }} - name: CERTCTL_SERVER_HOST value: "0.0.0.0" - name: CERTCTL_SERVER_PORT diff --git a/deploy/helm/certctl/templates/server-service.yaml b/deploy/helm/certctl/templates/server-service.yaml index 1de2383..dffd9c2 100644 --- a/deploy/helm/certctl/templates/server-service.yaml +++ b/deploy/helm/certctl/templates/server-service.yaml @@ -11,6 +11,23 @@ metadata: {{- end }} spec: type: {{ .Values.server.service.type }} + {{- /* + DEPL-006 closure (Sprint 3, 2026-05-16). Render the optional + sessionAffinity field. docs/operator/runbooks/ha.md instructs + operators to set sessionAffinity: ClientIP for replicas > 1 so + login + CSRF flows stay on the same pod; pre-fix the chart did + not actually pass the value through. sessionAffinityConfig + clientIP.timeoutSeconds renders only when set, otherwise + Kubernetes applies its default (10800s / 3h). + */}} + {{- if .Values.server.service.sessionAffinity }} + sessionAffinity: {{ .Values.server.service.sessionAffinity }} + {{- with .Values.server.service.sessionAffinityTimeoutSeconds }} + sessionAffinityConfig: + clientIP: + timeoutSeconds: {{ . }} + {{- end }} + {{- end }} ports: - port: {{ .Values.server.service.port }} targetPort: https diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml index ad5db0e..d93c09c 100644 --- a/deploy/helm/certctl/values.yaml +++ b/deploy/helm/certctl/values.yaml @@ -160,6 +160,17 @@ server: type: ClusterIP port: 8443 annotations: {} + # DEPL-006 closure (Sprint 3, 2026-05-16). Optional sticky-session + # routing. REQUIRED when server.replicas > 1 so login + CSRF token + # rows stay on the same pod for the duration of a session — the + # default round-robin load balancing breaks those flows. Set to + # "ClientIP" for production HA (see deploy/helm/examples/values-prod-ha.yaml). + # Leave empty for single-replica deploys. + sessionAffinity: "" + # When sessionAffinity is set, timeout window (in seconds) the + # Service maps a source IP to the same pod. Default null → + # Kubernetes applies its built-in default (10800s / 3h). + sessionAffinityTimeoutSeconds: null # Authentication configuration. # Valid types: "api-key" (production) or "none" (demo only — disables diff --git a/deploy/helm/examples/values-prod-ha.yaml b/deploy/helm/examples/values-prod-ha.yaml index 2c5042a..25eeeff 100644 --- a/deploy/helm/examples/values-prod-ha.yaml +++ b/deploy/helm/examples/values-prod-ha.yaml @@ -36,6 +36,14 @@ server: service: type: ClusterIP + # DEPL-006 closure (Sprint 3, 2026-05-16): with replicas:3, the + # default round-robin Service load balancing breaks login/CSRF + # flows because the session cookie + the CSRF token row land on + # different pods between requests. sessionAffinity: ClientIP + # routes every connection from a given source IP to the same + # pod for the configured timeout window. docs/operator/runbooks/ha.md + # documents this; pre-fix the chart did not actually render it. + sessionAffinity: ClientIP annotations: prometheus.io/scrape: "true" prometheus.io/port: "8443" @@ -53,6 +61,14 @@ server: rateLimiting: rps: 500 burst: 1000 + # DEPL-006 closure (Sprint 3, 2026-05-16): replicas > 1 REQUIRES + # the postgres backend so per-key buckets are cross-replica- + # consistent. The default 'memory' backend gives each pod its + # own bucket map, so a 3-replica fleet effectively triples the + # configured cap (a client churning across pods bypasses the + # limit). See deploy/helm/certctl/values.yaml L217-226 for the + # canonical comment. + backend: postgres postgresql: enabled: true diff --git a/scripts/ci-guards/B3-helm-chart-coherence.sh b/scripts/ci-guards/B3-helm-chart-coherence.sh index b8f7de4..4c4ced2 100755 --- a/scripts/ci-guards/B3-helm-chart-coherence.sh +++ b/scripts/ci-guards/B3-helm-chart-coherence.sh @@ -152,10 +152,62 @@ if helm template c "$CHART" \ failed=1 fi +# Check 8 — DEPL-006 (Sprint 3 closure): when +# server.service.sessionAffinity is set, server-service.yaml MUST +# render the field. Pre-fix the chart silently dropped the value +# and docs/operator/runbooks/ha.md's instructions had no effect. +if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + --set server.service.sessionAffinity=ClientIP \ + > "$TMP/affinity.yaml" 2> "$TMP/affinity.err"; then + if ! grep -q 'sessionAffinity: ClientIP' "$TMP/affinity.yaml"; then + echo "::error file=${CHART}::B3 regression (DEPL-006): server.service.sessionAffinity=ClientIP did not render. Round-robin Service routing will break login + CSRF flows." + failed=1 + fi +else + echo "::error file=${CHART}::B3 regression (DEPL-006): sessionAffinity mode failed to render." + cat "$TMP/affinity.err" + failed=1 +fi + +# Check 7 — DEPL-003 (Sprint 3 closure): when migrations.viaHook=true +# is set, the server Deployment env block MUST render +# CERTCTL_MIGRATIONS_VIA_HOOK=true. Without it the server runs +# boot-time migrations alongside the pre-install/pre-upgrade hook +# Job, racing on the schema lock. +if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + --set migrations.viaHook=true \ + > "$TMP/viahook.yaml" 2> "$TMP/viahook.err"; then + if ! grep -q 'name: CERTCTL_MIGRATIONS_VIA_HOOK' "$TMP/viahook.yaml"; then + echo "::error file=${CHART}::B3 regression (DEPL-003): migrations.viaHook=true did not render CERTCTL_MIGRATIONS_VIA_HOOK in the server Deployment env block. Server pods will race the hook Job." + failed=1 + fi + # Inverse: with viaHook unset (default), the env var MUST NOT render. + if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + > "$TMP/noviahook.yaml" 2> "$TMP/noviahook.err"; then + if grep -q 'name: CERTCTL_MIGRATIONS_VIA_HOOK' "$TMP/noviahook.yaml"; then + echo "::error file=${CHART}::B3 regression (DEPL-003): default render (viaHook=false) leaked CERTCTL_MIGRATIONS_VIA_HOOK env var. Conditional template missing the {{- if }} guard." + failed=1 + fi + fi +else + echo "::error file=${CHART}::B3 regression (DEPL-003): migrations.viaHook=true mode failed to render." + cat "$TMP/viahook.err" + failed=1 +fi + if [ "$failed" -ne 0 ]; then echo "" echo "${GUARD_NAME}: FAILED — Helm chart coherence regression." exit 1 fi -echo "${GUARD_NAME}: clean (default + external-Postgres + cert-manager + production hardening + 3 fail-fast gates all green)." +echo "${GUARD_NAME}: clean (default + external-Postgres + cert-manager + production hardening + 3 fail-fast gates + DEPL-003 viaHook env render all green)."