mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 13:41:30 +00:00
6a640ac3e7
Sprint 3 unified-master-audit closure — two Helm-chart correctness
defects with overlapping CI-guard surface.
DEPL-003 — CERTCTL_MIGRATIONS_VIA_HOOK never rendered:
Pre-fix, the env var was documented in values.yaml and in the
migration-job.yaml comment but was never rendered into the server
Deployment env block. With migrations.viaHook=true the operator's
intent is 'the pre-install/pre-upgrade Helm Job owns migrations,'
but the server pods, lacking the env var, still ran their own
cmd/server/migrations.go::runBootMigrations alongside the hook
Job, racing it for the schema lock.
Fix: render '- name: CERTCTL_MIGRATIONS_VIA_HOOK / value: true'
in server-deployment.yaml under '{{- if .Values.migrations.viaHook }}'.
DEPL-006 — HA example missing rate-limit backend + sessionAffinity:
values-prod-ha.yaml sets replicas:3 but inherited the chart-wide
default rateLimiting.backend=memory (which gives each pod its
own bucket map, effectively tripling the cap on a 3-replica fleet)
AND the chart had no render path for server.service.sessionAffinity
even though docs/operator/runbooks/ha.md instructed operators to
set it for ClientIP-routed sticky sessions.
Fix:
- server-service.yaml gains a conditional sessionAffinity +
sessionAffinityConfig.clientIP.timeoutSeconds render.
- values.yaml grows the matching schema entries (default empty
so single-replica deploys are unaffected).
- values-prod-ha.yaml flips rateLimiting.backend=postgres and
service.sessionAffinity=ClientIP.
- NOTES.txt emits a loud warning when replicas>1 + either toggle
is still in the default state, so the misconfig surfaces at
helm install time instead of in a confused login-flow bug
report a week later.
CI:
scripts/ci-guards/B3-helm-chart-coherence.sh gains 'Check 7'
(DEPL-003 viaHook env render, asserted both positively and
negatively; the negative case catches future drift that drops the
{{- if }} guard) and 'Check 8' (DEPL-006 sessionAffinity render).
Both checks run helm template and assert that the rendered YAML
contains the expected text.
Closes DEPL-003, DEPL-006.
176 lines
4.0 KiB
YAML
176 lines
4.0 KiB
YAML
# Certctl Production HA Configuration
# High-availability deployment with:
# - 3 server replicas with pod anti-affinity
# - Large PostgreSQL storage
# - Resource limits for production
# - Prometheus monitoring
# - Network policy enforcement
#   (NOTE(review): no network-policy key is set in this file; presumably
#   this relies on chart defaults — confirm.)
|
|
|
|
# Namespace value for this deployment (presumably where the chart's
# resources are created — confirm against the templates).
namespace: certctl
|
|
|
|
# Server settings for the HA profile.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# below was reconstructed (service is addressed as
# server.service.sessionAffinity by the chart). Confirm against
# deploy/helm/certctl/values.yaml.
server:
  # Multi-replica deployment; see the service.sessionAffinity and
  # rateLimiting.backend settings, which exist because replicas > 1.
  replicas: 3

  image:
    repository: ghcr.io/certctl-io/certctl
    tag: "2.1.0"
    pullPolicy: IfNotPresent

  port: 8443

  resources:
    requests:
      cpu: 250m
      memory: 256Mi
    limits:
      cpu: 1000m
      memory: 512Mi

  auth:
    type: api-key
    # Placeholder only: never deploy this literal value.
    # Use --set or sealed-secrets to supply the real key.
    apiKey: "CHANGE_ME_IN_PRODUCTION"

  logging:
    level: info
    format: json

  service:
    type: ClusterIP
    # DEPL-006 closure (Sprint 3, 2026-05-16): with replicas: 3, the
    # default round-robin Service load balancing breaks login/CSRF
    # flows because the session cookie and the CSRF token row land on
    # different pods between requests. sessionAffinity: ClientIP
    # routes every connection from a given source IP to the same
    # pod for the configured timeout window. docs/operator/runbooks/ha.md
    # documents this; pre-fix the chart did not actually render it.
    sessionAffinity: ClientIP
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8443"
      prometheus.io/path: "/api/v1/metrics/prometheus"
|
|
|
# Certificate issuers enabled for this profile: the local issuer and ACME.
# NOTE(review): indentation was lost in this copy of the file; confirm
# whether `issuer` is a top-level key or belongs under `server:` in the
# chart's values schema.
issuer:
  local:
    enabled: true
  acme:
    enabled: true
    # Let's Encrypt production directory endpoint.
    directoryURL: https://acme-v02.api.letsencrypt.org/directory
    # Example contact address: replace before deploying.
    email: admin@example.com
    challengeType: dns-01
|
|
|
|
# API rate limiting for the HA profile.
# NOTE(review): indentation was lost in this copy of the file; confirm
# whether `rateLimiting` is a top-level key or belongs under `server:` in
# the chart's values schema.
rateLimiting:
  rps: 500
  burst: 1000
  # DEPL-006 closure (Sprint 3, 2026-05-16): replicas > 1 REQUIRES
  # the postgres backend so per-key buckets are consistent across
  # replicas. The default 'memory' backend gives each pod its
  # own bucket map, so a 3-replica fleet effectively triples the
  # configured cap (a client churning across pods bypasses the
  # limit). See the rateLimiting comment in
  # deploy/helm/certctl/values.yaml (L217-226 at the time of writing)
  # for the canonical explanation.
  backend: postgres
|
|
|
|
# In-chart PostgreSQL settings: image, credentials, storage and resources.
postgresql:
  enabled: true

  image:
    repository: postgres
    tag: "16-alpine"
    pullPolicy: IfNotPresent

  auth:
    database: certctl
    username: certctl
    # Placeholder only: never deploy this literal value.
    # Use --set or sealed-secrets to supply the real password.
    password: "CHANGE_ME_IN_PRODUCTION"

  storage:
    size: 100Gi
    # Use your high-performance storage class.
    storageClass: "fast-ssd"

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 2Gi
|
|
|
|
# Agent settings: deployed as a DaemonSet with its own image and resources.
agent:
  enabled: true
  kind: DaemonSet

  image:
    repository: ghcr.io/certctl-io/certctl-agent
    tag: "2.1.0"
    pullPolicy: IfNotPresent

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 256Mi

  # Comma-separated list of directories (a single string, not a YAML list).
  discoveryDirs: "/etc/ssl/certs,/etc/pki/tls,/etc/ssl"
|
|
|
|
# Ingress via ingress-nginx with a cert-manager-issued TLS certificate.
ingress:
  enabled: true
  className: nginx
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    # ingress-nginx proxies straight to pod endpoints by default, so the
    # Service-level sessionAffinity: ClientIP is not applied to traffic
    # that arrives through this Ingress. Cookie affinity keeps a browser
    # session on one server pod for the multi-replica login/CSRF flow.
    nginx.ingress.kubernetes.io/affinity: "cookie"
  hosts:
    - host: certctl.example.com
      paths:
        - path: /
          pathType: Prefix
  tls:
    - secretName: certctl-tls
      hosts:
        - certctl.example.com
|
|
|
|
# ServiceAccount created by the chart, annotated for IRSA on AWS.
serviceAccount:
  create: true
  annotations:
    # For IRSA on AWS. ACCOUNT is a placeholder for the AWS account ID.
    eks.amazonaws.com/role-arn: "arn:aws:iam::ACCOUNT:role/certctl-role"
|
|
|
|
# Have the chart create its RBAC resources.
rbac:
  create: true
|
|
|
|
# PodDisruptionBudget: keep at least 2 pods available during voluntary
# disruptions (presumably targets the 3 server replicas — confirm against
# the chart template's selector).
podDisruptionBudget:
  enabled: true
  minAvailable: 2
|
|
|
|
# Monitoring: enable the ServiceMonitor with a 30s scrape interval and a
# 10s scrape timeout.
monitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s
    scrapeTimeout: 10s
|
|
|
|
# Pod anti-affinity for HA.
# This is a hard ("required") rule keyed on kubernetes.io/hostname: no two
# pods matching both label expressions may be scheduled on the same node,
# so the cluster needs at least as many schedulable nodes as replicas or
# the extra pods stay Pending.
podAntiAffinity:
  requiredDuringSchedulingIgnoredDuringExecution:
    - labelSelector:
        matchExpressions:
          - key: app.kubernetes.io/name
            operator: In
            values:
              - certctl
          - key: app.kubernetes.io/component
            operator: In
            values:
              - server
      topologyKey: kubernetes.io/hostname
|
|
|
|
# Extra labels for this deployment (presumably applied to the chart's
# rendered resources — confirm against the templates).
customLabels:
  environment: production
  team: platform
  cost-center: ops
|
|
|
|
# Extra annotations for this deployment (presumably applied to the chart's
# rendered resources — confirm against the templates).
customAnnotations:
  # Quoted because a leading '#' would otherwise start a YAML comment.
  slack-alerts: "#ops"
  backup-policy: daily
|