diff --git a/README.md b/README.md index d52fd9a..7546460 100644 --- a/README.md +++ b/README.md @@ -128,12 +128,15 @@ Detects your OS and architecture, downloads the binary, configures systemd (Linu ### Helm chart (Kubernetes) ```bash +# Required: TLS (pick one), server API key, and Postgres password. +# The chart fails fast at template time if any required value is missing. helm install certctl deploy/helm/certctl/ \ - --set server.auth.apiKey=your-api-key \ - --set postgresql.password=your-db-password + --set server.tls.existingSecret=<your-tls-secret> \ + --set server.auth.apiKey=$(openssl rand -base64 32) \ + --set postgresql.auth.password=$(openssl rand -base64 32) ``` -Production-ready chart with Server Deployment, PostgreSQL StatefulSet, Agent DaemonSet, health probes, security contexts (non-root, read-only rootfs), and optional Ingress. See [values.yaml](deploy/helm/certctl/values.yaml). +Production-ready chart with Server Deployment, PostgreSQL StatefulSet (or external Postgres), Agent DaemonSet, health probes, container-scope security hardening (read-only rootfs, drop-all capabilities, non-root UID), optional PodDisruptionBudget, NetworkPolicy, Prometheus ServiceMonitor, and Ingress. See [values.yaml](deploy/helm/certctl/values.yaml) and the [external-Postgres example](deploy/helm/examples/values-external-db.yaml). ### Container images diff --git a/cmd/server/main.go b/cmd/server/main.go index d82f485..3baaef2 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -128,8 +128,14 @@ func main() { logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)") } - // Initialize database connection pool - db, err := postgres.NewDB(cfg.Database.URL) + // Initialize database connection pool. 
+ // + // Bundle 3 closure (D12): pre-Bundle-3 the operator-facing + // CERTCTL_DATABASE_MAX_CONNS was a lying-field — config loaded the + // value and Validate() checked the floor, but the pool was hard- + // coded to SetMaxOpenConns(25). Post-Bundle-3 NewDBWithMaxConns + // threads the operator setting through to the connection pool. + db, err := postgres.NewDBWithMaxConns(cfg.Database.URL, cfg.Database.MaxConnections) if err != nil { logger.Error("failed to connect to database", "error", err) os.Exit(1) diff --git a/deploy/helm/certctl/Chart.yaml b/deploy/helm/certctl/Chart.yaml index f897d69..c92dddb 100644 --- a/deploy/helm/certctl/Chart.yaml +++ b/deploy/helm/certctl/Chart.yaml @@ -2,7 +2,15 @@ apiVersion: v2 name: certctl description: Self-hosted certificate lifecycle management platform type: application -version: 0.1.0 +# Bundle 3 closure (OPS-L1): bumped from 0.1.0 → 1.0.0. The pre-1.0 +# version implied "unstable chart, breaking changes on every minor" +# which prospective enterprise operators read as "not ready for +# production". The chart has been deployed against real clusters since +# 2026-02 and shipped through 8 audit closures (M-018, U-1, U-2, U-3, +# H-1, G-1, B1 connector validation, B2 first-run guards); 1.0.0 +# matches that maturity. The chart still adheres to semver going +# forward — any breaking value-schema change bumps to 2.0.0. +version: 1.0.0 appVersion: "2.1.0" keywords: - certificate diff --git a/deploy/helm/certctl/templates/_helpers.tpl b/deploy/helm/certctl/templates/_helpers.tpl index 9f5b78a..18b7e59 100644 --- a/deploy/helm/certctl/templates/_helpers.tpl +++ b/deploy/helm/certctl/templates/_helpers.tpl @@ -128,8 +128,27 @@ Bundle B / Audit M-018 (PCI-DSS Req 4 / CWE-319): postgresql.tls.mode without further translation. 
*/}} {{- define "certctl.databaseURL" -}} +{{- if .Values.postgresql.enabled -}} {{- $sslMode := default "disable" .Values.postgresql.tls.mode -}} postgres://{{ .Values.postgresql.auth.username }}:$(POSTGRES_PASSWORD)@{{ include "certctl.fullname" . }}-postgres:5432/{{ .Values.postgresql.auth.database }}?sslmode={{ $sslMode }} +{{- else -}} +{{- /* + Bundle 3 closure (D2 + OPS-L2): external-Postgres first-class path. + When postgresql.enabled=false, the chart NEVER renders the + bundled StatefulSet, postgres-secret, or postgres-service — + templates/postgres-*.yaml gate themselves on .Values.postgresql.enabled. + The connection string comes from externalDatabase.url (the canonical + form) or, for backward-compat with pre-Bundle-3 deploys, falls back to + server.env.CERTCTL_DATABASE_URL — the same either/or that the + certctl.requiredSecrets gate accepts, so legacy-only values still render. + + The chosen URL is consumed VERBATIM by the server's + CERTCTL_DATABASE_URL env var. Operators are responsible for choosing + the right sslmode (`verify-full` recommended for managed Postgres + per PCI-DSS Req 4 §2.2.5; see docs/database-tls.md). +*/ -}} +{{- required "externalDatabase.url is required when postgresql.enabled=false" (.Values.externalDatabase.url | default .Values.server.env.CERTCTL_DATABASE_URL) -}} +{{- end -}} {{- end }} {{/* @@ -180,11 +199,110 @@ per affected resource. No-op when configured correctly. {{- if and (not .Values.server.tls.existingSecret) (not .Values.server.tls.certManager.enabled) -}} {{- fail "\n\ncertctl refuses to start without TLS.\n\nSet EXACTLY ONE of:\n --set server.tls.existingSecret=\nOR\n --set server.tls.certManager.enabled=true \\\n --set server.tls.certManager.issuerRef.name=\n\nSee docs/tls.md for the full setup walkthrough, including bootstrap\nguidance for air-gapped clusters without cert-manager.\n" -}} {{- end -}} +{{- if and .Values.server.tls.existingSecret .Values.server.tls.certManager.enabled -}} +{{- /* + Bundle 3 closure (D7): pre-Bundle-3 the helper only rejected the + NEITHER-set case. 
Setting BOTH (`existingSecret` AND `certManager.enabled=true`) + produced two TLS sources of truth — the existing Secret got mounted but + cert-manager simultaneously provisioned a Certificate CR pointing at a + conflicting Secret. Operators ended up with a dangling cert-manager + Certificate or a wrong-source TLS bundle. The chart now refuses at + render-time so the misconfiguration cannot ship. +*/ -}} +{{- fail "\n\nserver.tls.existingSecret AND server.tls.certManager.enabled are BOTH set.\n\nThe chart requires EXACTLY ONE TLS ownership path (Bundle 3 closure / audit D7):\n - existingSecret: operator owns the TLS Secret; cert-manager must NOT provision one.\n - certManager.enabled: cert-manager owns the TLS Secret; existingSecret must be empty.\n\nUnset one of:\n --set server.tls.existingSecret=\"\" (let cert-manager own it)\nOR\n --set server.tls.certManager.enabled=false (let the existing Secret stand)\n\nSee docs/tls.md.\n" -}} +{{- end -}} {{- if and .Values.server.tls.certManager.enabled (not .Values.server.tls.certManager.issuerRef.name) -}} {{- fail "\n\nserver.tls.certManager.enabled=true but server.tls.certManager.issuerRef.name is empty.\n\nSet:\n --set server.tls.certManager.issuerRef.name=\n\nSee docs/tls.md.\n" -}} {{- end -}} {{- end }} +{{/* +Pod- vs container-scope security context split (Bundle 3 closure / audit D3). + +The Kubernetes API splits SecurityContext into two non-overlapping +field sets, and silently DROPS fields that land at the wrong scope — +which is exactly the audit D3 finding pre-Bundle-3. + +Pod-scope fields (applied via spec.securityContext): + runAsNonRoot, runAsUser, runAsGroup, fsGroup, fsGroupChangePolicy, + supplementalGroups, seLinuxOptions, seccompProfile, sysctls. + +Container-scope fields (applied via spec.containers[].securityContext): + readOnlyRootFilesystem, allowPrivilegeEscalation, capabilities, + privileged, procMount, runAsNonRoot/runAsUser/runAsGroup (override), + seLinuxOptions/seccompProfile (override). 
+ +These helpers split a single operator-facing `securityContext` map +into the two sub-maps so the chart renders each field at the scope +where Kubernetes actually honors it. The split is conservative — a +field that COULD live at either scope is rendered at pod scope only +(no override at container scope) so behavior matches the pre-Bundle-3 +operator intent: pod-level setting is the source of truth. + +Operators don't need to change values.yaml; the existing +`server.securityContext` and `agent.securityContext` blocks keep +working byte-for-byte. The Helm template just routes each field to +the correct YAML node now. +*/}} +{{- define "certctl.podSecurityContext" -}} +{{- $sc := . -}} +{{- $podKeys := list "runAsNonRoot" "runAsUser" "runAsGroup" "fsGroup" "fsGroupChangePolicy" "supplementalGroups" "seLinuxOptions" "seccompProfile" "sysctls" -}} +{{- $out := dict -}} +{{- range $k := $podKeys -}} +{{- if hasKey $sc $k -}} +{{- $_ := set $out $k (index $sc $k) -}} +{{- end -}} +{{- end -}} +{{- toYaml $out -}} +{{- end }} + +{{- define "certctl.containerSecurityContext" -}} +{{- $sc := . -}} +{{- $containerKeys := list "readOnlyRootFilesystem" "allowPrivilegeEscalation" "capabilities" "privileged" "procMount" -}} +{{- $out := dict -}} +{{- range $k := $containerKeys -}} +{{- if hasKey $sc $k -}} +{{- $_ := set $out $k (index $sc $k) -}} +{{- end -}} +{{- end -}} +{{- toYaml $out -}} +{{- end }} + +{{/* +Required-secret gate (Bundle 3 closure / audit D1). + +Pre-Bundle-3 the chart accepted empty `server.auth.apiKey` and empty +`postgresql.auth.password` and rendered Secrets with empty values; the +certctl-server container then crash-looped at startup with the auth +configuration error or with `pq: password authentication failed for +user "certctl"`. 
Worse, an operator who forgot to set the api-key +ended up with auth.type=api-key + empty CERTCTL_AUTH_SECRET in the +Secret, which Validate() rejects at startup — but the diagnostic +surfaces inside a CrashLoopBackOff, not at `helm install` time where +it would be caught immediately. + +Post-Bundle-3 the chart fails at template time with operator-actionable +guidance. The bundled-Postgres path (`postgresql.enabled=true`) +requires `postgresql.auth.password`; the external-Postgres path +(`postgresql.enabled=false`) skips that check because credentials are +embedded in `externalDatabase.url` instead. + +Any template that depends on either secret value should call +`{{ include "certctl.requiredSecrets" . }}` at the top so this guard +runs once per affected resource. No-op when configured correctly. +*/}} +{{- define "certctl.requiredSecrets" -}} +{{- if and (eq .Values.server.auth.type "api-key") (not .Values.server.auth.apiKey) -}} +{{- fail "\n\nserver.auth.type=\"api-key\" but server.auth.apiKey is empty.\n\nSet:\n --set server.auth.apiKey=$(openssl rand -base64 32)\n\nor put the value in a values override. The certctl-server container\nrefuses to start without an API key when auth.type=api-key.\n\nFor demo deploys without authentication, use:\n --set server.auth.type=none\n(only safe behind an authenticating gateway — see docs/operator/security.md).\n" -}} +{{- end -}} +{{- if and .Values.postgresql.enabled (not .Values.postgresql.auth.password) -}} +{{- fail "\n\npostgresql.enabled=true but postgresql.auth.password is empty.\n\nSet:\n --set postgresql.auth.password=$(openssl rand -base64 32)\n\nor put the value in a values override. 
The bundled Postgres\nStatefulSet refuses to bootstrap initdb without POSTGRES_PASSWORD.\n\nFor external Postgres deployments, set:\n --set postgresql.enabled=false\n --set externalDatabase.url=postgres://user:pass@host:5432/db?sslmode=require\nSee deploy/helm/examples/values-external-db.yaml.\n" -}} +{{- end -}} +{{- if and (not .Values.postgresql.enabled) (not .Values.externalDatabase.url) (not .Values.server.env.CERTCTL_DATABASE_URL) -}} +{{- fail "\n\npostgresql.enabled=false but no external database URL is configured.\n\nSet ONE of:\n --set externalDatabase.url=postgres://user:pass@host:5432/db?sslmode=require\nOR (legacy)\n --set server.env.CERTCTL_DATABASE_URL=postgres://user:pass@host:5432/db?sslmode=require\n\nSee deploy/helm/examples/values-external-db.yaml.\n" -}} +{{- end -}} +{{- end }} + {{/* Auth-type validation gate. diff --git a/deploy/helm/certctl/templates/agent-daemonset.yaml b/deploy/helm/certctl/templates/agent-daemonset.yaml index c916897..883981b 100644 --- a/deploy/helm/certctl/templates/agent-daemonset.yaml +++ b/deploy/helm/certctl/templates/agent-daemonset.yaml @@ -19,7 +19,7 @@ spec: spec: serviceAccountName: {{ include "certctl.serviceAccountName" . }} securityContext: - {{- toYaml .Values.agent.securityContext | nindent 8 }} + {{- include "certctl.podSecurityContext" .Values.agent.securityContext | nindent 8 }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -40,6 +40,8 @@ spec: - name: agent image: {{ include "certctl.agentImage" . }} imagePullPolicy: {{ .Values.agent.image.pullPolicy }} + securityContext: + {{- include "certctl.containerSecurityContext" .Values.agent.securityContext | nindent 12 }} env: - name: CERTCTL_SERVER_URL value: {{ include "certctl.serverURL" . }} @@ -106,7 +108,7 @@ spec: spec: serviceAccountName: {{ include "certctl.serviceAccountName" . 
}} securityContext: - {{- toYaml .Values.agent.securityContext | nindent 8 }} + {{- include "certctl.podSecurityContext" .Values.agent.securityContext | nindent 8 }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -127,6 +129,8 @@ spec: - name: agent image: {{ include "certctl.agentImage" . }} imagePullPolicy: {{ .Values.agent.image.pullPolicy }} + securityContext: + {{- include "certctl.containerSecurityContext" .Values.agent.securityContext | nindent 12 }} env: - name: CERTCTL_SERVER_URL value: {{ include "certctl.serverURL" . }} diff --git a/deploy/helm/certctl/templates/networkpolicy.yaml b/deploy/helm/certctl/templates/networkpolicy.yaml new file mode 100644 index 0000000..f2f823c --- /dev/null +++ b/deploy/helm/certctl/templates/networkpolicy.yaml @@ -0,0 +1,75 @@ +{{- /* +Bundle 3 closure (D11): NetworkPolicy for the server Deployment. + +Pre-Bundle-3 the chart had no NetworkPolicy template at all — the +audit-D11 "documented placeholder" finding referred to docs claiming +deny-by-default network isolation that the rendered chart did not +provide. Closed. + +This template emits a single NetworkPolicy that, when enabled, +restricts the certctl-server Pod to: + - Ingress : from any agent Pod in the same namespace (selector + match on app.kubernetes.io/component=agent) on the + server port, plus optional operator-supplied + additional from clauses (.networkPolicy.extraIngress). + - Egress : to the postgres Pod (when postgresql.enabled=true), + 53/UDP+TCP for kube-dns, and operator-supplied + additional to clauses for outbound CA / OIDC / SMTP + (.networkPolicy.extraEgress). + +Default off so existing deploys don't suddenly lose network reach. +Operators opt in once they've mapped their actual egress surface. +*/ -}} +{{- if .Values.networkPolicy.enabled }} +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "certctl.fullname" . }}-server + labels: + {{- include "certctl.labels" . 
| nindent 4 }} + app.kubernetes.io/component: server +spec: + podSelector: + matchLabels: + {{- include "certctl.serverSelectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + # Allow in-cluster agent Pods to reach the server's HTTPS port. + - from: + - podSelector: + matchLabels: + app.kubernetes.io/name: {{ include "certctl.name" . }} + app.kubernetes.io/component: agent + ports: + - protocol: TCP + port: {{ .Values.server.port }} + {{- with .Values.networkPolicy.extraIngress }} + {{- toYaml . | nindent 4 }} + {{- end }} + egress: + # Kube-DNS (53/UDP + 53/TCP). Required for any in-cluster name + # resolution (postgres-service, OIDC issuer hostnames, ACME). + - to: + - namespaceSelector: {} + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + {{- if .Values.postgresql.enabled }} + # Bundled-Postgres egress. + - to: + - podSelector: + matchLabels: + app.kubernetes.io/name: {{ include "certctl.name" . }} + app.kubernetes.io/component: postgres + ports: + - protocol: TCP + port: 5432 + {{- end }} + {{- with .Values.networkPolicy.extraEgress }} + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/certctl/templates/pdb.yaml b/deploy/helm/certctl/templates/pdb.yaml new file mode 100644 index 0000000..6ccf3b7 --- /dev/null +++ b/deploy/helm/certctl/templates/pdb.yaml @@ -0,0 +1,31 @@ +{{- /* +Bundle 3 closure (D11): PodDisruptionBudget for the server Deployment. + +Pre-Bundle-3 values.yaml carried `podDisruptionBudget.enabled` + +`minAvailable` + `maxUnavailable` knobs but no template consumed +them. Audit D11 closed. + +The PDB only renders when server.replicas > 1 — with a single replica, +minAvailable=1 leaves zero allowed disruptions, so the eviction API +(not the scheduler) would reject every eviction and stall node drains. Operators +running 2+ replicas get the PDB; operators running a single replica +get a templated-out NOTES line reminding them to bump replicas first. 
+*/ -}} +{{- if and .Values.podDisruptionBudget.enabled (gt (int .Values.server.replicas) 1) }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "certctl.fullname" . }}-server + labels: + {{- include "certctl.labels" . | nindent 4 }} + app.kubernetes.io/component: server +spec: + selector: + matchLabels: + {{- include "certctl.serverSelectorLabels" . | nindent 6 }} + {{- if .Values.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.podDisruptionBudget.minAvailable }} + {{- else if .Values.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }} + {{- end }} +{{- end }} diff --git a/deploy/helm/certctl/templates/postgres-secret.yaml b/deploy/helm/certctl/templates/postgres-secret.yaml index 87986f4..4eb46b5 100644 --- a/deploy/helm/certctl/templates/postgres-secret.yaml +++ b/deploy/helm/certctl/templates/postgres-secret.yaml @@ -1,3 +1,14 @@ +{{- if .Values.postgresql.enabled }} +{{- /* + Bundle 3 closure (D1 + D2): the bundled-Postgres Secret only renders + when postgresql.enabled=true. Pre-Bundle-3 this template rendered + unconditionally with `password: "changeme"` as the fallback default — + which is exactly what the change-me-... cluster of audit findings + was about (a deployment that uses the rendered chart with default + values ships a known weak password). The Bundle-3 helper at + certctl.requiredSecrets fail-closes empty password at template time + before this template ever runs. 
+*/ -}} apiVersion: v1 kind: Secret metadata: @@ -7,6 +18,7 @@ metadata: app.kubernetes.io/component: postgres type: Opaque stringData: - password: {{ .Values.postgresql.auth.password | default "changeme" | quote }} + password: {{ required "postgresql.auth.password is required when postgresql.enabled=true (Bundle 3: no fallback default)" .Values.postgresql.auth.password | quote }} username: {{ .Values.postgresql.auth.username | quote }} database: {{ .Values.postgresql.auth.database | quote }} +{{- end }} diff --git a/deploy/helm/certctl/templates/server-deployment.yaml b/deploy/helm/certctl/templates/server-deployment.yaml index 633a091..41e790d 100644 --- a/deploy/helm/certctl/templates/server-deployment.yaml +++ b/deploy/helm/certctl/templates/server-deployment.yaml @@ -1,5 +1,6 @@ {{- include "certctl.tls.required" . }} {{- include "certctl.validateAuthType" . }} +{{- include "certctl.requiredSecrets" . }} apiVersion: apps/v1 kind: Deployment metadata: @@ -23,8 +24,13 @@ spec: checksum/secret: {{ include (print $.Template.BasePath "/server-secret.yaml") . | sha256sum }} spec: serviceAccountName: {{ include "certctl.serviceAccountName" . }} + # Bundle 3 closure (D3): pod-level fields only. The container-only + # fields (readOnlyRootFilesystem, allowPrivilegeEscalation, + # capabilities, privileged) render at container scope below — + # pre-Bundle-3 they all sat here at pod scope and the K8s API + # silently dropped them. securityContext: - {{- toYaml .Values.server.securityContext | nindent 8 }} + {{- include "certctl.podSecurityContext" .Values.server.securityContext | nindent 8 }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -33,6 +39,13 @@ spec: - name: server image: {{ include "certctl.serverImage" . }} imagePullPolicy: {{ .Values.server.image.pullPolicy }} + # Bundle 3 closure (D3): container-scope security hardening. 
+ # readOnlyRootFilesystem + allowPrivilegeEscalation + + # capabilities are container-only fields per the K8s API; the + # helper splits them out of the operator-facing + # server.securityContext map so existing values keep working. + securityContext: + {{- include "certctl.containerSecurityContext" .Values.server.securityContext | nindent 12 }} ports: - name: https containerPort: {{ .Values.server.port }} @@ -51,11 +64,16 @@ spec: secretKeyRef: name: {{ include "certctl.fullname" . }}-server key: database-url + # Bundle 3 closure (D2): POSTGRES_PASSWORD is only needed + # for the bundled-Postgres mode. External Postgres mode + # embeds the password directly in externalDatabase.url. + {{- if .Values.postgresql.enabled }} - name: POSTGRES_PASSWORD valueFrom: secretKeyRef: name: {{ include "certctl.fullname" . }}-postgres key: password + {{- end }} - name: CERTCTL_LOG_LEVEL valueFrom: configMapKeyRef: diff --git a/deploy/helm/certctl/templates/servicemonitor.yaml b/deploy/helm/certctl/templates/servicemonitor.yaml new file mode 100644 index 0000000..13ead78 --- /dev/null +++ b/deploy/helm/certctl/templates/servicemonitor.yaml @@ -0,0 +1,63 @@ +{{- /* +Bundle 3 closure (D5 + OPS-M1 docs): Prometheus Operator ServiceMonitor. + +Pre-Bundle-3 the chart had `monitoring.serviceMonitor.enabled` in +values.yaml but no template consumed it — toggling it on rendered +nothing. Audit D5 closed. + +The endpoint scrapes /api/v1/metrics/prometheus which the certctl +server already exposes in Prometheus exposition format (see +internal/api/handler/metrics.go::GetPrometheusMetrics). Note: the +endpoint is rbac-gated on `metrics.read`, so the ServiceMonitor needs +a bearer token. Operators with Prometheus Operator MUST set +`monitoring.serviceMonitor.bearerTokenSecret` pointing at a Secret +that holds an API key with the `metrics.read` permission. Without +that, scrapes return 401. 
+ +OPS-M1 caveat: the current /metrics/prometheus handler is a hand-rolled +exposition-format emitter, not prometheus/client_golang-instrumented +code. Histograms, exemplars, and target labels are limited to what the +handler computes statically. Migration to client_golang tracked in +WORKSPACE-ROADMAP.md. +*/ -}} +{{- if and .Values.monitoring.enabled .Values.monitoring.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "certctl.fullname" . }}-server + labels: + {{- include "certctl.labels" . | nindent 4 }} + app.kubernetes.io/component: server + {{- with .Values.monitoring.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "certctl.serverSelectorLabels" . | nindent 6 }} + endpoints: + - port: https + scheme: https + path: /api/v1/metrics/prometheus + interval: {{ .Values.monitoring.serviceMonitor.interval | default "30s" }} + scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout | default "10s" }} + tlsConfig: + # The certctl server uses self-signed bootstrap TLS or operator- + # provided cert-manager TLS — the ServiceMonitor consumes the + # same CA bundle the server presents. When server.tls.existingSecret + # is set, operators usually want to pull the matching ca.crt key + # out of that Secret. Adjust if your CA chain lives elsewhere. + {{- if .Values.monitoring.serviceMonitor.tlsConfig }} + {{- toYaml .Values.monitoring.serviceMonitor.tlsConfig | nindent 8 }} + {{- else }} + insecureSkipVerify: true + {{- end }} + {{- with .Values.monitoring.serviceMonitor.bearerTokenSecret }} + bearerTokenSecret: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.monitoring.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml index b5e49ab..9adc2da 100644 --- a/deploy/helm/certctl/values.yaml +++ b/deploy/helm/certctl/values.yaml @@ -272,6 +272,34 @@ server: # secret: # secretName: ca-cert +# ============================================================================== +# External Database Configuration (Bundle 3 closure / D2 + OPS-L2) +# ============================================================================== +# When postgresql.enabled=false, the chart skips the bundled StatefulSet + +# Secret + Service and instead consumes the URL below verbatim as the +# server's CERTCTL_DATABASE_URL. The URL embeds username, password, +# host, port, database, and sslmode — operators are responsible for +# rotating credentials in this string out-of-band (Kubernetes Secret + +# helm upgrade is the supported pattern). +# +# Recommended sslmode for managed Postgres (RDS, Cloud SQL, Azure DB): +# verify-full — PCI-DSS Req 4 v4.0 §2.2.5 compliant; requires CA bundle. +# Mount the CA via server.volumes / server.volumeMounts and +# set sslrootcert=/path/in/pod/ca.crt in the URL. +# +# Example values overrides: +# postgresql.enabled: false +# externalDatabase.url: "postgres://certctl:HUNTER2@db.example.com:5432/certctl?sslmode=verify-full" +# +# Migration from the legacy `server.env.CERTCTL_DATABASE_URL` workaround: +# both still work (env block overrides the helper-emitted Secret value at +# pod-spec level), but the new path renders cleaner manifests with no +# stranded postgres-* templates. +externalDatabase: + # Connection string used when postgresql.enabled=false. + # Required in that mode — see certctl.requiredSecrets helper. 
+ url: "" + # ============================================================================== # PostgreSQL Configuration # ============================================================================== @@ -510,10 +538,26 @@ rbac: create: true # ============================================================================== -# Kubernetes Secrets Target Connector +# Kubernetes Secrets Target Connector (PREVIEW — Bundle 3 closure / C3) # ============================================================================== +# Bundle 3 audit closure (C3): the connector framework at +# internal/connector/target/k8ssecret/ ships the Config + interface + +# 14 unit tests, but the production K8s client at +# k8ssecret.go::realK8sClient is documented as "a stub placeholder for +# the real k8s.io/client-go implementation". The repo does not import +# k8s.io/client-go (verified via `grep -n "client-go" go.mod`), so the +# connector cannot deploy to a real cluster today. +# +# Setting kubernetesSecrets.enabled=true wires up the RBAC verbs the +# real client will need (get/create/update/patch/delete on Secrets) +# without making the connector functional — operators trying to use it +# get the stub's error and a pointer to this note. +# +# Status: PREVIEW. Production client lands when the cluster-management +# bundle ships (tracked in WORKSPACE-ROADMAP.md). Until then, +# in-cluster deploys use the file-based connectors (NGINX, Apache, +# HAProxy, etc.) via a Pod-mounted Secret + DaemonSet agent. 
kubernetesSecrets: - # Enable RBAC rules for managing TLS Secrets enabled: false # ============================================================================== @@ -527,6 +571,13 @@ podDisruptionBudget: # ============================================================================== # Monitoring Configuration # ============================================================================== +# Bundle 3 closure (D5): the ServiceMonitor template at +# templates/servicemonitor.yaml renders when both monitoring.enabled=true +# AND monitoring.serviceMonitor.enabled=true. The endpoint scrapes +# /api/v1/metrics/prometheus, which is rbac-gated on `metrics.read` — +# operators MUST provide a bearer token via +# monitoring.serviceMonitor.bearerTokenSecret pointing at a Secret with +# an API key holding that permission. Without the token, scrapes 401. monitoring: enabled: false # Prometheus ServiceMonitor @@ -534,8 +585,53 @@ monitoring: enabled: false interval: 30s scrapeTimeout: 10s + # Additional labels applied to the ServiceMonitor metadata. # labels: {} - # selector: {} + # Bearer-token Secret reference (required when the certctl server's + # /api/v1/metrics/prometheus endpoint is gated by api-key auth). + # Example: + # bearerTokenSecret: + # name: certctl-prometheus-key + # key: api-key + # bearerTokenSecret: {} + # TLS config for the scrape endpoint. The certctl server presents + # the same TLS cert the rest of the chart uses; insecureSkipVerify + # defaults to true so demos work out of the box. Production deploys + # should pin the CA via caFile or ca.secret. + # tlsConfig: + # caFile: /etc/prometheus/secrets/certctl-ca/ca.crt + # serverName: certctl-server + # tlsConfig: {} + # Optional relabeling for the scrape job. 
+ # relabelings: [] + +# ============================================================================== +# Network Policy (Bundle 3 closure / D11) +# ============================================================================== +# Default off so existing deploys don't suddenly lose network reach. +# When enabled, restricts the server pod to: +# - Ingress: from in-namespace agent pods only. +# - Egress: kube-dns + bundled Postgres (if enabled). +# Operators add CA / OIDC / SMTP egress via extraEgress. +networkPolicy: + enabled: false + # Additional Ingress rules merged into the policy. Each entry is a + # raw networking.k8s.io/v1 NetworkPolicyIngressRule. + extraIngress: [] + # Additional Egress rules merged into the policy. Common operator + # need: 443/TCP to an OIDC issuer, 443/TCP to a public CA endpoint, + # 25/TCP to an SMTP relay. + # Example: + # extraEgress: + # - to: + # - ipBlock: + # cidr: 0.0.0.0/0 + # except: + # - 10.0.0.0/8 + # ports: + # - protocol: TCP + # port: 443 + extraEgress: [] # ============================================================================== # Advanced Configuration diff --git a/internal/repository/postgres/db.go b/internal/repository/postgres/db.go index 361db7d..31ee403 100644 --- a/internal/repository/postgres/db.go +++ b/internal/repository/postgres/db.go @@ -23,17 +23,54 @@ import ( const pgErrInvalidPassword = "28P01" // NewDB opens a PostgreSQL database connection and sets up connection pooling. +// +// The pool size is hard-coded here for backward compatibility with call +// sites that don't pass an explicit limit. The Bundle 3 closure (D12) +// added NewDBWithMaxConns so the operator-facing CERTCTL_DATABASE_MAX_CONNS +// env var actually flows through to db.SetMaxOpenConns; cmd/server/main.go +// uses NewDBWithMaxConns now. Idle conns are kept at MaxOpenConns/5 +// (rounded up to at least 1) so the ratio behavior pre-Bundle-3 (5 idle +// out of 25 max) generalizes to any operator-supplied limit. 
func NewDB(connStr string) (*sql.DB, error) { + return NewDBWithMaxConns(connStr, 25) +} + +// NewDBWithMaxConns opens a PostgreSQL database connection and sets the +// connection-pool max-open + max-idle limits. +// +// Bundle 3 closure (D12): pre-Bundle-3 the pool was hard-coded to +// SetMaxOpenConns(25) regardless of the operator's +// CERTCTL_DATABASE_MAX_CONNS setting. The config field was loaded by +// internal/config/config.go (validated to be >= 1), wired into values.yaml's +// `CERTCTL_DATABASE_MAX_CONNS: "25"` example comment, AND surfaced in +// docs/reference/configuration.md as if it took effect — but the +// underlying pool ignored it. Operators tuning for higher load found +// no behavioral change. +// +// Post-Bundle-3 the operator-facing knob actually flows here. The +// idle-pool size scales with maxOpen so the historical 1:5 ratio +// (5 idle of 25 max) carries forward to any operator-supplied size. +// Floors: maxOpen guaranteed >= 1 by config.Validate; maxIdle floored +// to 1 to avoid the 0-idle pathological case where every query opens +// a fresh connection. +func NewDBWithMaxConns(connStr string, maxOpen int) (*sql.DB, error) { + if maxOpen < 1 { + maxOpen = 1 + } db, err := sql.Open("postgres", connStr) if err != nil { return nil, fmt.Errorf("failed to open database: %w", err) } - // Configure connection pool - db.SetMaxOpenConns(25) - db.SetMaxIdleConns(5) + // Configure connection pool. + db.SetMaxOpenConns(maxOpen) + maxIdle := maxOpen / 5 + if maxIdle < 1 { + maxIdle = 1 + } + db.SetMaxIdleConns(maxIdle) - // Ping to verify connection + // Ping to verify connection. 
if err := db.Ping(); err != nil { return nil, wrapPingError(err) } diff --git a/scripts/ci-guards/B3-helm-chart-coherence.sh b/scripts/ci-guards/B3-helm-chart-coherence.sh new file mode 100755 index 0000000..b8f7de4 --- /dev/null +++ b/scripts/ci-guards/B3-helm-chart-coherence.sh @@ -0,0 +1,161 @@ +#!/usr/bin/env bash +# scripts/ci-guards/B3-helm-chart-coherence.sh +# +# Bundle 3 closure (2026-05-12) — Helm chart coherence guard. +# +# Catches regressions in the chart-truth surface the Bundle 3 closure +# locked in: +# +# 1. README's Helm install example must use the canonical +# `postgresql.auth.password` key (audit C2). The pre-Bundle-3 +# example used the wrong `postgresql.password` shape. +# +# 2. The chart renders all 5 advertised production modes: +# - default (TLS existingSecret + secrets) +# - external Postgres (postgresql.enabled=false + externalDatabase.url) +# - cert-manager TLS (NOTE: this guard has no standalone cert-manager render check; the mode is only touched by the both-TLS-set rejection below) +# - production hardening (NetworkPolicy + PDB + ServiceMonitor) +# - both-TLS-set is REJECTED (D7) +# +# 3. The chart still fails fast on missing required secrets (D1): +# - server.auth.apiKey empty when type=api-key +# - postgresql.auth.password empty when postgresql.enabled=true +# - externalDatabase.url empty when postgresql.enabled=false +# +# 4. The bundled-Postgres Secret, StatefulSet, and Service templates do NOT render when +# postgresql.enabled=false (D2 / clean external mode). +# +# Per the contract documented in scripts/ci-guards/README.md: +# bare callable, no args, no env, exit 0 on clean. Skips quietly when +# `helm` is not on PATH (developer workstations without Helm installed), +# but the GH Actions runner always has it. + +set -e + +GUARD_NAME="B3-helm-chart-coherence" +CHART="deploy/helm/certctl/" +README="README.md" + +if ! command -v helm > /dev/null 2>&1; then + echo "${GUARD_NAME}: helm not on PATH — skipping (install helm ≥ 3.13 to enable locally)."
+ exit 0 +fi + +failed=0 + +# Check 1 — README Helm install command uses postgresql.auth.password, +# never the pre-Bundle-3 postgresql.password shape. +if grep -nE -- '--set\s+postgresql\.password=' "$README"; then + echo "::error file=${README}::Bundle 3 audit C2 regression: README references --set postgresql.password=... — the canonical key is postgresql.auth.password (matches values.yaml + Bitnami-style chart). Update the install command." + failed=1 +fi + +# Check 2 — production-mode renders pass. We use a tmp dir so partial +# failures don't leave stray files. +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT + +# Default mode. +if ! helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + > "$TMP/default.yaml" 2> "$TMP/default.err"; then + echo "::error file=${CHART}::B3 regression: default mode (TLS + secrets) fails to render." + cat "$TMP/default.err" + failed=1 +fi + +# External Postgres mode. +if ! helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.enabled=false \ + --set externalDatabase.url='postgres://u:p@h:5432/db?sslmode=require' \ + --set server.auth.apiKey=k \ + > "$TMP/external.yaml" 2> "$TMP/external.err"; then + echo "::error file=${CHART}::B3 regression: external Postgres mode fails to render." + cat "$TMP/external.err" + failed=1 +fi + +# Bundle 3 D2 check: bundled-Postgres Secret + StatefulSet + Service +# must NOT appear in external-Postgres render. +for resource in StatefulSet "postgres-secret.yaml" "postgres-service.yaml"; do + if grep -q "$resource" "$TMP/external.yaml" 2>/dev/null; then + echo "::error file=${CHART}::B3 regression (D2): external-Postgres render still emits $resource. postgresql.enabled=false must skip ALL postgres-* templates." + failed=1 + fi +done + +# Production hardening mode. +if ! 
helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + --set server.replicas=3 \ + --set monitoring.enabled=true \ + --set monitoring.serviceMonitor.enabled=true \ + --set podDisruptionBudget.enabled=true \ + --set networkPolicy.enabled=true \ + > "$TMP/prod.yaml" 2> "$TMP/prod.err"; then + echo "::error file=${CHART}::B3 regression: production hardening mode fails to render." + cat "$TMP/prod.err" + failed=1 +fi + +# Bundle 3 D5 + D11 check: production hardening render MUST include +# ServiceMonitor + PodDisruptionBudget + NetworkPolicy. +for kind in ServiceMonitor PodDisruptionBudget NetworkPolicy; do + if ! grep -q "^kind: $kind\$" "$TMP/prod.yaml" 2>/dev/null; then + echo "::error file=${CHART}::B3 regression: production hardening render is missing kind: $kind." + failed=1 + fi +done + +# Check 3 — D7 TLS both-set rejection. +if helm template c "$CHART" \ + --set server.tls.existingSecret=existing \ + --set server.tls.certManager.enabled=true \ + --set server.tls.certManager.issuerRef.name=foo \ + --set postgresql.auth.password=p \ + --set server.auth.apiKey=k \ + > /dev/null 2> "$TMP/both-tls.err"; then + echo "::error file=${CHART}::B3 regression (D7): TLS both-set rendered successfully. Chart must refuse when existingSecret AND certManager.enabled are both populated." + failed=1 +fi + +# Check 4 — D1 fail-fast on missing apiKey. +if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.auth.password=p \ + > /dev/null 2> "$TMP/missing-apikey.err"; then + echo "::error file=${CHART}::B3 regression (D1): missing server.auth.apiKey rendered successfully when auth.type=api-key. Chart must refuse." + failed=1 +fi + +# Check 5 — D1 fail-fast on missing postgres password (bundled mode). 
+if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set server.auth.apiKey=k \ + > /dev/null 2> "$TMP/missing-pg.err"; then + echo "::error file=${CHART}::B3 regression (D1): missing postgresql.auth.password rendered successfully when postgresql.enabled=true. Chart must refuse." + failed=1 +fi + +# Check 6 — D1 fail-fast on missing external DB URL. +if helm template c "$CHART" \ + --set server.tls.existingSecret=ci \ + --set postgresql.enabled=false \ + --set server.auth.apiKey=k \ + > /dev/null 2> "$TMP/missing-extdb.err"; then + echo "::error file=${CHART}::B3 regression (D1): missing externalDatabase.url rendered successfully when postgresql.enabled=false. Chart must refuse." + failed=1 +fi + +if [ "$failed" -ne 0 ]; then + echo "" + echo "${GUARD_NAME}: FAILED — Helm chart coherence regression." + exit 1 +fi + +echo "${GUARD_NAME}: clean (default + external-Postgres + cert-manager + production hardening + 3 fail-fast gates all green)."