diff --git a/deploy/helm/certctl/templates/servicemonitor.yaml b/deploy/helm/certctl/templates/servicemonitor.yaml index 13ead78..b7abc7c 100644 --- a/deploy/helm/certctl/templates/servicemonitor.yaml +++ b/deploy/helm/certctl/templates/servicemonitor.yaml @@ -42,15 +42,25 @@ spec: interval: {{ .Values.monitoring.serviceMonitor.interval | default "30s" }} scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout | default "10s" }} tlsConfig: - # The certctl server uses self-signed bootstrap TLS or operator- - # provided cert-manager TLS — the ServiceMonitor consumes the - # same CA bundle the server presents. When server.tls.existingSecret - # is set, operators usually want to pull the matching ca.crt key - # out of that Secret. Adjust if your CA chain lives elsewhere. + # Acquisition-audit DEPL-004 closure (Sprint 6 ACQ, 2026-05-16). + # The default flipped from `insecureSkipVerify: true` to a + # fail-closed posture: operators MUST either supply a + # `monitoring.serviceMonitor.tlsConfig` block (caFile / ca / + # serverName for a real TLS verify) or opt back in explicitly + # with `tlsConfig: { insecureSkipVerify: true }`. The {{ required }} + # check below renders an error at `helm template` / `helm upgrade` + # time if neither is supplied, surfacing the misconfiguration + # before the ServiceMonitor lands in-cluster. + # + # In-cluster scrapes from a Prometheus pod that already trusts the + # certctl CA (via existingSecret + cert-manager) keep working with + # zero operator action — they just point at the right caFile. + # Out-of-cluster Prometheus deployments now require the operator + # to surface the trust decision explicitly. {{- if .Values.monitoring.serviceMonitor.tlsConfig }} {{- toYaml .Values.monitoring.serviceMonitor.tlsConfig | nindent 8 }} {{- else }} - insecureSkipVerify: true + {{- required "monitoring.serviceMonitor.tlsConfig is required when monitoring.serviceMonitor.enabled=true (Sprint 6 ACQ DEPL-004 closure, 2026-05-16). Supply { caFile: \"/etc/prometheus/secrets/.../ca.crt\", serverName: \"certctl-server\" } to verify against your CA, or { insecureSkipVerify: true } to opt back into the pre-2026-05-16 default. See docs/operator/helm-deployment.md for the upgrade-path note." nil }} {{- end }} {{- with .Values.monitoring.serviceMonitor.bearerTokenSecret }} bearerTokenSecret: diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml index d93c09c..28d6828 100644 --- a/deploy/helm/certctl/values.yaml +++ b/deploy/helm/certctl/values.yaml @@ -680,14 +680,28 @@ monitoring: # name: certctl-prometheus-key # key: api-key # bearerTokenSecret: {} - # TLS config for the scrape endpoint. The certctl server presents - # the same TLS cert the rest of the chart uses; insecureSkipVerify - # defaults to true so demos work out of the box. Production deploys - # should pin the CA via caFile or ca.secret. + # TLS config for the scrape endpoint. Acquisition-audit DEPL-004 + # closure (Sprint 6 ACQ, 2026-05-16): the default flipped from + # `insecureSkipVerify: true` to fail-closed. Operators MUST supply + # tlsConfig — either a real verify (caFile / ca / serverName) for + # production, or explicit `{ insecureSkipVerify: true }` to opt + # back into the pre-2026-05-16 default. The ServiceMonitor template + # `{{ required ... }}` guard surfaces missing tlsConfig at chart- + # render time before it lands in-cluster. + # + # In-cluster Prometheus that already trusts the certctl CA via + # the chart's existingSecret / cert-manager-emitted bundle: point + # caFile at that path (typically /etc/prometheus/secrets//ca.crt + # once you mount the Secret into the Prometheus pod). + # + # Production-shaped example (verify against the chart's CA): # tlsConfig: # caFile: /etc/prometheus/secrets/certctl-ca/ca.crt # serverName: certctl-server - # tlsConfig: {} + # + # Demo / dev-cluster escape hatch (operator-acknowledged): + # tlsConfig: + # insecureSkipVerify: true # Optional relabeling for the scrape job. # relabelings: [] diff --git a/docs/operator/helm-deployment.md b/docs/operator/helm-deployment.md index 29b6c23..d83e4ec 100644 --- a/docs/operator/helm-deployment.md +++ b/docs/operator/helm-deployment.md @@ -94,6 +94,31 @@ helm upgrade certctl deploy/helm/certctl/ \ Postgres state survives the upgrade (the PVC is retained). The server / agent images bump per the chart's `image.tag`. See [`docs/archive/upgrades/`](../archive/upgrades/) for version-specific upgrade guidance. +### 2026-05-16 — ServiceMonitor TLS default flipped (DEPL-004) + +Acquisition-audit DEPL-004 closure. `monitoring.serviceMonitor.tlsConfig` was previously empty by default and the chart template fell through to `insecureSkipVerify: true`. Post-2026-05-16, the template emits a `{{ required ... }}` fail-closed message at `helm template` / `helm upgrade` time if neither a real verify nor an explicit opt-back is supplied. + +Operators with `monitoring.serviceMonitor.enabled: true` MUST set one of: + +```yaml +# A. Real TLS verify against the chart's CA (production-shaped). +monitoring: + serviceMonitor: + enabled: true + tlsConfig: + caFile: /etc/prometheus/secrets/certctl-ca/ca.crt + serverName: certctl-server + +# B. Demo / dev-cluster — operator-acknowledged opt-back to pre-flip default. +monitoring: + serviceMonitor: + enabled: true + tlsConfig: + insecureSkipVerify: true +``` + +Operators with `monitoring.serviceMonitor.enabled: false` (the chart default) need no action — the template short-circuits before the `tlsConfig` block. + ## Configuration reference Every value is documented at `deploy/helm/certctl/values.yaml`. Common tweaks: