From d7546aedcae76509640ff8003724de04074ab73b Mon Sep 17 00:00:00 2001
From: shankar0123 <skreddy040@gmail.com>
Date: Sat, 16 May 2026 19:44:48 +0000
Subject: [PATCH] =?UTF-8?q?fix(helm):=20DEPL-004=20=E2=80=94=20ServiceMoni?=
 =?UTF-8?q?tor=20TLS=20default=20flipped=20to=20fail-closed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Acquisition-audit DEPL-004 closure (Sprint 6 ACQ, 2026-05-16).

Pre-2026-05-16, monitoring.serviceMonitor.tlsConfig in values.yaml
was empty by default, and the ServiceMonitor template fell through
to an implicit `insecureSkipVerify: true` else-branch. Operators
opting into the ServiceMonitor (monitoring.serviceMonitor.enabled=true)
got no Prometheus TLS verification by default — in-cluster scrapes
tolerate this, out-of-cluster scrapes silently skip the chain check.

The template now emits a fail-closed `{{ required ... }}` message
at `helm template` / `helm upgrade` time if neither a real verify
nor an explicit opt-back is supplied. The error string lists both
escape hatches and the docs cross-link, so the operator sees the
fix in the same line they hit the error.

Operators with monitoring.serviceMonitor.enabled=false (the chart
default): no action required — the template short-circuits before
the tlsConfig block. Operators who had ServiceMonitor on with no
tlsConfig set: helm upgrade will fail until they supply either
{ caFile: ..., serverName: ... } (production-shaped) or
{ insecureSkipVerify: true } (operator-acknowledged opt-back).

Files
=====
- deploy/helm/certctl/templates/servicemonitor.yaml: replace the
  else-branch insecureSkipVerify default with a {{ required ... }}
  Helm builtin that fails the render with a clear remediation
  message pointing at both escape hatches and docs/operator/
  helm-deployment.md
- deploy/helm/certctl/values.yaml: rewrite the tlsConfig comment
  block to document the new fail-closed posture + both upgrade
  paths (production verify vs operator-acknowledged opt-back)
- docs/operator/helm-deployment.md: new "2026-05-16 — ServiceMonitor
  TLS default flipped (DEPL-004)" subsection in the existing
  Upgrade section with the two operator-action recipes
---
 .../certctl/templates/servicemonitor.yaml     | 22 +++++++++++-----
 deploy/helm/certctl/values.yaml               | 24 ++++++++++++++----
 docs/operator/helm-deployment.md              | 25 +++++++++++++++++++
 3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/deploy/helm/certctl/templates/servicemonitor.yaml b/deploy/helm/certctl/templates/servicemonitor.yaml
index 13ead78..b7abc7c 100644
--- a/deploy/helm/certctl/templates/servicemonitor.yaml
+++ b/deploy/helm/certctl/templates/servicemonitor.yaml
@@ -42,15 +42,25 @@ spec:
       interval: {{ .Values.monitoring.serviceMonitor.interval | default "30s" }}
       scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout | default "10s" }}
       tlsConfig:
-        # The certctl server uses self-signed bootstrap TLS or operator-
-        # provided cert-manager TLS — the ServiceMonitor consumes the
-        # same CA bundle the server presents. When server.tls.existingSecret
-        # is set, operators usually want to pull the matching ca.crt key
-        # out of that Secret. Adjust if your CA chain lives elsewhere.
+        # Acquisition-audit DEPL-004 closure (Sprint 6 ACQ, 2026-05-16).
+        # The default flipped from `insecureSkipVerify: true` to a
+        # fail-closed posture: operators MUST either supply a
+        # `monitoring.serviceMonitor.tlsConfig` block (caFile / ca /
+        # serverName for a real TLS verify) or opt back in explicitly
+        # with `tlsConfig: { insecureSkipVerify: true }`. The {{ required }}
+        # check below renders an error at `helm template` / `helm upgrade`
+        # time if neither is supplied, surfacing the misconfiguration
+        # before the ServiceMonitor lands in-cluster.
+        #
+        # In-cluster scrapes from a Prometheus pod that already trusts the
+        # certctl CA (via existingSecret + cert-manager) keep working with
+        # zero operator action — they just point at the right caFile.
+        # Out-of-cluster Prometheus deployments now require the operator
+        # to surface the trust decision explicitly.
         {{- if .Values.monitoring.serviceMonitor.tlsConfig }}
         {{- toYaml .Values.monitoring.serviceMonitor.tlsConfig | nindent 8 }}
         {{- else }}
-        insecureSkipVerify: true
+        {{- required "monitoring.serviceMonitor.tlsConfig is required when monitoring.serviceMonitor.enabled=true (Sprint 6 ACQ DEPL-004 closure, 2026-05-16). Supply { caFile: \"/etc/prometheus/secrets/.../ca.crt\", serverName: \"certctl-server\" } to verify against your CA, or { insecureSkipVerify: true } to opt back into the pre-2026-05-16 default. See docs/operator/helm-deployment.md for the upgrade-path note." nil }}
         {{- end }}
       {{- with .Values.monitoring.serviceMonitor.bearerTokenSecret }}
       bearerTokenSecret:
diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml
index d93c09c..28d6828 100644
--- a/deploy/helm/certctl/values.yaml
+++ b/deploy/helm/certctl/values.yaml
@@ -680,14 +680,28 @@ monitoring:
     #     name: certctl-prometheus-key
     #     key: api-key
     # bearerTokenSecret: {}
-    # TLS config for the scrape endpoint. The certctl server presents
-    # the same TLS cert the rest of the chart uses; insecureSkipVerify
-    # defaults to true so demos work out of the box. Production deploys
-    # should pin the CA via caFile or ca.secret.
+    # TLS config for the scrape endpoint. Acquisition-audit DEPL-004
+    # closure (Sprint 6 ACQ, 2026-05-16): the default flipped from
+    # `insecureSkipVerify: true` to fail-closed. Operators MUST supply
+    # tlsConfig — either a real verify (caFile / ca / serverName) for
+    # production, or explicit `{ insecureSkipVerify: true }` to opt
+    # back into the pre-2026-05-16 default. The ServiceMonitor template
+    # `{{ required ... }}` guard surfaces missing tlsConfig at chart-
+    # render time before it lands in-cluster.
+    #
+    # In-cluster Prometheus that already trusts the certctl CA via
+    # the chart's existingSecret / cert-manager-emitted bundle: point
+    # caFile at that path (typically /etc/prometheus/secrets/<name>/ca.crt
+    # once you mount the Secret into the Prometheus pod).
+    #
+    # Production-shaped example (verify against the chart's CA):
     # tlsConfig:
     #   caFile: /etc/prometheus/secrets/certctl-ca/ca.crt
     #   serverName: certctl-server
-    # tlsConfig: {}
+    #
+    # Demo / dev-cluster escape hatch (operator-acknowledged):
+    # tlsConfig:
+    #   insecureSkipVerify: true
     # Optional relabeling for the scrape job.
     # relabelings: []
 
diff --git a/docs/operator/helm-deployment.md b/docs/operator/helm-deployment.md
index 29b6c23..d83e4ec 100644
--- a/docs/operator/helm-deployment.md
+++ b/docs/operator/helm-deployment.md
@@ -94,6 +94,31 @@ helm upgrade certctl deploy/helm/certctl/ \
 
 Postgres state survives the upgrade (the PVC is retained). The server / agent images bump per the chart's `image.tag`. See [`docs/archive/upgrades/`](../archive/upgrades/) for version-specific upgrade guidance.
 
+### 2026-05-16 — ServiceMonitor TLS default flipped (DEPL-004)
+
+Acquisition-audit DEPL-004 closure. `monitoring.serviceMonitor.tlsConfig` was previously empty by default and the chart template fell through to `insecureSkipVerify: true`. Post-2026-05-16, the template emits a `{{ required ... }}` fail-closed message at `helm template` / `helm upgrade` time if neither a real verify nor an explicit opt-back is supplied.
+
+Operators with `monitoring.serviceMonitor.enabled: true` MUST set one of:
+
+```yaml
+# A. Real TLS verify against the chart's CA (production-shaped).
+monitoring:
+  serviceMonitor:
+    enabled: true
+    tlsConfig:
+      caFile: /etc/prometheus/secrets/certctl-ca/ca.crt
+      serverName: certctl-server
+
+# B. Demo / dev-cluster — operator-acknowledged opt-back to pre-flip default.
+monitoring:
+  serviceMonitor:
+    enabled: true
+    tlsConfig:
+      insecureSkipVerify: true
+```
+
+Operators with `monitoring.serviceMonitor.enabled: false` (the chart default) need no action — the template short-circuits before the `tlsConfig` block.
+
 ## Configuration reference
 
 Every value is documented at `deploy/helm/certctl/values.yaml`. Common tweaks: