feat(metrics): extend /metrics/prometheus with per-area OCSP counters (Phase 8)

Production hardening II Phase 8 — surface the OCSP per-event counters shipped in Phase 1+2 through the existing /api/v1/metrics/prometheus endpoint. Operators now alert on certctl_ocsp_counter_total {label="rate_limited"} (Phase 3 trip), {label="nonce_malformed"} (Phase 1 reject), {label="signing_failed"} (issuer connector fails), etc. NEW interface CounterSnapshotter (handler/metrics.go) — minimum surface the Prometheus exposer needs from any per-area counter table: just Snapshot() map[string]uint64. service.OCSPCounters.Snapshot (Phase 1) satisfies it; future per-area counters (CRL, cert-export, EST per-profile, SCEP per-profile, Intune per-profile) plug in the same way as separate SetXxxCounters setters. Naming convention per frozen decision 0.10: certctl_<area>_counter_total{label="<event>"} <value> This commit ships only the OCSP block. The remaining areas (CRL, cert-export, EST, SCEP, Intune) plug in via the same SetXxxCounters pattern in follow-up commits — the wire-up cost per area is one new field + one setter + one block of fmt.Fprintf lines. The bundle's S-1 docs-count guard means we don't claim a specific total in prose; operators run `curl /api/v1/metrics/prometheus | grep certctl_` to enumerate. Wired in cmd/server/main.go: a single shared *service.OCSPCounters instance is created once and passed to BOTH the ocspResponseCacheService (so the cache hot path ticks counters) AND metricsHandler.SetOCSPCounters (so the Prometheus exposer reads them). Existing dashboard metrics (certctl_certificate_total, certctl_agent_total, etc.) remain unchanged at the same line offsets — back-compat preserved. Pre-commit verification: go build ./... clean; go test -short -count=1 green for handler/ + service/. The existing TestGetPrometheusMetrics_Success tests still pass (the new counter block is additive at the END of the response body, after the existing dashboard metrics + uptime line).
2026-08-06 11:07:48 +00:00 · 2026-04-30 05:15:05 +00:00
parent 8cba794723
commit 2d83342bbe
2 changed files with 55 additions and 1 deletions
@@ -15,12 +15,27 @@ type MetricsService interface {
 	GetDashboardSummary(ctx context.Context) (interface{}, error)
 }

+// CounterSnapshotter is the minimum surface MetricsHandler consumes
+// from a counter table for the Prometheus exposer. The OCSPCounters
+// type in internal/service satisfies this; future per-area counter
+// tables (CRL, cert-export, EST, SCEP, Intune) plug in the same way.
+//
+// Production hardening II Phase 8.
+type CounterSnapshotter interface {
+	Snapshot() map[string]uint64
+}
+
 // MetricsHandler handles HTTP requests for metrics.
 // Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format
 // (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc.
 type MetricsHandler struct {
 	svc           MetricsService
 	serverStarted time.Time
+	// Production hardening II Phase 8 — per-area counter snapshotters.
+	// nil values omit the corresponding metric block; cmd/server/main.go
+	// wires the instances at startup. The naming convention is
+	// certctl_<area>_counter_total{label="<event>"} per frozen decision 0.10.
+	ocspCounters CounterSnapshotter
 }

 // NewMetricsHandler creates a new MetricsHandler with a service dependency.
@@ -32,6 +47,13 @@ func NewMetricsHandler(svc MetricsService, serverStarted time.Time) MetricsHandl
 	}
 }

+// SetOCSPCounters wires the OCSP counter table for the per-area metric
+// block in the Prometheus exposition. An untyped nil disables the block;
+// a typed nil pointer does not. Production hardening II Phase 8.
+func (h *MetricsHandler) SetOCSPCounters(c CounterSnapshotter) {
+	h.ocspCounters = c
+}
+
 // MetricsResponse represents the JSON metrics response for V2.
 type MetricsResponse struct {
 	Gauge   MetricsGauge   `json:"gauge"`
@@ -222,6 +244,28 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ
 	fmt.Fprintf(w, "# HELP certctl_uptime_seconds Server uptime in seconds.\n")
 	fmt.Fprintf(w, "# TYPE certctl_uptime_seconds gauge\n")
 	fmt.Fprintf(w, "certctl_uptime_seconds %d\n", uptimeSeconds)
+
+	// Production hardening II Phase 8 — per-area counters. Each block
+	// is nil-guarded so a deploy without the wire still produces clean
+	// output (just the legacy dashboard metrics above). Naming
+	// convention: certctl_<area>_counter_total{label="<event>"} per
+	// frozen decision 0.10.
+	if h.ocspCounters != nil {
+		fmt.Fprintf(w, "\n# HELP certctl_ocsp_counter_total OCSP responder per-event counters (production hardening II Phase 8).\n")
+		fmt.Fprintf(w, "# TYPE certctl_ocsp_counter_total counter\n")
+		snap := h.ocspCounters.Snapshot()
+		// Emit in a deterministic order so the output diff is stable
+		// across requests (helps operators spot drift in dashboard
+		// snapshots).
+		labels := []string{
+			"request_get", "request_post", "request_success", "request_invalid",
+			"issuer_not_found", "cert_not_found", "signing_failed",
+			"nonce_echoed", "nonce_malformed", "rate_limited",
+		}
+		for _, lbl := range labels {
+			fmt.Fprintf(w, "certctl_ocsp_counter_total{label=%q} %d\n", lbl, snap[lbl])
+		}
+	}
 }

 // DashboardSummary mirrors the service.DashboardSummary for JSON unmarshaling.