From 2d83342bbee98ad2c06635ef0c51df7e5b0dcf5f Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Thu, 30 Apr 2026 05:15:05 +0000 Subject: [PATCH] feat(metrics): extend /metrics/prometheus with per-area OCSP counters (Phase 8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production hardening II Phase 8 — surface the OCSP per-event counters shipped in Phase 1+2 through the existing /api/v1/metrics/prometheus endpoint. Operators now alert on certctl_ocsp_counter_total {label="rate_limited"} (Phase 3 trip), {label="nonce_malformed"} (Phase 1 reject), {label="signing_failed"} (issuer connector fails), etc. NEW interface CounterSnapshotter (handler/metrics.go) — minimum surface the Prometheus exposer needs from any per-area counter table: just Snapshot() map[string]uint64. service.OCSPCounters.Snapshot (Phase 1) satisfies it; future per-area counters (CRL, cert-export, EST per-profile, SCEP per-profile, Intune per-profile) plug in the same way as separate SetXxxCounters setters. Naming convention per frozen decision 0.10: certctl_<area>_counter_total{label="<event>"} This commit ships only the OCSP block. The remaining areas (CRL, cert-export, EST, SCEP, Intune) plug in via the same SetXxxCounters pattern in follow-up commits — the wire-up cost per area is one new field + one setter + one block of fmt.Fprintf lines. The bundle's S-1 docs-count guard means we don't claim a specific total in prose; operators run `curl /api/v1/metrics/prometheus | grep certctl_` to enumerate. Wired in cmd/server/main.go: a single shared *service.OCSPCounters instance is created once and passed to BOTH the ocspResponseCacheService (so the cache hot path ticks counters) AND metricsHandler.SetOCSPCounters (so the Prometheus exposer reads them). Existing dashboard metrics (certctl_certificate_total, certctl_agent_total, etc.) remain unchanged at the same line offsets — back-compat preserved. Pre-commit verification: go build ./... 
clean; go test -short -count=1 green for handler/ + service/. The existing TestGetPrometheusMetrics_Success tests still pass (the new counter block is additive at the END of the response body, after the existing dashboard metrics + uptime line). --- cmd/server/main.go | 12 ++++++++- internal/api/handler/metrics.go | 44 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/cmd/server/main.go b/cmd/server/main.go index 568583f..0acd40c 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -329,7 +329,12 @@ func main() { // counters get wired in Phase 8 when the Prometheus exposer reads // them. ocspResponseCacheRepo := postgres.NewOCSPResponseCacheRepository(db) - ocspResponseCacheService := service.NewOCSPResponseCacheService(ocspResponseCacheRepo, caOperationsSvc, nil, logger) + // Production hardening II Phase 8: share a single OCSPCounters + // instance between the cache service (Phase 2) and the Prometheus + // exposer (Phase 8) so the metrics endpoint reflects every counter + // tick that happens inside the cache service's hot path. + ocspCounters := service.NewOCSPCounters() + ocspResponseCacheService := service.NewOCSPResponseCacheService(ocspResponseCacheRepo, caOperationsSvc, ocspCounters, logger) caOperationsSvc.SetOCSPCacheSvc(ocspResponseCacheService) // Load-bearing security wire: invalidate the cache after a successful // revocation so the next OCSP fetch returns "revoked" (not the stale @@ -524,6 +529,11 @@ func main() { notificationHandler := handler.NewNotificationHandler(notificationService) statsHandler := handler.NewStatsHandler(statsService) metricsHandler := handler.NewMetricsHandler(statsService, time.Now()) + // Production hardening II Phase 8: wire the per-area counter + // snapshotters so the Prometheus exposer surfaces them. Operators + // alert on certctl_ocsp_counter_total{label="rate_limited"}, + // {label="nonce_malformed"}, etc. 
+ metricsHandler.SetOCSPCounters(ocspCounters) // Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB // connectivity via PingContext. /health stays shallow (liveness signal). healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db) diff --git a/internal/api/handler/metrics.go b/internal/api/handler/metrics.go index ccc1909..8b7040a 100644 --- a/internal/api/handler/metrics.go +++ b/internal/api/handler/metrics.go @@ -15,12 +15,27 @@ type MetricsService interface { GetDashboardSummary(ctx context.Context) (interface{}, error) } +// CounterSnapshotter is the minimum surface MetricsHandler consumes +// from a counter table for the Prometheus exposer. The OCSPCounters +// type in internal/service satisfies this; future per-area counter +// tabs (CRL, cert-export, EST, SCEP, Intune) plug in the same way. +// +// Production hardening II Phase 8. +type CounterSnapshotter interface { + Snapshot() map[string]uint64 +} + // MetricsHandler handles HTTP requests for metrics. // Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format // (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc. type MetricsHandler struct { svc MetricsService serverStarted time.Time + // Production hardening II Phase 8 — per-area counter snapshotters. + // nil values omit the corresponding metric block; cmd/server/main.go + // wires the instances at startup. The naming convention is + // certctl__