mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 18:41:30 +00:00
feat(metrics): extend /metrics/prometheus with per-area OCSP counters (Phase 8)
Production hardening II Phase 8 — surface the OCSP per-event counters
shipped in Phase 1+2 through the existing /api/v1/metrics/prometheus
endpoint. Operators can now alert on certctl_ocsp_counter_total
{label="rate_limited"} (Phase 3 trip), {label="nonce_malformed"}
(Phase 1 reject), {label="signing_failed"} (issuer connector fails),
etc.
NEW interface CounterSnapshotter (handler/metrics.go) — minimum
surface the Prometheus exposer needs from any per-area counter table:
just Snapshot() map[string]uint64. service.OCSPCounters.Snapshot
(Phase 1) satisfies it; future per-area counters (CRL, cert-export,
EST per-profile, SCEP per-profile, Intune per-profile) plug in the
same way as separate SetXxxCounters setters.
Naming convention per frozen decision 0.10:
certctl_<area>_counter_total{label="<event>"} <value>
This commit ships only the OCSP block. The remaining areas (CRL,
cert-export, EST, SCEP, Intune) plug in via the same
SetXxxCounters pattern in follow-up commits — the wire-up cost per
area is one new field + one setter + one block of fmt.Fprintf lines.
The bundle's S-1 docs-count guard means we don't claim a specific
total in prose; operators run `curl /api/v1/metrics/prometheus | grep
certctl_` to enumerate.
Wired in cmd/server/main.go: a single shared *service.OCSPCounters
instance is created once and passed to BOTH the
ocspResponseCacheService (so the cache hot path ticks counters) AND
metricsHandler.SetOCSPCounters (so the Prometheus exposer reads
them). Existing dashboard metrics (certctl_certificate_total,
certctl_agent_total, etc.) remain unchanged at the same line offsets
— back-compat preserved.
Pre-commit verification: go build ./... clean; go test -short
-count=1 green for handler/ + service/. The existing
TestGetPrometheusMetrics_Success tests still pass (the new counter
block is additive at the END of the response body, after the
existing dashboard metrics + uptime line).
This commit is contained in:
@@ -15,12 +15,27 @@ type MetricsService interface {
|
||||
GetDashboardSummary(ctx context.Context) (interface{}, error)
|
||||
}
|
||||
|
||||
// CounterSnapshotter is the minimum surface MetricsHandler consumes
|
||||
// from a counter table for the Prometheus exposer. The OCSPCounters
|
||||
// type in internal/service satisfies this; future per-area counter
|
||||
// tables (CRL, cert-export, EST, SCEP, Intune) plug in the same way.
|
||||
//
|
||||
// Production hardening II Phase 8.
|
||||
type CounterSnapshotter interface {
|
||||
// Snapshot returns counter values keyed by event label. The
// Prometheus handler looks up a fixed list of labels in the returned
// map, so a label missing from the map is reported as 0.
Snapshot() map[string]uint64
|
||||
}
|
||||
|
||||
// MetricsHandler handles HTTP requests for metrics.
|
||||
// Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format
|
||||
// (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc.
|
||||
type MetricsHandler struct {
|
||||
svc MetricsService
|
||||
serverStarted time.Time
|
||||
// Production hardening II Phase 8 — per-area counter snapshotters.
|
||||
// nil values omit the corresponding metric block; cmd/server/main.go
|
||||
// wires the instances at startup. The naming convention is
|
||||
// certctl_<area>_counter_total{label="<event>"} per frozen decision 0.10.
|
||||
ocspCounters CounterSnapshotter // set via SetOCSPCounters
|
||||
}
|
||||
|
||||
// NewMetricsHandler creates a new MetricsHandler with a service dependency.
|
||||
@@ -32,6 +47,13 @@ func NewMetricsHandler(svc MetricsService, serverStarted time.Time) MetricsHandl
|
||||
}
|
||||
}
|
||||
|
||||
// SetOCSPCounters wires the OCSP counter table for the per-area
|
||||
// metric block in the Prometheus exposition. nil disables the block.
|
||||
// Production hardening II Phase 8.
|
||||
//
// NOTE(review): this setter has a pointer receiver, while
// GetPrometheusMetrics has a value receiver and NewMetricsHandler
// returns MetricsHandler by value. A method value such as
// handler.GetPrometheusMetrics copies the receiver when it is evaluated,
// so this setter presumably must be called before routes are
// registered — confirm against cmd/server/main.go.
//
// NOTE(review): the exposer checks `h.ocspCounters != nil`, so only an
// untyped nil disables the block; a nil concrete pointer stored in the
// interface compares non-nil.
func (h *MetricsHandler) SetOCSPCounters(c CounterSnapshotter) {
|
||||
h.ocspCounters = c
|
||||
}
|
||||
|
||||
// MetricsResponse represents the JSON metrics response for V2.
|
||||
type MetricsResponse struct {
|
||||
Gauge MetricsGauge `json:"gauge"`
|
||||
@@ -222,6 +244,28 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ
|
||||
fmt.Fprintf(w, "# HELP certctl_uptime_seconds Server uptime in seconds.\n")
|
||||
fmt.Fprintf(w, "# TYPE certctl_uptime_seconds gauge\n")
|
||||
fmt.Fprintf(w, "certctl_uptime_seconds %d\n", uptimeSeconds)
|
||||
|
||||
// Production hardening II Phase 8 — per-area counters. Each block
|
||||
// is nil-guarded so a deploy without the wire still produces clean
|
||||
// output (just the legacy dashboard metrics above). Naming
|
||||
// convention: certctl_<area>_counter_total{label="<event>"} per frozen decision
|
||||
// 0.10.
|
||||
if h.ocspCounters != nil {
|
||||
fmt.Fprintf(w, "\n# HELP certctl_ocsp_counter_total OCSP responder per-event counters (production hardening II Phase 8).\n")
|
||||
fmt.Fprintf(w, "# TYPE certctl_ocsp_counter_total counter\n")
|
||||
snap := h.ocspCounters.Snapshot()
|
||||
// Emit in a deterministic order so the output diff is stable
|
||||
// across requests (helps operators spot drift in dashboard
|
||||
// snapshots).
|
||||
labels := []string{
|
||||
"request_get", "request_post", "request_success", "request_invalid",
|
||||
"issuer_not_found", "cert_not_found", "signing_failed",
|
||||
"nonce_echoed", "nonce_malformed", "rate_limited",
|
||||
}
|
||||
for _, lbl := range labels {
|
||||
fmt.Fprintf(w, "certctl_ocsp_counter_total{label=%q} %d\n", lbl, snap[lbl])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DashboardSummary mirrors the service.DashboardSummary for JSON unmarshaling.
|
||||
|
||||
Reference in New Issue
Block a user