From 95c2bf98188474f3e54e448eb3d87ac72d7ec441 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sat, 2 May 2026 00:39:25 +0000 Subject: [PATCH] metrics: add per-issuer-type issuance counters, histogram, and failure classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the #4 acquisition-readiness blocker from the 2026-05-01 issuer coverage audit. Before this commit, certctl's Prometheus exposition had zero per-issuer-type signal — operators answering "is DigiCert slow?" or "is Sectigo failing more than ACME?" had to grep logs by issuer name. This commit adds three series labelled by issuer type: certctl_issuance_total{issuer_type, outcome} certctl_issuance_duration_seconds{issuer_type} (histogram) certctl_issuance_failures_total{issuer_type, error_class} The histogram covers 0.05–120 second buckets to span the local-issuer fast path and async-CA slow path (DigiCert/Sectigo/Entrust polling can take minutes). error_class is a closed enum of eight values (timeout, auth, rate_limited, validation, upstream_5xx, upstream_4xx, network, other) classified once in service.ClassifyError. Cardinality budget is ~276 new series, well within Prometheus's comfortable range. Implementation: - service.IssuanceMetrics is the thread-safe counter + histogram table. Three independent views (counters / failures / durations) exposed via SnapshotCounters / SnapshotFailures / SnapshotDurations. sync.RWMutex protects the map shape; per-key sync/atomic.Uint64 primitives keep the recording hot path lock-free under concurrent service-layer goroutines. - service.IssuanceCounterEntry / IssuanceFailureEntry / IssuanceDurationEntry / IssuanceMetricsSnapshotter live in service (not handler) to avoid an import cycle: handler already imports service for admin_est.go etc., so service can't import handler back. Handler's exposer takes the snapshotter via the service-defined interface. - service.ClassifyError pure function maps error → error_class. context.DeadlineExceeded / context.Canceled → timeout; *net.OpError → network; substring matches against canonical AWS / DigiCert / Sectigo error shapes for auth / rate_limited / validation / upstream_5xx / upstream_4xx / network; unknown → other. Each branch has at least one representative test case in TestClassifyError. - IssuerConnectorAdapter.SetMetrics wires per-adapter recording (issuerType + metrics). Existing 28+ test call sites of NewIssuerConnectorAdapter keep their one-arg signature; production wiring goes through SetMetrics post-construction. - IssuerRegistry.SetIssuanceMetrics + Rebuild type-asserts to *IssuerConnectorAdapter and calls SetMetrics with the issuer type string. nil-guarded — tests that hand-build adapters without metrics get no-op recording. - IssuerConnectorAdapter.IssueCertificate / RenewCertificate wrap the underlying connector call with start := time.Now() and recordIssuance(start, err). Renewal is recorded into the same certctl_issuance_* series as initial issuance — operationally, renewal IS issuance from the connector's perspective (matches the audit prompt's guidance on series naming). - handler/metrics.go GetPrometheusMetrics gains a new exposer block emitting all three series in stable label order with correct Prometheus format (_bucket / _sum / _count for the histogram, +Inf bucket appended). Sorted via sort.Slice for stable output. nil- guarded so deploys without the wire produce clean exposition. - formatLE helper trims trailing zeros from histogram bucket labels via strconv.FormatFloat(le, 'f', -1, 64) so the `le` labels match Prometheus client conventions ("0.05", "30", "120", not "0.0500" etc.). - cmd/server/main.go wires a single IssuanceMetrics instance into both the IssuerRegistry (recording) and the MetricsHandler (exposer) using DefaultIssuanceBucketBoundaries. Tests: - TestIssuanceMetrics_RecordAndSnapshot — happy-path counter + histogram + failure recording, BucketBoundaries returns a copy (not shared storage). - TestIssuanceMetrics_HistogramCumulative — pins the cumulative-buckets contract. 100ms observation lands in 0.1 bucket and every larger bucket; 750ms only in the 1.0 bucket. Off-by-one here would corrupt every quantile query downstream. - TestIssuanceMetrics_Concurrency — 100 goroutines × 1000 ops under the race detector. Asserts atomic counter integrity across contended writes. - TestClassifyError — 17 cases covering every branch of the closed enum plus the nil-error special case. Implementation chooses the existing hand-rolled fmt.Fprintf exposition pattern (no prometheus/client_golang dependency added) to stay consistent with the OCSP / deploy counter blocks already in the file. Out of scope (separate follow-ups): - Revocation metrics (certctl_revocation_*) — symmetric to issuance but the audit didn't ask; explicit follow-up commit. - Discovery / health-check duration histograms. - prometheus/client_golang migration. Verified locally: - gofmt clean - go vet ./... clean - staticcheck ./... clean - golangci-lint run --timeout 5m ./... → 0 issues - go test -short -count=1 ./internal/service/ green - go test -short -count=1 -race -run TestIssuanceMetrics ./internal/service/ green - go test -short -count=1 ./internal/api/handler/ green - go build ./... success Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md Top-10 fix #4 (Part 3, narrative section). --- cmd/server/main.go | 10 + internal/api/handler/metrics.go | 88 ++++++ internal/service/issuance_metrics.go | 366 ++++++++++++++++++++++ internal/service/issuance_metrics_test.go | 194 ++++++++++++ internal/service/issuer_adapter.go | 54 +++- internal/service/issuer_registry.go | 30 +- 6 files changed, 738 insertions(+), 4 deletions(-) create mode 100644 internal/service/issuance_metrics.go create mode 100644 internal/service/issuance_metrics_test.go diff --git a/cmd/server/main.go b/cmd/server/main.go index a4bb109..08c6abe 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -215,6 +215,13 @@ func main() { } issuerRegistry := service.NewIssuerRegistry(logger) + // Per-issuer-type issuance metrics (audit fix #4: closes the + // per-issuer-type observability gap). Same instance is wired into + // the registry (so adapters record issuance/renewal calls) AND + // into the metrics handler (so the Prometheus exposer emits + // certctl_issuance_total / _duration_seconds / _failures_total). + issuanceMetrics := service.NewIssuanceMetrics(service.DefaultIssuanceBucketBoundaries) + issuerRegistry.SetIssuanceMetrics(issuanceMetrics) // Initialize revocation repository revocationRepo := postgres.NewRevocationRepository(db) @@ -544,6 +551,9 @@ func main() { // alert on certctl_ocsp_counter_total{label="rate_limited"}, // {label="nonce_malformed"}, etc. metricsHandler.SetOCSPCounters(ocspCounters) + // Audit fix #4: wire the per-issuer-type issuance metrics so the + // /api/v1/metrics/prometheus exposer emits the new series. + metricsHandler.SetIssuanceCounters(issuanceMetrics) // Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB // connectivity via PingContext. /health stays shallow (liveness signal). healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db) diff --git a/internal/api/handler/metrics.go b/internal/api/handler/metrics.go index cf58671..0075cab 100644 --- a/internal/api/handler/metrics.go +++ b/internal/api/handler/metrics.go @@ -6,9 +6,11 @@ import ( "fmt" "net/http" "sort" + "strconv" "time" "github.com/shankar0123/certctl/internal/api/middleware" + "github.com/shankar0123/certctl/internal/service" ) // MetricsService defines the service interface for metrics collection. @@ -51,6 +53,13 @@ type DeployCounterSnapshotter interface { Snapshot() []DeploySnapshotEntry } +// IssuanceCounterEntry / IssuanceFailureEntry / IssuanceDurationEntry +// and the IssuanceMetricsSnapshotter interface live in +// internal/service (issuance_metrics.go). Handler can't define them +// locally because internal/api/handler is imported by service — the +// reverse import would create a cycle. The exposer below takes the +// types via the interface defined in service. + // MetricsHandler handles HTTP requests for metrics. // Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format // (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc. @@ -64,6 +73,12 @@ type MetricsHandler struct { ocspCounters CounterSnapshotter // Phase 10 (deploy-hardening I) — per-target-type deploy counters. deployCounters DeployCounterSnapshotter + // Per-issuer-type issuance metrics (audit fix #4). nil disables + // the new metric block; main.go wires the instance at startup. + // The interface lives in service to avoid an import cycle (handler + // imports service for admin_est.go etc., so service can't import + // handler back). + issuanceCounters service.IssuanceMetricsSnapshotter } // NewMetricsHandler creates a new MetricsHandler with a service dependency. @@ -89,6 +104,14 @@ func (h *MetricsHandler) SetDeployCounters(c DeployCounterSnapshotter) { h.deployCounters = c } +// SetIssuanceCounters wires the per-issuer-type issuance metrics for +// the Prometheus exposition. nil disables the block. Closes the #4 +// acquisition-readiness blocker from the 2026-05-01 issuer coverage +// audit (per-issuer-type metrics). +func (h *MetricsHandler) SetIssuanceCounters(c service.IssuanceMetricsSnapshotter) { + h.issuanceCounters = c +} + // MetricsResponse represents the JSON metrics response for V2. type MetricsResponse struct { Gauge MetricsGauge `json:"gauge"` @@ -344,6 +367,71 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ fmt.Fprintf(w, "certctl_deploy_idempotent_skip_total{target_type=%q} %d\n", s.TargetType, s.IdempotentSkips) } } + + // Per-issuer-type issuance metrics (audit fix #4). Three series: + // certctl_issuance_total{issuer_type, outcome} counter + // certctl_issuance_duration_seconds{issuer_type} histogram + // certctl_issuance_failures_total{issuer_type, error_class} counter + // + // Cardinality: 12 issuer_types × 2 outcomes (24) + + // 12 × 11 buckets+sum+count (~156) + + // 12 × 8 error_classes (96) = ~276 series. Comfortable + // for any Prometheus instance. + if h.issuanceCounters != nil { + // certctl_issuance_total + fmt.Fprintf(w, "\n# HELP certctl_issuance_total Total certificate issuance attempts, labelled by issuer type and outcome.\n") + fmt.Fprintf(w, "# TYPE certctl_issuance_total counter\n") + counters := h.issuanceCounters.SnapshotCounters() + sort.Slice(counters, func(i, j int) bool { + if counters[i].IssuerType != counters[j].IssuerType { + return counters[i].IssuerType < counters[j].IssuerType + } + return counters[i].Outcome < counters[j].Outcome + }) + for _, c := range counters { + fmt.Fprintf(w, "certctl_issuance_total{issuer_type=%q,outcome=%q} %d\n", c.IssuerType, c.Outcome, c.Count) + } + + // certctl_issuance_duration_seconds histogram + fmt.Fprintf(w, "\n# HELP certctl_issuance_duration_seconds Certificate issuance duration in seconds, labelled by issuer type. Cumulative histogram with +Inf.\n") + fmt.Fprintf(w, "# TYPE certctl_issuance_duration_seconds histogram\n") + durations := h.issuanceCounters.SnapshotDurations() + boundaries := h.issuanceCounters.BucketBoundaries() + sort.Slice(durations, func(i, j int) bool { return durations[i].IssuerType < durations[j].IssuerType }) + for _, d := range durations { + for i, le := range boundaries { + if i < len(d.Buckets) { + fmt.Fprintf(w, "certctl_issuance_duration_seconds_bucket{issuer_type=%q,le=%q} %d\n", + d.IssuerType, formatLE(le), d.Buckets[i]) + } + } + fmt.Fprintf(w, "certctl_issuance_duration_seconds_bucket{issuer_type=%q,le=\"+Inf\"} %d\n", d.IssuerType, d.Count) + fmt.Fprintf(w, "certctl_issuance_duration_seconds_sum{issuer_type=%q} %g\n", d.IssuerType, d.Sum) + fmt.Fprintf(w, "certctl_issuance_duration_seconds_count{issuer_type=%q} %d\n", d.IssuerType, d.Count) + } + + // certctl_issuance_failures_total + fmt.Fprintf(w, "\n# HELP certctl_issuance_failures_total Issuance failures by issuer type and error class. error_class is a closed enum (timeout, auth, rate_limited, validation, upstream_5xx, upstream_4xx, network, other).\n") + fmt.Fprintf(w, "# TYPE certctl_issuance_failures_total counter\n") + failures := h.issuanceCounters.SnapshotFailures() + sort.Slice(failures, func(i, j int) bool { + if failures[i].IssuerType != failures[j].IssuerType { + return failures[i].IssuerType < failures[j].IssuerType + } + return failures[i].ErrorClass < failures[j].ErrorClass + }) + for _, f := range failures { + fmt.Fprintf(w, "certctl_issuance_failures_total{issuer_type=%q,error_class=%q} %d\n", f.IssuerType, f.ErrorClass, f.Count) + } + } +} + +// formatLE formats a histogram bucket boundary the way Prometheus +// expects: no trailing zeros, no scientific notation for typical +// sub-second / sub-minute values. Used for the `le` label in the +// issuance-duration histogram exposer. +func formatLE(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) } // DashboardSummary mirrors the service.DashboardSummary for JSON unmarshaling. diff --git a/internal/service/issuance_metrics.go b/internal/service/issuance_metrics.go new file mode 100644 index 0000000..63c0590 --- /dev/null +++ b/internal/service/issuance_metrics.go @@ -0,0 +1,366 @@ +// Copyright (c) certctl +// SPDX-License-Identifier: BSL-1.1 + +package service + +import ( + "context" + "errors" + "net" + "strings" + "sync" + "sync/atomic" + "time" +) + +// IssuanceCounterEntry is one (issuer_type, outcome, count) tuple +// emitted by the per-issuer-type issuance counter table. Closes the +// #4 acquisition-readiness blocker from the 2026-05-01 issuer coverage +// audit (per-issuer-type metrics). +type IssuanceCounterEntry struct { + IssuerType string + Outcome string // "success" | "failure" + Count uint64 +} + +// IssuanceFailureEntry is one (issuer_type, error_class, count) tuple +// emitted by the issuance-failure counter table. error_class is a +// closed enum of eight values (timeout, auth, rate_limited, +// validation, upstream_5xx, upstream_4xx, network, other) — cardinality +// discipline keeps this metric tractable. +type IssuanceFailureEntry struct { + IssuerType string + ErrorClass string + Count uint64 +} + +// IssuanceDurationEntry is one (issuer_type, bucket-counts, sum, count) +// tuple emitted by the issuance-duration histogram. Buckets carries +// cumulative counts in the order matching the BucketBoundaries +// reported by the snapshotter; Sum is total observed seconds; Count +// is total observations (matches the +Inf bucket). +type IssuanceDurationEntry struct { + IssuerType string + Buckets []uint64 + Sum float64 + Count uint64 +} + +// IssuanceMetricsSnapshotter is the surface MetricsHandler consumes +// for per-issuer-type issuance metrics. The handler imports this +// interface so the snapshot types stay in the service package +// (avoids an import cycle: handler imports service for the +// admin_est / admin_scep_intune handlers, so the reverse direction +// can't import handler). +// +// *IssuanceMetrics satisfies this interface; the production wiring +// in cmd/server/main.go passes the same instance into both the +// IssuerRegistry (for adapter-side recording) and the MetricsHandler +// (for Prometheus exposition). +type IssuanceMetricsSnapshotter interface { + SnapshotCounters() []IssuanceCounterEntry + SnapshotFailures() []IssuanceFailureEntry + SnapshotDurations() []IssuanceDurationEntry + BucketBoundaries() []float64 +} + +// DefaultIssuanceBucketBoundaries covers the local-issuer fast path +// (sub-100ms signing) through the async-CA slow path (DigiCert / +// Sectigo / Entrust polling can take minutes). The +Inf bucket is +// appended by the Prometheus exposer; we don't include it here. +// +// Boundaries chosen for operator alerting: 0.05s catches when the +// local issuer's signer has gone non-cooperative; 30s catches when +// an async CA is slow but not stuck; 120s catches when polling has +// effectively stalled. +var DefaultIssuanceBucketBoundaries = []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120} + +// IssuanceMetrics is a thread-safe in-memory counter + histogram table +// for per-issuer-type issuance signals. Closes the #4 acquisition- +// readiness blocker from the 2026-05-01 issuer coverage audit +// (per-issuer-type metrics). +// +// Three independent views — counter, failures, durations — are exposed +// via the Snapshot* methods so handler.IssuanceMetricsSnapshotter is +// satisfied. +// +// Cardinality is bounded by: +// - Closed enum of issuer types (12 currently) +// - "success" / "failure" outcome strings (2) +// - 8-value error_class enum (timeout, auth, rate_limited, +// validation, upstream_5xx, upstream_4xx, network, other) +// - Fixed bucket boundaries (11 + implicit +Inf in exposer) +// +// Underlying maps grow to a fixed upper bound and stop. A new issuer +// type appears once and never explodes the cardinality. +type IssuanceMetrics struct { + bucketBoundaries []float64 + + mu sync.RWMutex + counters map[counterKey]*atomic.Uint64 + failures map[failureKey]*atomic.Uint64 + durations map[string]*durationState // key: issuer_type +} + +type counterKey struct{ IssuerType, Outcome string } +type failureKey struct{ IssuerType, ErrorClass string } + +type durationState struct { + buckets []atomic.Uint64 + // sumMillis stores the sum in milliseconds (uint64-encoded) so we + // can use atomic adds; the snapshot converts back to float seconds. + sumMillis atomic.Uint64 + count atomic.Uint64 +} + +// NewIssuanceMetrics constructs a fresh IssuanceMetrics with the given +// bucket boundaries. Pass DefaultIssuanceBucketBoundaries unless tests +// need a different shape. +func NewIssuanceMetrics(buckets []float64) *IssuanceMetrics { + cp := make([]float64, len(buckets)) + copy(cp, buckets) + return &IssuanceMetrics{ + bucketBoundaries: cp, + counters: make(map[counterKey]*atomic.Uint64), + failures: make(map[failureKey]*atomic.Uint64), + durations: make(map[string]*durationState), + } +} + +// RecordIssuance bumps the (issuer_type, outcome) counter and observes +// the duration into the (issuer_type) histogram. outcome is +// "success" or "failure"; pass "" only if you intend to record neither +// (the call returns without effect). +func (m *IssuanceMetrics) RecordIssuance(issuerType, outcome string, duration time.Duration) { + if issuerType == "" || outcome == "" { + return + } + m.bumpCounter(counterKey{IssuerType: issuerType, Outcome: outcome}) + m.observeDuration(issuerType, duration) +} + +// RecordFailure bumps the (issuer_type, error_class) failure counter. +// Caller is responsible for classifying the error via ClassifyError; +// passing an off-enum value will silently grow the cardinality +// (closed-enum discipline is the caller's contract). +func (m *IssuanceMetrics) RecordFailure(issuerType, errorClass string) { + if issuerType == "" || errorClass == "" { + return + } + m.bumpFailure(failureKey{IssuerType: issuerType, ErrorClass: errorClass}) +} + +func (m *IssuanceMetrics) bumpCounter(k counterKey) { + m.mu.RLock() + c, ok := m.counters[k] + m.mu.RUnlock() + if !ok { + m.mu.Lock() + c, ok = m.counters[k] + if !ok { + c = new(atomic.Uint64) + m.counters[k] = c + } + m.mu.Unlock() + } + c.Add(1) +} + +func (m *IssuanceMetrics) bumpFailure(k failureKey) { + m.mu.RLock() + c, ok := m.failures[k] + m.mu.RUnlock() + if !ok { + m.mu.Lock() + c, ok = m.failures[k] + if !ok { + c = new(atomic.Uint64) + m.failures[k] = c + } + m.mu.Unlock() + } + c.Add(1) +} + +func (m *IssuanceMetrics) observeDuration(issuerType string, duration time.Duration) { + m.mu.RLock() + state, ok := m.durations[issuerType] + m.mu.RUnlock() + if !ok { + m.mu.Lock() + state, ok = m.durations[issuerType] + if !ok { + state = &durationState{ + buckets: make([]atomic.Uint64, len(m.bucketBoundaries)), + } + m.durations[issuerType] = state + } + m.mu.Unlock() + } + + seconds := duration.Seconds() + // Cumulative buckets: bump every bucket whose boundary >= seconds. + for i, le := range m.bucketBoundaries { + if seconds <= le { + state.buckets[i].Add(1) + } + } + // sumMillis: store the duration in milliseconds (uint64) to keep + // atomic. Snapshot converts back to seconds. + state.sumMillis.Add(uint64(duration.Milliseconds())) + state.count.Add(1) +} + +// SnapshotCounters returns a stable copy of the (issuer_type, outcome, +// count) tuples. Safe to call concurrently with RecordIssuance. +func (m *IssuanceMetrics) SnapshotCounters() []IssuanceCounterEntry { + m.mu.RLock() + defer m.mu.RUnlock() + out := make([]IssuanceCounterEntry, 0, len(m.counters)) + for k, v := range m.counters { + out = append(out, IssuanceCounterEntry{ + IssuerType: k.IssuerType, + Outcome: k.Outcome, + Count: v.Load(), + }) + } + return out +} + +// SnapshotFailures returns a stable copy of the (issuer_type, +// error_class, count) tuples. Safe to call concurrently. +func (m *IssuanceMetrics) SnapshotFailures() []IssuanceFailureEntry { + m.mu.RLock() + defer m.mu.RUnlock() + out := make([]IssuanceFailureEntry, 0, len(m.failures)) + for k, v := range m.failures { + out = append(out, IssuanceFailureEntry{ + IssuerType: k.IssuerType, + ErrorClass: k.ErrorClass, + Count: v.Load(), + }) + } + return out +} + +// SnapshotDurations returns a stable copy of the (issuer_type, buckets, +// sum, count) tuples. The buckets slice is in the order matching +// BucketBoundaries(); sum is in seconds. Safe to call concurrently. +func (m *IssuanceMetrics) SnapshotDurations() []IssuanceDurationEntry { + m.mu.RLock() + defer m.mu.RUnlock() + out := make([]IssuanceDurationEntry, 0, len(m.durations)) + for issuerType, state := range m.durations { + buckets := make([]uint64, len(state.buckets)) + for i := range state.buckets { + buckets[i] = state.buckets[i].Load() + } + out = append(out, IssuanceDurationEntry{ + IssuerType: issuerType, + Buckets: buckets, + Sum: float64(state.sumMillis.Load()) / 1000.0, + Count: state.count.Load(), + }) + } + return out +} + +// BucketBoundaries returns a copy of the bucket boundaries used by +// this IssuanceMetrics. Used by the Prometheus exposer to label the +// histogram buckets. +func (m *IssuanceMetrics) BucketBoundaries() []float64 { + out := make([]float64, len(m.bucketBoundaries)) + copy(out, m.bucketBoundaries) + return out +} + +// Compile-time guard: *IssuanceMetrics satisfies +// IssuanceMetricsSnapshotter. +var _ IssuanceMetricsSnapshotter = (*IssuanceMetrics)(nil) + +// ClassifyError maps an arbitrary error to one of eight closed-enum +// error_class values. The classification is deterministic and runs in +// constant time (no regex compilation, no reflection beyond +// errors.Is / errors.As). +// +// Closed enum: timeout, auth, rate_limited, validation, upstream_5xx, +// upstream_4xx, network, other. Adding a ninth value is a deliberate +// change that requires updating the docs/metrics.md enum list and +// any operator alerting rules that pin specific labels — do NOT +// expand the enum casually; classify edge cases as "other" and +// document the case if it matters. +func ClassifyError(err error) string { + if err == nil { + return "" // caller should not invoke us with nil + } + + // 1. Context deadline / cancellation → timeout (the operator + // alerts on slow upstream CAs via this label). + if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { + return "timeout" + } + + // 2. Network-layer errors (connection refused, DNS, TLS handshake) + // → network. Detected via *net.OpError or strings the stdlib + // uses for these conditions. + var opErr *net.OpError + if errors.As(err, &opErr) { + return "network" + } + + msg := strings.ToLower(err.Error()) + + // 3. Substring matches against the most common upstream-CA error + // shapes. Order matters — auth and rate-limited need to win + // over generic 4xx, and 5xx needs to win over generic + // "internal" matches. + switch { + case strings.Contains(msg, "deadline exceeded"), + strings.Contains(msg, "timeout"), + strings.Contains(msg, "i/o timeout"): + return "timeout" + case strings.Contains(msg, "401"), + strings.Contains(msg, "unauthorized"), + strings.Contains(msg, "accessdenied"), + strings.Contains(msg, "access denied"), + strings.Contains(msg, "forbidden"): + return "auth" + case strings.Contains(msg, "429"), + strings.Contains(msg, "ratelimit"), + strings.Contains(msg, "rate limit"), + strings.Contains(msg, "throttl"): + return "rate_limited" + case strings.Contains(msg, "csr"), + strings.Contains(msg, "validate"), + strings.Contains(msg, "validation"), + strings.Contains(msg, "invalid"), + strings.Contains(msg, "malformed"): + return "validation" + case strings.Contains(msg, "500"), + strings.Contains(msg, "502"), + strings.Contains(msg, "503"), + strings.Contains(msg, "504"), + strings.Contains(msg, "5xx"), + strings.Contains(msg, "serviceunavailable"), + strings.Contains(msg, "service unavailable"), + strings.Contains(msg, "internalerror"), + strings.Contains(msg, "internal server error"): + return "upstream_5xx" + case strings.Contains(msg, "404"), + strings.Contains(msg, "400"), + strings.Contains(msg, "4xx"), + strings.Contains(msg, "notfound"), + strings.Contains(msg, "not found"), + strings.Contains(msg, "badrequest"), + strings.Contains(msg, "bad request"): + return "upstream_4xx" + case strings.Contains(msg, "no such host"), + strings.Contains(msg, "connection refused"), + strings.Contains(msg, "tls handshake"), + strings.Contains(msg, "network"), + strings.Contains(msg, "dial tcp"), + strings.Contains(msg, "broken pipe"): + return "network" + } + return "other" +} diff --git a/internal/service/issuance_metrics_test.go b/internal/service/issuance_metrics_test.go new file mode 100644 index 0000000..5476ec2 --- /dev/null +++ b/internal/service/issuance_metrics_test.go @@ -0,0 +1,194 @@ +// Copyright (c) certctl +// SPDX-License-Identifier: BSL-1.1 + +package service + +import ( + "context" + "errors" + "net" + "sync" + "testing" + "time" +) + +// TestIssuanceMetrics_RecordAndSnapshot exercises the happy-path +// counter + histogram + failure recording. Asserts: +// - SnapshotCounters returns the expected (issuer_type, outcome, count) tuples +// - SnapshotDurations returns cumulative bucket counts +// - SnapshotFailures returns the expected (issuer_type, error_class, count) tuples +// - BucketBoundaries returns a copy that doesn't share backing storage +func TestIssuanceMetrics_RecordAndSnapshot(t *testing.T) { + m := NewIssuanceMetrics(DefaultIssuanceBucketBoundaries) + + // Record three issuances: two success (one fast, one slow), one failure. + m.RecordIssuance("local", "success", 50*time.Millisecond) // 0.05 bucket + m.RecordIssuance("local", "success", 2*time.Second) // 2.5 bucket + m.RecordIssuance("digicert", "failure", 90*time.Second) // 120 bucket + m.RecordFailure("digicert", "rate_limited") + + counters := m.SnapshotCounters() + if len(counters) != 2 { + t.Fatalf("expected 2 counter entries, got %d", len(counters)) + } + for _, c := range counters { + switch { + case c.IssuerType == "local" && c.Outcome == "success": + if c.Count != 2 { + t.Errorf("local/success: want 2, got %d", c.Count) + } + case c.IssuerType == "digicert" && c.Outcome == "failure": + if c.Count != 1 { + t.Errorf("digicert/failure: want 1, got %d", c.Count) + } + default: + t.Errorf("unexpected counter entry: %+v", c) + } + } + + failures := m.SnapshotFailures() + if len(failures) != 1 { + t.Fatalf("expected 1 failure entry, got %d", len(failures)) + } + if failures[0].IssuerType != "digicert" || failures[0].ErrorClass != "rate_limited" || failures[0].Count != 1 { + t.Errorf("unexpected failure entry: %+v", failures[0]) + } + + durations := m.SnapshotDurations() + if len(durations) != 2 { + t.Fatalf("expected 2 duration entries, got %d", len(durations)) + } + + // BucketBoundaries: returned slice must be a copy. + b1 := m.BucketBoundaries() + b2 := m.BucketBoundaries() + if &b1[0] == &b2[0] { + t.Error("BucketBoundaries should return a copy, not shared storage") + } +} + +// TestIssuanceMetrics_HistogramCumulative pins the cumulative-buckets +// contract. Prometheus histograms require buckets to be cumulative — +// `le=0.5` includes everything <= 0.5, including <= 0.05 and <= 0.1. +// Off-by-one here corrupts every quantile query downstream. +func TestIssuanceMetrics_HistogramCumulative(t *testing.T) { + m := NewIssuanceMetrics([]float64{0.1, 0.5, 1.0}) + + // Observe 100ms (= 0.1s exactly). + m.RecordIssuance("local", "success", 100*time.Millisecond) + + durs := m.SnapshotDurations() + if len(durs) != 1 { + t.Fatalf("expected 1 duration entry, got %d", len(durs)) + } + + // Boundaries: [0.1, 0.5, 1.0]. 100ms falls into 0.1 bucket and + // every larger bucket (cumulative). Sum = 0.1, count = 1. + want := []uint64{1, 1, 1} + for i, w := range want { + if durs[0].Buckets[i] != w { + t.Errorf("bucket[%d]: want %d, got %d", i, w, durs[0].Buckets[i]) + } + } + if durs[0].Sum < 0.099 || durs[0].Sum > 0.101 { + t.Errorf("sum: want ~0.1, got %v", durs[0].Sum) + } + if durs[0].Count != 1 { + t.Errorf("count: want 1, got %d", durs[0].Count) + } + + // Observe 750ms — falls into 1.0 bucket only (>0.1, >0.5). + m.RecordIssuance("local", "success", 750*time.Millisecond) + + durs = m.SnapshotDurations() + want = []uint64{1, 1, 2} // 100ms in all 3, 750ms in only the 1.0 bucket + for i, w := range want { + if durs[0].Buckets[i] != w { + t.Errorf("after 750ms — bucket[%d]: want %d, got %d", i, w, durs[0].Buckets[i]) + } + } +} + +// TestIssuanceMetrics_Concurrency stresses RecordIssuance under 100 +// goroutines × 1000 ops to assert atomic counter integrity. Race- +// detector clean is non-optional for this test (the whole point of +// IssuanceMetrics is concurrent recording from many service +// goroutines). +func TestIssuanceMetrics_Concurrency(t *testing.T) { + m := NewIssuanceMetrics(DefaultIssuanceBucketBoundaries) + + const goroutines = 100 + const opsPerGoroutine = 1000 + + var wg sync.WaitGroup + wg.Add(goroutines) + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + for j := 0; j < opsPerGoroutine; j++ { + m.RecordIssuance("local", "success", 50*time.Millisecond) + } + }() + } + wg.Wait() + + counters := m.SnapshotCounters() + if len(counters) != 1 { + t.Fatalf("expected 1 counter entry, got %d", len(counters)) + } + wantTotal := uint64(goroutines * opsPerGoroutine) + if counters[0].Count != wantTotal { + t.Errorf("counter under contention: want %d, got %d", wantTotal, counters[0].Count) + } + + durs := m.SnapshotDurations() + if durs[0].Count != wantTotal { + t.Errorf("histogram count under contention: want %d, got %d", wantTotal, durs[0].Count) + } +} + +// TestClassifyError exercises every branch of the closed-enum +// classifier. The classification logic is the load-bearing piece of +// the failure metric — misclassification doesn't break operators, but +// it makes their alerts noisier. Each enum value has at least one +// representative input. +func TestClassifyError(t *testing.T) { + cases := []struct { + name string + err error + want string + }{ + {"context_canceled", context.Canceled, "timeout"}, + {"context_deadline", context.DeadlineExceeded, "timeout"}, + {"timeout_substring", errors.New("operation deadline exceeded"), "timeout"}, + {"i_o_timeout", errors.New("read tcp: i/o timeout"), "timeout"}, + {"net_op_error", &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")}, "network"}, + {"unauthorized_4xx", errors.New("DigiCert: 401 Unauthorized"), "auth"}, + {"access_denied_aws", errors.New("AccessDeniedException: not authorized"), "auth"}, + {"forbidden_403", errors.New("forbidden: insufficient permissions"), "auth"}, + {"rate_limited_429", errors.New("Sectigo: 429 too many requests"), "rate_limited"}, + {"throttled", errors.New("ThrottlingException: rate exceeded"), "rate_limited"}, + {"validation_csr", errors.New("malformed CSR: invalid PEM block"), "validation"}, + {"validation_invalid", errors.New("invalid signing algorithm"), "validation"}, + {"upstream_503", errors.New("ServiceUnavailable: 503"), "upstream_5xx"}, + {"upstream_500_internal", errors.New("Internal Server Error: 500"), "upstream_5xx"}, + {"upstream_404", errors.New("NotFound: 404 cert not found"), "upstream_4xx"}, + {"network_no_host", errors.New("dial tcp: no such host"), "network"}, + {"other_unmatched", errors.New("something completely unexpected happened"), "other"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := ClassifyError(tc.err) + if got != tc.want { + t.Errorf("ClassifyError(%q): want %q, got %q", tc.err.Error(), tc.want, got) + } + }) + } + + // Special case: nil → "" so callers that accidentally call us + // with a nil err don't bump the failure counter. + if got := ClassifyError(nil); got != "" { + t.Errorf("ClassifyError(nil): want \"\", got %q", got) + } +} diff --git a/internal/service/issuer_adapter.go b/internal/service/issuer_adapter.go index 4e616ac..e22913f 100644 --- a/internal/service/issuer_adapter.go +++ b/internal/service/issuer_adapter.go @@ -2,6 +2,7 @@ package service import ( "context" + "time" "github.com/shankar0123/certctl/internal/connector/issuer" ) @@ -9,15 +10,55 @@ import ( // IssuerConnectorAdapter bridges the connector-layer issuer.Connector interface with the // service-layer IssuerConnector interface. This maintains dependency inversion: the service // layer defines the interface it needs, and this adapter wraps the concrete connector. +// +// Metrics: when issuerType + metrics are set via SetMetrics, the adapter +// records every IssueCertificate / RenewCertificate call into the +// IssuanceMetrics tables (audit fix #4). Untyped or unmetricked +// adapters (test path) skip the recording — nil-guard everywhere. type IssuerConnectorAdapter struct { - connector issuer.Connector + connector issuer.Connector + issuerType string + metrics *IssuanceMetrics } -// NewIssuerConnectorAdapter wraps an issuer.Connector to implement service.IssuerConnector. +// NewIssuerConnectorAdapter wraps an issuer.Connector to implement +// service.IssuerConnector. Existing call sites (28+) keep this +// signature; metrics are wired via SetMetrics post-construction by +// the production code path (issuer_registry.go) so test sites that +// don't care about metrics stay one-arg. func NewIssuerConnectorAdapter(c issuer.Connector) IssuerConnector { return &IssuerConnectorAdapter{connector: c} } +// SetMetrics wires per-issuer-type issuance metrics. issuerType is the +// factory key (e.g. "local", "acme", "digicert") — must match one of +// the closed-enum values the metrics doc references. metrics may be +// nil to disable recording. Closes the #4 audit-readiness blocker +// (per-issuer-type metrics). +func (a *IssuerConnectorAdapter) SetMetrics(issuerType string, metrics *IssuanceMetrics) { + a.issuerType = issuerType + a.metrics = metrics +} + +// recordIssuance is the metrics-recording side effect at the adapter +// boundary. Bumps the issuance counter (success/failure) and the +// duration histogram; on failure also bumps the failure-by-error-class +// counter via ClassifyError. +// +// nil-guarded: when metrics or issuerType are unset, it's a no-op. +func (a *IssuerConnectorAdapter) recordIssuance(start time.Time, err error) { + if a.metrics == nil || a.issuerType == "" { + return + } + duration := time.Since(start) + if err != nil { + a.metrics.RecordIssuance(a.issuerType, "failure", duration) + a.metrics.RecordFailure(a.issuerType, ClassifyError(err)) + } else { + a.metrics.RecordIssuance(a.issuerType, "success", duration) + } +} + // IssueCertificate delegates to the underlying connector's IssueCertificate method, // translating between service-layer and connector-layer types. // @@ -26,6 +67,7 @@ func NewIssuerConnectorAdapter(c issuer.Connector) IssuerConnector { // honors it (RFC 7633 id-pe-tlsfeature extension); upstream connectors // silently ignore the field. func (a *IssuerConnectorAdapter) IssueCertificate(ctx context.Context, commonName string, sans []string, csrPEM string, ekus []string, maxTTLSeconds int, mustStaple bool) (*IssuanceResult, error) { + start := time.Now() result, err := a.connector.IssueCertificate(ctx, issuer.IssuanceRequest{ CommonName: commonName, SANs: sans, @@ -34,6 +76,7 @@ func (a *IssuerConnectorAdapter) IssueCertificate(ctx context.Context, commonNam MaxTTLSeconds: maxTTLSeconds, MustStaple: mustStaple, }) + a.recordIssuance(start, err) if err != nil { return nil, err } @@ -47,8 +90,12 @@ func (a *IssuerConnectorAdapter) IssueCertificate(ctx context.Context, commonNam } // RenewCertificate delegates to the underlying connector's RenewCertificate method, -// translating between service-layer and connector-layer types. +// translating between service-layer and connector-layer types. Metrics: +// renewal is recorded into the same certctl_issuance_* series as +// initial issuance — operationally, renewal IS issuance from the +// connector's perspective. func (a *IssuerConnectorAdapter) RenewCertificate(ctx context.Context, commonName string, sans []string, csrPEM string, ekus []string, maxTTLSeconds int, mustStaple bool) (*IssuanceResult, error) { + start := time.Now() result, err := a.connector.RenewCertificate(ctx, issuer.RenewalRequest{ CommonName: commonName, SANs: sans, @@ -57,6 +104,7 @@ func (a *IssuerConnectorAdapter) RenewCertificate(ctx context.Context, commonNam MaxTTLSeconds: maxTTLSeconds, MustStaple: mustStaple, }) + a.recordIssuance(start, err) if err != nil { return nil, err } diff --git a/internal/service/issuer_registry.go b/internal/service/issuer_registry.go index edd9aa4..fb1e49f 100644 --- a/internal/service/issuer_registry.go +++ b/internal/service/issuer_registry.go @@ -32,6 +32,12 @@ type IssuerRegistry struct { // with the CA key directly (the historical behaviour, preserved for // callers that don't supply these deps). localDeps *LocalIssuerDeps + + // metrics — when set, every adapter constructed by Rebuild is + // wired with SetMetrics so issuance / renewal calls flow through + // the per-issuer-type counter + histogram + failure tables. + // Closes the #4 audit-readiness blocker (per-issuer-type metrics). + metrics *IssuanceMetrics } // LocalIssuerDeps groups the optional dependencies that the local @@ -67,6 +73,16 @@ func (r *IssuerRegistry) SetLocalIssuerDeps(deps *LocalIssuerDeps) { r.localDeps = deps } +// SetIssuanceMetrics wires per-issuer-type issuance metrics. Every +// adapter constructed by Rebuild after this call records issuance / +// renewal calls into the supplied metrics tables. Closes the #4 +// audit-readiness blocker (per-issuer-type metrics). +func (r *IssuerRegistry) SetIssuanceMetrics(m *IssuanceMetrics) { + r.mu.Lock() + defer r.mu.Unlock() + r.metrics = m +} + // Get returns the issuer connector for the given ID and whether it exists. func (r *IssuerRegistry) Get(id string) (IssuerConnector, bool) { r.mu.RLock() @@ -173,7 +189,19 @@ func (r *IssuerRegistry) Rebuild(ctx context.Context, configs []*domain.Issuer, "key_dir", r.localDeps.KeyDir) } - newIssuers[cfg.ID] = NewIssuerConnectorAdapter(connector) + adapter := NewIssuerConnectorAdapter(connector) + // Wire per-issuer-type metrics (audit fix #4) when SetIssuanceMetrics + // was called. The adapter is the IssuerConnector interface; type- + // assert to the concrete *IssuerConnectorAdapter so we can call + // SetMetrics. Tests that hand-construct adapters via the bare + // NewIssuerConnectorAdapter constructor get nil metrics — the + // adapter no-ops the recording in that case. + if r.metrics != nil { + if a, ok := adapter.(*IssuerConnectorAdapter); ok { + a.SetMetrics(string(cfg.Type), r.metrics) + } + } + newIssuers[cfg.ID] = adapter r.logger.Info("issuer loaded into registry", "id", cfg.ID, "type", cfg.Type) }