mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:01:32 +00:00
3b92048242
Closes the #4 acquisition-readiness blocker from the 2026-05-01 issuer coverage audit. Before this commit, certctl's Prometheus exposition had zero per-issuer-type signal — operators answering "is DigiCert slow?" or "is Sectigo failing more than ACME?" had to grep logs by issuer name. This commit adds three series labelled by issuer type: certctl_issuance_total{issuer_type, outcome} certctl_issuance_duration_seconds{issuer_type} (histogram) certctl_issuance_failures_total{issuer_type, error_class} The histogram covers 0.05–120 second buckets to span the local-issuer fast path and async-CA slow path (DigiCert/Sectigo/Entrust polling can take minutes). error_class is a closed enum of eight values (timeout, auth, rate_limited, validation, upstream_5xx, upstream_4xx, network, other) classified once in service.ClassifyError. Cardinality budget is ~276 new series, well within Prometheus's comfortable range. Implementation: - service.IssuanceMetrics is the thread-safe counter + histogram table. Three independent views (counters / failures / durations) exposed via SnapshotCounters / SnapshotFailures / SnapshotDurations. sync.RWMutex protects the map shape; per-key sync/atomic.Uint64 primitives keep the recording hot path lock-free under concurrent service-layer goroutines. - service.IssuanceCounterEntry / IssuanceFailureEntry / IssuanceDurationEntry / IssuanceMetricsSnapshotter live in service (not handler) to avoid an import cycle: handler already imports service for admin_est.go etc., so service can't import handler back. Handler's exposer takes the snapshotter via the service-defined interface. - service.ClassifyError pure function maps error → error_class. context.DeadlineExceeded / context.Canceled → timeout; *net.OpError → network; substring matches against canonical AWS / DigiCert / Sectigo error shapes for auth / rate_limited / validation / upstream_5xx / upstream_4xx / network; unknown → other. 
Each branch has at least one representative test case in TestClassifyError. - IssuerConnectorAdapter.SetMetrics wires per-adapter recording (issuerType + metrics). Existing 28+ test call sites of NewIssuerConnectorAdapter keep their one-arg signature; production wiring goes through SetMetrics post-construction. - IssuerRegistry.SetIssuanceMetrics + Rebuild type-asserts to *IssuerConnectorAdapter and calls SetMetrics with the issuer type string. nil-guarded — tests that hand-build adapters without metrics get no-op recording. - IssuerConnectorAdapter.IssueCertificate / RenewCertificate wrap the underlying connector call with start := time.Now() and recordIssuance(start, err). Renewal is recorded into the same certctl_issuance_* series as initial issuance — operationally, renewal IS issuance from the connector's perspective (matches the audit prompt's guidance on series naming). - handler/metrics.go GetPrometheusMetrics gains a new exposer block emitting all three series in stable label order with correct Prometheus format (_bucket / _sum / _count for the histogram, +Inf bucket appended). Sorted via sort.Slice for stable output. nil- guarded so deploys without the wire produce clean exposition. - formatLE helper trims trailing zeros from histogram bucket labels via strconv.FormatFloat(le, 'f', -1, 64) so the `le` labels match Prometheus client conventions ("0.05", "30", "120", not "0.0500" etc.). - cmd/server/main.go wires a single IssuanceMetrics instance into both the IssuerRegistry (recording) and the MetricsHandler (exposer) using DefaultIssuanceBucketBoundaries. Tests: - TestIssuanceMetrics_RecordAndSnapshot — happy-path counter + histogram + failure recording, BucketBoundaries returns a copy (not shared storage). - TestIssuanceMetrics_HistogramCumulative — pins the cumulative-buckets contract. 100ms observation lands in 0.1 bucket and every larger bucket; 750ms only in the 1.0 bucket. Off-by-one here would corrupt every quantile query downstream. 
- TestIssuanceMetrics_Concurrency — 100 goroutines × 1000 ops under the race detector. Asserts atomic counter integrity across contended writes. - TestClassifyError — 17 cases covering every branch of the closed enum plus the nil-error special case. Implementation chooses the existing hand-rolled fmt.Fprintf exposition pattern (no prometheus/client_golang dependency added) to stay consistent with the OCSP / deploy counter blocks already in the file. Out of scope (separate follow-ups): - Revocation metrics (certctl_revocation_*) — symmetric to issuance but the audit didn't ask; explicit follow-up commit. - Discovery / health-check duration histograms. - prometheus/client_golang migration. Verified locally: - gofmt clean - go vet ./... clean - staticcheck ./... clean - golangci-lint run --timeout 5m ./... → 0 issues - go test -short -count=1 ./internal/service/ green - go test -short -count=1 -race -run TestIssuanceMetrics ./internal/service/ green - go test -short -count=1 ./internal/api/handler/ green - go build ./... success Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md Top-10 fix #4 (Part 3, narrative section).
367 lines
12 KiB
Go
367 lines
12 KiB
Go
// Copyright (c) certctl
|
|
// SPDX-License-Identifier: BSL-1.1
|
|
|
|
package service
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"net"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
// IssuanceCounterEntry is a single row of the per-issuer-type issuance
// counter table: how many issuance attempts of a given outcome were
// recorded for a given issuer type. Added for the per-issuer-type
// metrics item (#4) of the 2026-05-01 issuer coverage audit.
type IssuanceCounterEntry struct {
	// IssuerType is the issuer_type label value; Outcome is the outcome
	// label value ("success" | "failure").
	IssuerType, Outcome string
	// Count is the counter value at snapshot time.
	Count uint64
}
|
|
|
|
// IssuanceFailureEntry is a single row of the issuance-failure counter
// table: how many failures of a given error class were recorded for a
// given issuer type.
//
// ErrorClass is meant to come from a closed enum of eight values
// (timeout, auth, rate_limited, validation, upstream_5xx, upstream_4xx,
// network, other); keeping the enum closed is what keeps this metric's
// cardinality tractable.
type IssuanceFailureEntry struct {
	// IssuerType is the issuer_type label value; ErrorClass is the
	// error_class label value.
	IssuerType, ErrorClass string
	// Count is the counter value at snapshot time.
	Count uint64
}
|
|
|
|
// IssuanceDurationEntry is the histogram snapshot for one issuer type.
type IssuanceDurationEntry struct {
	// IssuerType is the issuer_type label value.
	IssuerType string

	// Buckets holds cumulative observation counts, one per boundary, in
	// the same order as the boundaries reported by the snapshotter's
	// BucketBoundaries method.
	Buckets []uint64

	// Sum is the total of all observed durations, in seconds.
	Sum float64

	// Count is the total number of observations (the value that belongs
	// to the +Inf bucket).
	Count uint64
}
|
|
|
|
// IssuanceMetricsSnapshotter is the surface MetricsHandler consumes
|
|
// for per-issuer-type issuance metrics. The handler imports this
|
|
// interface so the snapshot types stay in the service package
|
|
// (avoids an import cycle: handler imports service for the
|
|
// admin_est / admin_scep_intune handlers, so the reverse direction
|
|
// can't import handler).
|
|
//
|
|
// *IssuanceMetrics satisfies this interface; the production wiring
|
|
// in cmd/server/main.go passes the same instance into both the
|
|
// IssuerRegistry (for adapter-side recording) and the MetricsHandler
|
|
// (for Prometheus exposition).
|
|
type IssuanceMetricsSnapshotter interface {
|
|
SnapshotCounters() []IssuanceCounterEntry
|
|
SnapshotFailures() []IssuanceFailureEntry
|
|
SnapshotDurations() []IssuanceDurationEntry
|
|
BucketBoundaries() []float64
|
|
}
|
|
|
|
// DefaultIssuanceBucketBoundaries lists the histogram upper bounds, in
// seconds, used for issuance durations. The range spans the
// local-issuer fast path (sub-100ms signing) up to the async-CA slow
// path (DigiCert / Sectigo / Entrust polling can take minutes). The
// +Inf bucket is not listed here; the Prometheus exposer appends it.
//
// The boundaries were picked with operator alerting in mind: 0.05s
// flags a local signer that has gone non-cooperative, 30s flags an
// async CA that is slow but not stuck, and 120s flags polling that has
// effectively stalled.
var DefaultIssuanceBucketBoundaries = []float64{
	0.05, 0.1, 0.25, 0.5,
	1, 2.5, 5, 10,
	30, 60, 120,
}
|
|
|
|
// IssuanceMetrics is a thread-safe in-memory counter + histogram table
|
|
// for per-issuer-type issuance signals. Closes the #4 acquisition-
|
|
// readiness blocker from the 2026-05-01 issuer coverage audit
|
|
// (per-issuer-type metrics).
|
|
//
|
|
// Three independent views — counter, failures, durations — are exposed
|
|
// via the Snapshot* methods so handler.IssuanceMetricsSnapshotter is
|
|
// satisfied.
|
|
//
|
|
// Cardinality is bounded by:
|
|
// - Closed enum of issuer types (12 currently)
|
|
// - "success" / "failure" outcome strings (2)
|
|
// - 8-value error_class enum (timeout, auth, rate_limited,
|
|
// validation, upstream_5xx, upstream_4xx, network, other)
|
|
// - Fixed bucket boundaries (11 + implicit +Inf in exposer)
|
|
//
|
|
// Underlying maps grow to a fixed upper bound and stop. A new issuer
|
|
// type appears once and never explodes the cardinality.
|
|
type IssuanceMetrics struct {
|
|
bucketBoundaries []float64
|
|
|
|
mu sync.RWMutex
|
|
counters map[counterKey]*atomic.Uint64
|
|
failures map[failureKey]*atomic.Uint64
|
|
durations map[string]*durationState // key: issuer_type
|
|
}
|
|
|
|
// counterKey identifies one cell of the issuance counter table.
type counterKey struct {
	IssuerType string
	Outcome    string
}

// failureKey identifies one cell of the issuance-failure counter table.
type failureKey struct {
	IssuerType string
	ErrorClass string
}
|
|
|
|
// durationState is the live histogram for a single issuer type. Every
// field is an atomic so observations can be recorded without holding
// the owning table's lock.
type durationState struct {
	// buckets holds the cumulative per-boundary observation counts.
	buckets []atomic.Uint64

	// sumMillis accumulates the observed durations in whole
	// milliseconds. An integer is used so the sum can be maintained
	// with atomic adds; snapshots convert it back to float seconds.
	sumMillis atomic.Uint64

	// count is the total number of observations.
	count atomic.Uint64
}
|
|
|
|
// NewIssuanceMetrics constructs a fresh IssuanceMetrics with the given
|
|
// bucket boundaries. Pass DefaultIssuanceBucketBoundaries unless tests
|
|
// need a different shape.
|
|
func NewIssuanceMetrics(buckets []float64) *IssuanceMetrics {
|
|
cp := make([]float64, len(buckets))
|
|
copy(cp, buckets)
|
|
return &IssuanceMetrics{
|
|
bucketBoundaries: cp,
|
|
counters: make(map[counterKey]*atomic.Uint64),
|
|
failures: make(map[failureKey]*atomic.Uint64),
|
|
durations: make(map[string]*durationState),
|
|
}
|
|
}
|
|
|
|
// RecordIssuance bumps the (issuer_type, outcome) counter and observes
|
|
// the duration into the (issuer_type) histogram. outcome is
|
|
// "success" or "failure"; pass "" only if you intend to record neither
|
|
// (the call returns without effect).
|
|
func (m *IssuanceMetrics) RecordIssuance(issuerType, outcome string, duration time.Duration) {
|
|
if issuerType == "" || outcome == "" {
|
|
return
|
|
}
|
|
m.bumpCounter(counterKey{IssuerType: issuerType, Outcome: outcome})
|
|
m.observeDuration(issuerType, duration)
|
|
}
|
|
|
|
// RecordFailure bumps the (issuer_type, error_class) failure counter.
|
|
// Caller is responsible for classifying the error via ClassifyError;
|
|
// passing an off-enum value will silently grow the cardinality
|
|
// (closed-enum discipline is the caller's contract).
|
|
func (m *IssuanceMetrics) RecordFailure(issuerType, errorClass string) {
|
|
if issuerType == "" || errorClass == "" {
|
|
return
|
|
}
|
|
m.bumpFailure(failureKey{IssuerType: issuerType, ErrorClass: errorClass})
|
|
}
|
|
|
|
func (m *IssuanceMetrics) bumpCounter(k counterKey) {
|
|
m.mu.RLock()
|
|
c, ok := m.counters[k]
|
|
m.mu.RUnlock()
|
|
if !ok {
|
|
m.mu.Lock()
|
|
c, ok = m.counters[k]
|
|
if !ok {
|
|
c = new(atomic.Uint64)
|
|
m.counters[k] = c
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
c.Add(1)
|
|
}
|
|
|
|
func (m *IssuanceMetrics) bumpFailure(k failureKey) {
|
|
m.mu.RLock()
|
|
c, ok := m.failures[k]
|
|
m.mu.RUnlock()
|
|
if !ok {
|
|
m.mu.Lock()
|
|
c, ok = m.failures[k]
|
|
if !ok {
|
|
c = new(atomic.Uint64)
|
|
m.failures[k] = c
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
c.Add(1)
|
|
}
|
|
|
|
func (m *IssuanceMetrics) observeDuration(issuerType string, duration time.Duration) {
|
|
m.mu.RLock()
|
|
state, ok := m.durations[issuerType]
|
|
m.mu.RUnlock()
|
|
if !ok {
|
|
m.mu.Lock()
|
|
state, ok = m.durations[issuerType]
|
|
if !ok {
|
|
state = &durationState{
|
|
buckets: make([]atomic.Uint64, len(m.bucketBoundaries)),
|
|
}
|
|
m.durations[issuerType] = state
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
seconds := duration.Seconds()
|
|
// Cumulative buckets: bump every bucket whose boundary >= seconds.
|
|
for i, le := range m.bucketBoundaries {
|
|
if seconds <= le {
|
|
state.buckets[i].Add(1)
|
|
}
|
|
}
|
|
// sumMillis: store the duration in milliseconds (uint64) to keep
|
|
// atomic. Snapshot converts back to seconds.
|
|
state.sumMillis.Add(uint64(duration.Milliseconds()))
|
|
state.count.Add(1)
|
|
}
|
|
|
|
// SnapshotCounters returns a stable copy of the (issuer_type, outcome,
|
|
// count) tuples. Safe to call concurrently with RecordIssuance.
|
|
func (m *IssuanceMetrics) SnapshotCounters() []IssuanceCounterEntry {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
out := make([]IssuanceCounterEntry, 0, len(m.counters))
|
|
for k, v := range m.counters {
|
|
out = append(out, IssuanceCounterEntry{
|
|
IssuerType: k.IssuerType,
|
|
Outcome: k.Outcome,
|
|
Count: v.Load(),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// SnapshotFailures returns a stable copy of the (issuer_type,
|
|
// error_class, count) tuples. Safe to call concurrently.
|
|
func (m *IssuanceMetrics) SnapshotFailures() []IssuanceFailureEntry {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
out := make([]IssuanceFailureEntry, 0, len(m.failures))
|
|
for k, v := range m.failures {
|
|
out = append(out, IssuanceFailureEntry{
|
|
IssuerType: k.IssuerType,
|
|
ErrorClass: k.ErrorClass,
|
|
Count: v.Load(),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// SnapshotDurations returns a stable copy of the (issuer_type, buckets,
|
|
// sum, count) tuples. The buckets slice is in the order matching
|
|
// BucketBoundaries(); sum is in seconds. Safe to call concurrently.
|
|
func (m *IssuanceMetrics) SnapshotDurations() []IssuanceDurationEntry {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
out := make([]IssuanceDurationEntry, 0, len(m.durations))
|
|
for issuerType, state := range m.durations {
|
|
buckets := make([]uint64, len(state.buckets))
|
|
for i := range state.buckets {
|
|
buckets[i] = state.buckets[i].Load()
|
|
}
|
|
out = append(out, IssuanceDurationEntry{
|
|
IssuerType: issuerType,
|
|
Buckets: buckets,
|
|
Sum: float64(state.sumMillis.Load()) / 1000.0,
|
|
Count: state.count.Load(),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// BucketBoundaries returns a copy of the bucket boundaries used by
|
|
// this IssuanceMetrics. Used by the Prometheus exposer to label the
|
|
// histogram buckets.
|
|
func (m *IssuanceMetrics) BucketBoundaries() []float64 {
|
|
out := make([]float64, len(m.bucketBoundaries))
|
|
copy(out, m.bucketBoundaries)
|
|
return out
|
|
}
|
|
|
|
// Compile-time guard: *IssuanceMetrics satisfies
|
|
// IssuanceMetricsSnapshotter.
|
|
var _ IssuanceMetricsSnapshotter = (*IssuanceMetrics)(nil)
|
|
|
|
// ClassifyError maps an arbitrary error to one of eight closed-enum
// error_class values: timeout, auth, rate_limited, validation,
// upstream_5xx, upstream_4xx, network, other. A nil error yields ""
// (callers are not expected to pass nil).
//
// The classification is deterministic and uses no regular expressions
// and no reflection beyond errors.Is / errors.As; its cost is linear in
// the length of the error message.
//
// Checks run in this order, first match wins:
//  1. context.DeadlineExceeded / context.Canceled anywhere in the
//     chain → timeout.
//  2. a *net.OpError anywhere in the chain → network.
//  3. case-insensitive substring matches on err.Error(), in the order
//     timeout, auth, rate_limited, validation, upstream_5xx,
//     upstream_4xx, network.
//  4. anything else → other.
//
// Numeric HTTP status codes ("401", "429", "500", ...) only count when
// they are not part of a longer run of digits, so a port such as
// ":5000" or a request id such as "14291" is not mistaken for a status
// code.
//
// Adding a ninth value is a deliberate change that requires updating
// the docs/metrics.md enum list and any operator alerting rules that
// pin specific labels — do NOT expand the enum casually; classify edge
// cases as "other" and document the case if it matters.
func ClassifyError(err error) string {
	if err == nil {
		return "" // caller should not invoke us with nil
	}

	// 1. Context deadline / cancellation → timeout (operators alert on
	//    slow upstream CAs via this label).
	if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
		return "timeout"
	}

	// 2. Typed network-layer errors → network. This runs before the
	//    message scan, so an OpError whose text mentions a timeout is
	//    still reported as network.
	var opErr *net.OpError
	if errors.As(err, &opErr) {
		return "network"
	}

	msg := strings.ToLower(err.Error())

	// hasAny reports whether msg contains at least one of the needles.
	hasAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(msg, needle) {
				return true
			}
		}
		return false
	}

	isDigit := func(b byte) bool { return '0' <= b && b <= '9' }

	// hasCode reports whether msg contains at least one of the numeric
	// codes as a standalone number, i.e. with no digit directly before
	// or after it.
	hasCode := func(codes ...string) bool {
		for _, code := range codes {
			for from := 0; from < len(msg); {
				rel := strings.Index(msg[from:], code)
				if rel < 0 {
					break
				}
				start := from + rel
				end := start + len(code)
				leftOK := start == 0 || !isDigit(msg[start-1])
				rightOK := end == len(msg) || !isDigit(msg[end])
				if leftOK && rightOK {
					return true
				}
				from = start + 1
			}
		}
		return false
	}

	// 3. Substring matches against the most common upstream-CA error
	//    shapes. Order matters — auth and rate-limited need to win over
	//    generic 4xx, and 5xx needs to win over generic "internal"
	//    matches.
	switch {
	case hasAny("deadline exceeded", "timeout"): // "timeout" also covers "i/o timeout"
		return "timeout"
	case hasCode("401"),
		hasAny("unauthorized", "accessdenied", "access denied", "forbidden"):
		return "auth"
	case hasCode("429"),
		hasAny("ratelimit", "rate limit", "throttl"):
		return "rate_limited"
	case hasAny("csr", "validate", "validation", "invalid", "malformed"):
		return "validation"
	case hasCode("500", "502", "503", "504"),
		hasAny("5xx", "serviceunavailable", "service unavailable",
			"internalerror", "internal server error"):
		return "upstream_5xx"
	case hasCode("404", "400"),
		hasAny("4xx", "notfound", "not found", "badrequest", "bad request"):
		return "upstream_4xx"
	case hasAny("no such host", "connection refused", "tls handshake",
		"network", "dial tcp", "broken pipe"):
		return "network"
	}
	return "other"
}
|