Files
certctl/internal/service/issuance_metrics.go
T
shankar0123 21aeed4f4e legal: addlicense headers + normalize legacy variants (Phase 0 RED-4)
Phase 0 closure (Path B2, post-rewrite):

addlicense sweep — adds the canonical certctl LLC copyright + BUSL-1.1
SPDX header to every production Go file. Template:

  // Copyright 2026 certctl LLC. All rights reserved.
  // SPDX-License-Identifier: BUSL-1.1

Coverage: 338 / 338 production Go files (cmd/ + internal/, excluding
*_test.go and **/testdata/**). Pre-sweep coverage was 22 / 338 (6.5%);
post-sweep is 338 / 338 (100%).

Normalized 22 pre-existing legacy headers (`// Copyright (c) certctl`
+ `// SPDX-License-Identifier: BSL-1.1`) and 1 file using a
`Certctl Contributors` attribution. The legacy SPDX ID `BSL-1.1`
is non-standard; the official SPDX identifier for Business Source
License 1.1 is `BUSL-1.1` (capital U). All 338 files now share the
canonical form.

Generated via:
  addlicense -c "certctl LLC" -y 2026 \
    -f cowork/legal/copyright-header.tpl \
    -ignore '**/testdata/**' -ignore '**/*_test.go' \
    cmd/ internal/

Verification:
  find cmd internal -name '*.go' -not -name '*_test.go' \
    -not -path '*/testdata/*' \
    -exec grep -L '^// Copyright 2026 certctl LLC' {} \; | wc -l

  Returns: 0

gofmt clean. Header additions are comments only, no compile impact.

Closes: cowork/certctl-architecture-diligence-audit.html#fix-RED-4
2026-05-13 21:23:35 +00:00

367 lines
12 KiB
Go

// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1
package service
import (
"context"
"errors"
"net"
"strings"
"sync"
"sync/atomic"
"time"
)
// IssuanceCounterEntry is a single row of the per-issuer-type issuance
// counter table: the number of issuances recorded for one
// (issuer_type, outcome) pair. It closes acquisition-readiness blocker
// #4 (per-issuer-type metrics) from the 2026-05-01 issuer coverage
// audit.
type IssuanceCounterEntry struct {
	// IssuerType is the issuer type the count belongs to.
	IssuerType string
	// Outcome is "success" or "failure".
	Outcome string
	// Count is the number of issuances recorded for this pair.
	Count uint64
}
// IssuanceFailureEntry is a single row of the issuance-failure counter
// table: the number of failures recorded for one
// (issuer_type, error_class) pair.
//
// error_class is drawn from a closed enum of eight values — timeout,
// auth, rate_limited, validation, upstream_5xx, upstream_4xx, network,
// other — so the metric's label cardinality stays tractable.
type IssuanceFailureEntry struct {
	// IssuerType is the issuer type the count belongs to.
	IssuerType string
	// ErrorClass is one of the eight error_class enum values.
	ErrorClass string
	// Count is the number of failures recorded for this pair.
	Count uint64
}
// IssuanceDurationEntry is the issuance-duration histogram for a single
// issuer type, as emitted by a snapshot.
type IssuanceDurationEntry struct {
	// IssuerType is the issuer type the histogram belongs to.
	IssuerType string
	// Buckets holds cumulative observation counts, ordered to match the
	// BucketBoundaries reported by the snapshotter.
	Buckets []uint64
	// Sum is the total observed time in seconds.
	Sum float64
	// Count is the total number of observations (the value of the +Inf
	// bucket).
	Count uint64
}
// IssuanceMetricsSnapshotter is the read-only surface MetricsHandler
// consumes to expose per-issuer-type issuance metrics.
//
// The interface and its snapshot types are declared in the service
// package on purpose: handler already imports service (for the
// admin_est / admin_scep_intune handlers), so service importing handler
// would create an import cycle.
//
// *IssuanceMetrics implements this interface. The production wiring in
// cmd/server/main.go hands one shared instance to both the
// IssuerRegistry (adapter-side recording) and the MetricsHandler
// (Prometheus exposition).
type IssuanceMetricsSnapshotter interface {
	// SnapshotCounters returns the (issuer_type, outcome, count) rows.
	SnapshotCounters() []IssuanceCounterEntry
	// SnapshotFailures returns the (issuer_type, error_class, count) rows.
	SnapshotFailures() []IssuanceFailureEntry
	// SnapshotDurations returns one duration histogram per issuer type.
	SnapshotDurations() []IssuanceDurationEntry
	// BucketBoundaries returns the histogram bucket upper bounds, in the
	// order used by IssuanceDurationEntry.Buckets.
	BucketBoundaries() []float64
}
// DefaultIssuanceBucketBoundaries lists the histogram bucket upper
// bounds, in seconds, used for issuance durations. The range spans the
// local-issuer fast path (sub-100ms signing) up to the async-CA slow
// path (DigiCert / Sectigo / Entrust polling can take minutes). The
// +Inf bucket is not listed here; the Prometheus exposer appends it.
//
// The boundaries were picked with operator alerting in mind:
//   - 0.05s flags a local issuer whose signer has gone non-cooperative;
//   - 30s flags an async CA that is slow but not stuck;
//   - 120s flags polling that has effectively stalled.
var DefaultIssuanceBucketBoundaries = []float64{
	0.05, 0.1, 0.25, 0.5,
	1, 2.5, 5, 10,
	30, 60, 120,
}
// IssuanceMetrics is a concurrency-safe, in-memory table of counters
// and histograms holding per-issuer-type issuance signals. It closes
// acquisition-readiness blocker #4 (per-issuer-type metrics) from the
// 2026-05-01 issuer coverage audit.
//
// Three independent views — counters, failures and durations — are
// exposed through the Snapshot* methods, which is what makes
// *IssuanceMetrics satisfy IssuanceMetricsSnapshotter.
//
// Cardinality is bounded by:
//   - the closed enum of issuer types (12 currently);
//   - the two outcome strings, "success" and "failure";
//   - the 8-value error_class enum (timeout, auth, rate_limited,
//     validation, upstream_5xx, upstream_4xx, network, other);
//   - the fixed bucket boundaries (11, plus the implicit +Inf added by
//     the exposer).
//
// As long as callers stay within those enums, the underlying maps grow
// to a fixed upper bound and stop: a new issuer type adds its entries
// once and never explodes the cardinality.
type IssuanceMetrics struct {
	// bucketBoundaries is the private copy of the histogram upper
	// bounds taken at construction time.
	bucketBoundaries []float64

	// mu guards the three maps below. The values stored in the maps are
	// atomics, so they are updated without holding mu once looked up.
	mu        sync.RWMutex
	counters  map[counterKey]*atomic.Uint64
	failures  map[failureKey]*atomic.Uint64
	durations map[string]*durationState // key: issuer_type
}
// counterKey is the map key of the issuance counter table.
type counterKey struct {
	IssuerType string
	Outcome    string
}

// failureKey is the map key of the issuance-failure counter table.
type failureKey struct {
	IssuerType string
	ErrorClass string
}
// durationState is the mutable histogram state kept for one issuer
// type. Every field is updated atomically.
type durationState struct {
	// buckets holds one cumulative counter per configured boundary.
	buckets []atomic.Uint64
	// sumMillis is the running total of observed durations, kept as a
	// whole number of milliseconds so it can be updated with an atomic
	// add. Snapshots convert it back to floating-point seconds.
	sumMillis atomic.Uint64
	// count is the total number of observations.
	count atomic.Uint64
}
// NewIssuanceMetrics returns an empty IssuanceMetrics whose histograms
// use the supplied bucket boundaries. The slice is copied, so later
// changes made by the caller do not affect the returned value. Use
// DefaultIssuanceBucketBoundaries unless a test needs a different
// shape.
func NewIssuanceMetrics(buckets []float64) *IssuanceMetrics {
	m := &IssuanceMetrics{
		counters:  map[counterKey]*atomic.Uint64{},
		failures:  map[failureKey]*atomic.Uint64{},
		durations: map[string]*durationState{},
	}
	m.bucketBoundaries = make([]float64, len(buckets))
	copy(m.bucketBoundaries, buckets)
	return m
}
// RecordIssuance increments the (issuer_type, outcome) counter and
// adds duration to the histogram kept for issuer_type. outcome is
// expected to be "success" or "failure". When either issuerType or
// outcome is empty the call is a no-op: nothing is counted and nothing
// is observed.
func (m *IssuanceMetrics) RecordIssuance(issuerType, outcome string, duration time.Duration) {
	if issuerType == "" {
		return
	}
	if outcome == "" {
		return
	}
	key := counterKey{IssuerType: issuerType, Outcome: outcome}
	m.bumpCounter(key)
	m.observeDuration(issuerType, duration)
}
// RecordFailure increments the (issuer_type, error_class) failure
// counter. When either argument is empty the call is a no-op.
//
// The caller is expected to obtain errorClass from ClassifyError. No
// validation happens here: an off-enum value is accepted and silently
// grows the label cardinality, so keeping to the closed enum is the
// caller's contract.
func (m *IssuanceMetrics) RecordFailure(issuerType, errorClass string) {
	if issuerType == "" {
		return
	}
	if errorClass == "" {
		return
	}
	key := failureKey{IssuerType: issuerType, ErrorClass: errorClass}
	m.bumpFailure(key)
}
// bumpCounter adds one to the counter stored under k, creating the
// counter the first time k is seen.
func (m *IssuanceMetrics) bumpCounter(k counterKey) {
	// Fast path: the counter already exists, so a read lock is enough
	// to fetch the pointer; the increment itself is atomic.
	m.mu.RLock()
	ctr, found := m.counters[k]
	m.mu.RUnlock()
	if found {
		ctr.Add(1)
		return
	}

	// Slow path: take the write lock and look again, because another
	// goroutine may have inserted the counter in between.
	m.mu.Lock()
	if ctr, found = m.counters[k]; !found {
		ctr = new(atomic.Uint64)
		m.counters[k] = ctr
	}
	m.mu.Unlock()
	ctr.Add(1)
}
// bumpFailure adds one to the failure counter stored under k, creating
// the counter the first time k is seen.
func (m *IssuanceMetrics) bumpFailure(k failureKey) {
	// Fast path: the counter already exists, so a read lock is enough
	// to fetch the pointer; the increment itself is atomic.
	m.mu.RLock()
	ctr, found := m.failures[k]
	m.mu.RUnlock()
	if found {
		ctr.Add(1)
		return
	}

	// Slow path: take the write lock and look again, because another
	// goroutine may have inserted the counter in between.
	m.mu.Lock()
	if ctr, found = m.failures[k]; !found {
		ctr = new(atomic.Uint64)
		m.failures[k] = ctr
	}
	m.mu.Unlock()
	ctr.Add(1)
}
// observeDuration records one observation of duration into the
// histogram kept for issuerType, creating that histogram's state the
// first time the issuer type is seen.
//
// A negative duration is clamped to zero before it is recorded.
// Without the clamp, converting the negative millisecond count to
// uint64 would wrap around and either decrement the running sum or
// push it to an enormous value. A clamped observation still counts
// once and lands in every bucket.
//
// The sum is accumulated in whole milliseconds, so each observation
// contributes its duration truncated to the millisecond.
func (m *IssuanceMetrics) observeDuration(issuerType string, duration time.Duration) {
	if duration < 0 {
		duration = 0
	}

	// Look the state up under the read lock first; only take the write
	// lock (and re-check) when the issuer type has not been seen yet.
	m.mu.RLock()
	state, ok := m.durations[issuerType]
	m.mu.RUnlock()
	if !ok {
		m.mu.Lock()
		state, ok = m.durations[issuerType]
		if !ok {
			state = &durationState{
				buckets: make([]atomic.Uint64, len(m.bucketBoundaries)),
			}
			m.durations[issuerType] = state
		}
		m.mu.Unlock()
	}

	// Cumulative buckets: bump every bucket whose boundary >= seconds.
	seconds := duration.Seconds()
	for i, le := range m.bucketBoundaries {
		if seconds <= le {
			state.buckets[i].Add(1)
		}
	}

	// duration is non-negative here, so the conversion cannot wrap.
	state.sumMillis.Add(uint64(duration.Milliseconds()))
	state.count.Add(1)
}
// SnapshotCounters returns a point-in-time copy of every
// (issuer_type, outcome, count) row. The rows come straight from a map
// iteration, so their order is unspecified. Safe to call concurrently
// with RecordIssuance.
func (m *IssuanceMetrics) SnapshotCounters() []IssuanceCounterEntry {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// The map cannot change size while the read lock is held, so the
	// result can be sized exactly and filled by index.
	entries := make([]IssuanceCounterEntry, len(m.counters))
	idx := 0
	for key, ctr := range m.counters {
		entries[idx] = IssuanceCounterEntry{
			IssuerType: key.IssuerType,
			Outcome:    key.Outcome,
			Count:      ctr.Load(),
		}
		idx++
	}
	return entries
}
// SnapshotFailures returns a point-in-time copy of every
// (issuer_type, error_class, count) row. The rows come straight from a
// map iteration, so their order is unspecified. Safe to call
// concurrently.
func (m *IssuanceMetrics) SnapshotFailures() []IssuanceFailureEntry {
	m.mu.RLock()
	defer m.mu.RUnlock()

	// The map cannot change size while the read lock is held, so the
	// result can be sized exactly and filled by index.
	entries := make([]IssuanceFailureEntry, len(m.failures))
	idx := 0
	for key, ctr := range m.failures {
		entries[idx] = IssuanceFailureEntry{
			IssuerType: key.IssuerType,
			ErrorClass: key.ErrorClass,
			Count:      ctr.Load(),
		}
		idx++
	}
	return entries
}
// SnapshotDurations returns a point-in-time copy of the duration
// histogram of every issuer type. Each entry's Buckets slice follows
// the order of BucketBoundaries(), and Sum is expressed in seconds.
// The entries come straight from a map iteration, so their order is
// unspecified. Safe to call concurrently.
func (m *IssuanceMetrics) SnapshotDurations() []IssuanceDurationEntry {
	m.mu.RLock()
	defer m.mu.RUnlock()

	entries := make([]IssuanceDurationEntry, 0, len(m.durations))
	for issuerType, st := range m.durations {
		// Copy the atomic bucket counters into plain values.
		counts := make([]uint64, len(st.buckets))
		for i := range counts {
			counts[i] = st.buckets[i].Load()
		}

		// The sum is stored in milliseconds; report it in seconds.
		millis := st.sumMillis.Load()
		entry := IssuanceDurationEntry{
			IssuerType: issuerType,
			Buckets:    counts,
			Sum:        float64(millis) / 1000.0,
			Count:      st.count.Load(),
		}
		entries = append(entries, entry)
	}
	return entries
}
// BucketBoundaries returns a fresh copy of the histogram bucket
// boundaries this IssuanceMetrics was built with, so callers may
// modify the result freely. The Prometheus exposer uses it to label
// the histogram buckets.
func (m *IssuanceMetrics) BucketBoundaries() []float64 {
	dup := make([]float64, 0, len(m.bucketBoundaries))
	return append(dup, m.bucketBoundaries...)
}
// Compile-time guard: assigning a nil *IssuanceMetrics to the blank
// identifier makes the build fail if *IssuanceMetrics ever stops
// satisfying IssuanceMetricsSnapshotter. Nothing is allocated and
// nothing runs at program start.
var _ IssuanceMetricsSnapshotter = (*IssuanceMetrics)(nil)
// ClassifyError maps an arbitrary error to one of eight closed-enum
// error_class values. The classification is deterministic and cheap:
// no regex compilation and no reflection beyond errors.Is / errors.As.
// It returns "" for a nil error; callers should not pass nil.
//
// Closed enum: timeout, auth, rate_limited, validation, upstream_5xx,
// upstream_4xx, network, other. Adding a ninth value is a deliberate
// change that requires updating the docs/metrics.md enum list and
// any operator alerting rules that pin specific labels — do NOT
// expand the enum casually; classify edge cases as "other" and
// document the case if it matters.
//
// Checks run in this order, first match wins:
//  1. context deadline / cancellation → timeout
//  2. any error in the chain implementing net.Error whose Timeout()
//     reports true (for example a *net.OpError caused by an expired
//     read deadline) → timeout
//  3. any other *net.OpError in the chain → network
//  4. case-insensitive substring matches on the error message
func ClassifyError(err error) string {
	if err == nil {
		return "" // caller should not invoke us with nil
	}

	// 1. Context deadline / cancellation → timeout (the operator
	// alerts on slow upstream CAs via this label).
	if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
		return "timeout"
	}

	// 2. Typed timeouts. This has to run before the *net.OpError check
	// below: an I/O deadline expiry is delivered as a *net.OpError, and
	// would otherwise be labelled "network" instead of "timeout".
	var netErr net.Error
	if errors.As(err, &netErr) && netErr.Timeout() {
		return "timeout"
	}

	// 3. Remaining network-layer errors (connection refused, DNS, TLS
	// handshake) → network.
	var opErr *net.OpError
	if errors.As(err, &opErr) {
		return "network"
	}

	msg := strings.ToLower(err.Error())
	// containsAny reports whether the lowercased message contains at
	// least one of the given needles.
	containsAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(msg, needle) {
				return true
			}
		}
		return false
	}

	// 4. Substring matches against the most common upstream-CA error
	// shapes. Order matters — auth and rate-limited need to win over
	// generic 4xx, and 5xx needs to win over generic "internal"
	// matches.
	switch {
	case containsAny("deadline exceeded", "timeout"):
		// "timeout" also covers the stdlib's "i/o timeout" text.
		return "timeout"
	case containsAny("401", "403", "unauthorized", "accessdenied", "access denied", "forbidden"):
		return "auth"
	case containsAny("429", "ratelimit", "rate limit", "throttl"):
		return "rate_limited"
	case containsAny("csr", "validate", "validation", "invalid", "malformed"):
		return "validation"
	case containsAny("500", "502", "503", "504", "5xx",
		"serviceunavailable", "service unavailable",
		"internalerror", "internal server error"):
		return "upstream_5xx"
	case containsAny("404", "400", "4xx",
		"notfound", "not found",
		"badrequest", "bad request"):
		return "upstream_4xx"
	case containsAny("no such host", "connection refused", "tls handshake",
		"network", "dial tcp", "broken pipe"):
		return "network"
	}
	return "other"
}