Files
certctl/internal/service/approval_metrics.go
T
shankar0123 8043e2bbac service: ApprovalService + ApprovalMetrics + 8 table-driven tests
Rank 7 of the 2026-05-03 Infisical deep-research deliverable, commit 2 of 4
(cowork/rank-7-approval-workflow-primitive-prompt.md). Builds on the
foundation in commit 2025275 — wires the service layer that drives the
approval workflow. Still no handler / integration wiring; commits 3-4
land that.

Files added:
  internal/service/approval.go         - ApprovalService struct + 6
                                          methods: RequestApproval,
                                          Approve, Reject, ListPending,
                                          List, Get, ExpireStale.
                                          Same-actor RBAC check
                                          (ErrApproveBySameActor) at
                                          both Approve and Reject; the
                                          load-bearing two-person
                                          integrity gate. Bypass mode
                                          short-circuits via
                                          approveInternal(outcome=
                                          "bypassed", actorType=System).
                                          Audit + metric emission per
                                          decision via shared
                                          recordAudit helper. Tolerates
                                          nil AuditService for tests.
                                          Service depends on a narrow
                                          JobStatusUpdater interface
                                          (single-method) rather than
                                          the full repository.JobRepository
                                          — production wiring satisfies
                                          it implicitly via postgres'
                                          existing UpdateStatus.

  internal/service/approval_metrics.go - ApprovalMetrics: thread-safe
                                          counter table (decisions
                                          counter dimensioned by
                                          outcome × profile_id) + a
                                          custom durationHistogram for
                                          pending-age (le buckets:
                                          60, 300, 1800, 3600, 21600,
                                          86400, +Inf — 1m, 5m, 30m,
                                          1h, 6h, 24h, beyond).
                                          Snapshot* methods return the
                                          Prometheus exposer's input
                                          shapes. Mirrors the
                                          ExpiryAlertMetrics +
                                          VaultRenewalMetrics pattern
                                          from prior ranks.

  internal/service/approval_test.go    - 8 table-driven tests with
                                          tight in-package fakes
                                          (fakeApprovalRepo +
                                          fakeJobStateRepo):
                                            TestApproval_RequestCreatesPendingRow_BypassDisabled
                                            TestApproval_BypassMode_AutoApprovesWithSystemBypassActor
                                            TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending
                                            TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled
                                            TestApproval_Approve_RejectsSameActor
                                              ↑ THE LOAD-BEARING TWO-PERSON
                                                INTEGRITY TEST. PCI-DSS 6.4.5
                                                / NIST 800-53 SA-15 / SOC 2
                                                CC6.1 compliance auditors
                                                pattern-match against this.
                                                Pins same-actor rejection on
                                                both Approve and Reject paths;
                                                pins success when a different
                                                actor approves.
                                            TestApproval_Approve_RejectsAlreadyDecided
                                            TestApproval_ExpireStale_TransitionsPendingToExpired_AndCancelsJob
                                            TestApproval_MetricCounterIncrements

Verified:
  gofmt: clean.
  go vet ./internal/service/...: exit 0.
  go test -short -count=1 -run TestApproval ./internal/service/...:
    ok 0.005s — all 8 tests green.

Out of scope for this commit (lands in commits 3-4):
  - api/handler/approval.go (5 endpoints + handler-side RBAC).
  - api/openapi.yaml extensions.
  - Integration into CertificateService.TriggerRenewal +
    RenewalService.CheckExpiringCertificates + Scheduler.ReapTimedOutJobs.
  - cmd/server/main.go wiring of ApprovalService + ApprovalMetrics.
  - Config.Approval.BypassEnabled + CERTCTL_APPROVAL_BYPASS env var.
  - docs/connectors.md row + docs/approval-workflow.md runbook.

Reference: cowork/rank-7-approval-workflow-primitive-prompt.md.
2026-05-04 01:01:53 +00:00

213 lines
6.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"math"
"sort"
"sync"
"sync/atomic"
)
// ApprovalMetrics is a thread-safe counter table for the issuance
// approval-workflow dispatch path. Rank 7 of the 2026-05-03 Infisical
// deep-research deliverable. Mirrors the ExpiryAlertMetrics +
// VaultRenewalMetrics shape: cmd/server/main.go constructs ONE instance,
// passes it to ApprovalService (recording side) AND metricsHandler
// (exposing side) so the snapshotter is the single source of truth.
//
// Dimensions:
//
// outcome — closed enum from internal/domain/approval.go:
// "approved" — Approve transitioned a pending request.
// "rejected" — Reject transitioned a pending request.
// "expired" — scheduler reaper transitioned a stale
// pending request via ExpireStale.
// "bypassed" — CERTCTL_APPROVAL_BYPASS=true short-
// circuited RequestApproval. Production
// deploys MUST have zero rows of this
// outcome.
// profile_id — CertificateProfile.ID that drove the gate. Bounded
// cardinality (operators have <100 profiles in production).
//
// Cardinality bound: 4 outcomes × N profiles. With N=100, that's 400
// series — well within Prometheus's per-target series budget for a
// well-bounded label.
//
// Pending-age histogram: ObservePendingAge records the seconds-since-
// creation of a pending approval at the moment of decision. Operators
// alert when the p99 hits hours/days (compliance has a deadline).
// Bucket boundaries: 60, 300, 1800, 3600, 21600, 86400, +Inf — 1
// minute, 5 minutes, 30 minutes, 1 hour, 6 hours, 24 hours, beyond.
type ApprovalMetrics struct {
mu sync.RWMutex
counters map[approvalKey]*atomic.Uint64
pendingAgeHist *approvalDurationHistogram
}
type approvalKey struct {
Outcome string
ProfileID string
}
// NewApprovalMetrics returns a zero-value ApprovalMetrics ready for
// concurrent use. The caller MUST register the same instance on both
// the ApprovalService (recording) and the MetricsHandler (exposing)
// sides.
func NewApprovalMetrics() *ApprovalMetrics {
return &ApprovalMetrics{
counters: make(map[approvalKey]*atomic.Uint64),
pendingAgeHist: newApprovalDurationHistogram(),
}
}
// RecordDecision bumps the (outcome, profile_id) counter by one. Called
// from ApprovalService.Approve / Reject / ExpireStale and from the
// bypass-mode short-circuit inside RequestApproval.
func (m *ApprovalMetrics) RecordDecision(outcome, profileID string) {
if m == nil {
return
}
key := approvalKey{Outcome: outcome, ProfileID: profileID}
m.mu.RLock()
c, ok := m.counters[key]
m.mu.RUnlock()
if !ok {
m.mu.Lock()
c, ok = m.counters[key]
if !ok {
c = &atomic.Uint64{}
m.counters[key] = c
}
m.mu.Unlock()
}
c.Add(1)
}
// ObservePendingAge records the seconds-since-creation of a pending
// approval at the moment of decision (Approve / Reject / Expire).
func (m *ApprovalMetrics) ObservePendingAge(seconds float64) {
if m == nil {
return
}
m.pendingAgeHist.observe(seconds)
}
// SnapshotApprovalDecisions returns the current decision counter table
// as a sorted slice for deterministic Prometheus exposition. Sort key
// is (outcome, profile_id).
type ApprovalDecisionEntry struct {
Outcome string
ProfileID string
Count uint64
}
func (m *ApprovalMetrics) SnapshotApprovalDecisions() []ApprovalDecisionEntry {
if m == nil {
return nil
}
m.mu.RLock()
out := make([]ApprovalDecisionEntry, 0, len(m.counters))
for k, c := range m.counters {
out = append(out, ApprovalDecisionEntry{
Outcome: k.Outcome,
ProfileID: k.ProfileID,
Count: c.Load(),
})
}
m.mu.RUnlock()
sort.Slice(out, func(i, j int) bool {
if out[i].Outcome != out[j].Outcome {
return out[i].Outcome < out[j].Outcome
}
return out[i].ProfileID < out[j].ProfileID
})
return out
}
// SnapshotApprovalPendingAgeHistogram returns the current bucket counts
// + sum + total count for the pending-age histogram. Format suits the
// Prometheus histogram exposition (le buckets + _sum + _count).
type ApprovalPendingAgeSnapshot struct {
BucketBounds []float64 // [60, 300, 1800, 3600, 21600, 86400] — exclusive of +Inf
BucketCounts []uint64 // cumulative counts per bucket; len = len(BucketBounds) + 1 (last is +Inf)
Sum float64
Count uint64
}
func (m *ApprovalMetrics) SnapshotApprovalPendingAgeHistogram() ApprovalPendingAgeSnapshot {
if m == nil {
return ApprovalPendingAgeSnapshot{}
}
return m.pendingAgeHist.snapshot()
}
// approvalDurationHistogram is a tiny lock-free histogram with fixed
// bucket boundaries for approval-pending-age. Atomic counters per
// bucket + sum stored as uint64-bits-of-float64 atomic.
type approvalDurationHistogram struct {
bounds []float64
buckets []*atomic.Uint64 // len = len(bounds) + 1; last is +Inf
sumBits *atomic.Uint64 // float64 bits stored atomically
count *atomic.Uint64
}
func newApprovalDurationHistogram() *approvalDurationHistogram {
bounds := []float64{60, 300, 1800, 3600, 21600, 86400}
buckets := make([]*atomic.Uint64, len(bounds)+1)
for i := range buckets {
buckets[i] = &atomic.Uint64{}
}
return &approvalDurationHistogram{
bounds: bounds,
buckets: buckets,
sumBits: &atomic.Uint64{},
count: &atomic.Uint64{},
}
}
func (h *approvalDurationHistogram) observe(seconds float64) {
if h == nil {
return
}
// Find the first bucket whose bound is >= seconds.
idx := len(h.bounds) // default to +Inf bucket
for i, b := range h.bounds {
if seconds <= b {
idx = i
break
}
}
h.buckets[idx].Add(1)
h.count.Add(1)
// Atomic float64 add via CAS loop.
for {
oldBits := h.sumBits.Load()
old := math.Float64frombits(oldBits)
newBits := math.Float64bits(old + seconds)
if h.sumBits.CompareAndSwap(oldBits, newBits) {
return
}
}
}
func (h *approvalDurationHistogram) snapshot() ApprovalPendingAgeSnapshot {
if h == nil {
return ApprovalPendingAgeSnapshot{}
}
counts := make([]uint64, len(h.buckets))
cumulative := uint64(0)
for i, b := range h.buckets {
cumulative += b.Load()
counts[i] = cumulative
}
return ApprovalPendingAgeSnapshot{
BucketBounds: append([]float64(nil), h.bounds...),
BucketCounts: counts,
Sum: math.Float64frombits(h.sumBits.Load()),
Count: h.count.Load(),
}
}