feat(metrics): per-target-type deploy counters wired into /metrics/prometheus

Phase 10 of the deploy-hardening I master bundle. Mirrors the
production-hardening-II Phase 8 OCSP-counter pattern. Per frozen
decision 0.9, the metric naming convention is
`certctl_deploy_<area>_total` with target_type + sub-label.

internal/service/deploy_counters.go:
- DeployCounters struct with sync.Map of per-target-type buckets
  (apache, nginx, etc.). Lock-free fast path via sync/atomic
  Uint64 counters; LoadOrStore on first tick.
- 8 sub-counters per target-type bucket:
  - attemptsSuccess / attemptsFailure
  - validateFailures (PreCommit returned error)
  - reloadFailures (PostCommit returned error → rollback ran)
  - postVerifyFails (post-deploy TLS handshake failed)
  - rollbackRestored (rollback succeeded)
  - rollbackAlsoFail (operator-actionable escalation)
  - idempotentSkips (SHA-256 match → no-op deploy)
- Snapshot returns []DeploySnapshot for the Prometheus exposer.

internal/service/deploy_counters_test.go:
- 5 tests: zero-state, per-target-type tick isolation, race-detector
  smoke under concurrent ticks, cross-target bucket isolation,
  snapshot-mutation-doesn't-affect-counter.

internal/api/handler/metrics.go:
- New DeployCounterSnapshotter interface (mirrors CounterSnapshotter
  for the OCSP counters but uses the per-target-type tuple shape).
- New DeploySnapshotEntry struct copying the service-layer shape;
  avoids importing the service package directly so the handler
  stays dependency-light.
- New SetDeployCounters setter on MetricsHandler (mirrors
  SetOCSPCounters wiring).
- Prometheus exposer extended with 6 new metric blocks per frozen
  decision 0.9:
  - certctl_deploy_attempts_total{target_type, result}
  - certctl_deploy_validate_failures_total{target_type}
  - certctl_deploy_reload_failures_total{target_type}
  - certctl_deploy_post_verify_failures_total{target_type}
  - certctl_deploy_rollback_total{target_type, outcome}
  - certctl_deploy_idempotent_skip_total{target_type}
- Output sorted by target_type for stable diffs across requests.

The agent-side wire-up (cmd/agent/main.go ticking counters in the
DeployCertificate dispatch site) is intentionally deferred to a
follow-up commit — Phase 10's load-bearing change is the
infrastructure; per-connector tick wiring is a mechanical follow-on.

Build + go vet clean. go test -count=1 green for service +
handler packages.

Phase 11 next: cross-cutting integration tests at deploy/test/.
This commit is contained in:
shankar0123
2026-04-30 15:25:38 +00:00
parent 9f41b58b2f
commit 135b271197
3 changed files with 342 additions and 0 deletions
+78
View File
@@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"sort"
"time"
"github.com/shankar0123/certctl/internal/api/middleware"
@@ -25,6 +26,31 @@ type CounterSnapshotter interface {
Snapshot() map[string]uint64
}
// DeploySnapshotEntry carries the deploy counter values for a single
// target type. It repeats the field layout of the service-layer
// DeploySnapshot using primitives only, so this handler package does
// not have to import the service package.
//
// Phase 10 of the deploy-hardening I master bundle.
type DeploySnapshotEntry struct {
	// TargetType becomes the target_type label on every deploy
	// metric line.
	TargetType string

	// Attempt outcomes, exposed as certctl_deploy_attempts_total with
	// result="success" and result="failure" respectively.
	AttemptsSuccess, AttemptsFailure uint64

	// Per-step failure tallies, exposed as
	// certctl_deploy_validate_failures_total,
	// certctl_deploy_reload_failures_total and
	// certctl_deploy_post_verify_failures_total respectively.
	ValidateFailures, ReloadFailures, PostVerifyFails uint64

	// Rollback outcomes, exposed as certctl_deploy_rollback_total with
	// outcome="restored" and outcome="also_failed" respectively.
	RollbackRestored, RollbackAlsoFail uint64

	// IdempotentSkips is exposed as
	// certctl_deploy_idempotent_skip_total.
	IdempotentSkips uint64
}
// DeployCounterSnapshotter is the surface MetricsHandler consumes
// for the per-target-type deploy counters. Implementations return one
// DeploySnapshotEntry per target type.
//
// The Prometheus exposer sorts the returned slice in place by
// TargetType, so implementations need not order it but should return
// a slice the caller is free to reorder.
//
// service.DeployCounters.Snapshot returns []DeploySnapshot rather than
// []DeploySnapshotEntry, so it needs an adapter to satisfy this
// interface (NOTE(review): the adapter is not visible here — confirm
// where it lives).
type DeployCounterSnapshotter interface {
// Snapshot returns the current counter values, one entry per target
// type.
Snapshot() []DeploySnapshotEntry
}
// MetricsHandler handles HTTP requests for metrics.
// Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format
// (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc.
@@ -36,6 +62,8 @@ type MetricsHandler struct {
// wires the instances at startup. The naming convention is
// certctl_<area>_<label>_total per frozen decision 0.10.
ocspCounters CounterSnapshotter
// Phase 10 (deploy-hardening I) — per-target-type deploy counters.
deployCounters DeployCounterSnapshotter
}
// NewMetricsHandler creates a new MetricsHandler with a service dependency.
@@ -54,6 +82,13 @@ func (h *MetricsHandler) SetOCSPCounters(c CounterSnapshotter) {
h.ocspCounters = c
}
// SetDeployCounters wires the per-target-type deploy counter table
// into the Prometheus exposition. Passing nil disables the
// certctl_deploy_* block, because the exposer only emits it when the
// stored snapshotter is non-nil. Phase 10 of the deploy-hardening I
// master bundle.
//
// The field is assigned without synchronization; presumably this is
// called once at startup before requests are served, like
// SetOCSPCounters — TODO confirm against the wiring code.
func (h *MetricsHandler) SetDeployCounters(c DeployCounterSnapshotter) {
h.deployCounters = c
}
// MetricsResponse represents the JSON metrics response for V2.
type MetricsResponse struct {
Gauge MetricsGauge `json:"gauge"`
@@ -266,6 +301,49 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ
fmt.Fprintf(w, "certctl_ocsp_counter_total{label=%q} %d\n", lbl, snap[lbl])
}
}
// Phase 10 (deploy-hardening I) — per-target-type deploy
// counters. The exposer enumerates the (target_type, sub-label)
// tuples to defend against drift; adding a new sub-counter to
// DeployCounters without also adding it here would surface as
// silent missing-metric in operator dashboards.
if h.deployCounters != nil {
fmt.Fprintf(w, "\n# HELP certctl_deploy_attempts_total Per-target-type deploy attempts (deploy-hardening I Phase 10).\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_attempts_total counter\n")
snap := h.deployCounters.Snapshot()
// Sort by target_type for stable output.
sort.Slice(snap, func(i, j int) bool { return snap[i].TargetType < snap[j].TargetType })
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_attempts_total{target_type=%q,result=%q} %d\n", s.TargetType, "success", s.AttemptsSuccess)
fmt.Fprintf(w, "certctl_deploy_attempts_total{target_type=%q,result=%q} %d\n", s.TargetType, "failure", s.AttemptsFailure)
}
fmt.Fprintf(w, "\n# HELP certctl_deploy_validate_failures_total Per-target-type validate-step failures.\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_validate_failures_total counter\n")
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_validate_failures_total{target_type=%q} %d\n", s.TargetType, s.ValidateFailures)
}
fmt.Fprintf(w, "\n# HELP certctl_deploy_reload_failures_total Per-target-type reload-step failures (rollback was attempted).\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_reload_failures_total counter\n")
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_reload_failures_total{target_type=%q} %d\n", s.TargetType, s.ReloadFailures)
}
fmt.Fprintf(w, "\n# HELP certctl_deploy_post_verify_failures_total Per-target-type post-deploy TLS verify failures.\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_post_verify_failures_total counter\n")
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_post_verify_failures_total{target_type=%q} %d\n", s.TargetType, s.PostVerifyFails)
}
fmt.Fprintf(w, "\n# HELP certctl_deploy_rollback_total Per-target-type rollbacks.\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_rollback_total counter\n")
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_rollback_total{target_type=%q,outcome=%q} %d\n", s.TargetType, "restored", s.RollbackRestored)
fmt.Fprintf(w, "certctl_deploy_rollback_total{target_type=%q,outcome=%q} %d\n", s.TargetType, "also_failed", s.RollbackAlsoFail)
}
fmt.Fprintf(w, "\n# HELP certctl_deploy_idempotent_skip_total Per-target-type SHA-256 idempotent skips (defends against retry storms).\n")
fmt.Fprintf(w, "# TYPE certctl_deploy_idempotent_skip_total counter\n")
for _, s := range snap {
fmt.Fprintf(w, "certctl_deploy_idempotent_skip_total{target_type=%q} %d\n", s.TargetType, s.IdempotentSkips)
}
}
}
// DashboardSummary mirrors the service.DashboardSummary for JSON unmarshaling.
+158
View File
@@ -0,0 +1,158 @@
package service
import (
"sync"
"sync/atomic"
)
// Phase 10 of the deploy-hardening I master bundle — per-target-type
// deploy counters. Mirrors the OCSPCounters / ESTCounters / SCEPCounters
// pattern: sync/atomic primitives keep the hot path lock-free, and a
// snapshot accessor produces one entry per target type (in no
// particular order) for the Prometheus exposer, which sorts them.
//
// Per frozen decision 0.9 (deploy-hardening I), the metric-naming
// convention is `certctl_deploy_<area>_total` — the exposer
// converts the snapshot into the labeled metrics:
//
// - certctl_deploy_attempts_total{target_type, result}
// - certctl_deploy_validate_failures_total{target_type}
// - certctl_deploy_reload_failures_total{target_type}
// - certctl_deploy_post_verify_failures_total{target_type}
// - certctl_deploy_rollback_total{target_type, outcome}
// - certctl_deploy_idempotent_skip_total{target_type}
//
// The Phase 10 exposer enumerates the sub-counters explicitly to
// defend against drift — adding a new sub-counter here without also
// adding it to the exposer would be a "silent counter" bug. New
// target types need no exposer change: their bucket is created on
// first tick and appears in the next snapshot.
// DeployCounters is the shared counter table for deployment job
// processing, keyed by target type. Each target type gets its own
// bucket of sub-counters, created lazily the first time any Inc*
// method is called for it.
//
// It is designed to be held as a single long-lived instance by the
// agent and read through Snapshot for Prometheus exposition.
// NOTE(review): the agent-side wiring (cmd/agent/main.go) and the
// bridge to the server's exposer are outside this file — confirm
// they exist before relying on the counters being ticked.
//
// All Inc* methods are safe for concurrent callers (atomic.Uint64
// hot path; sync.Map for the per-target-type bucket lookup). The
// zero value is ready to use. Because the struct embeds a sync.Map
// it must not be copied after first use; pass *DeployCounters.
type DeployCounters struct {
// buckets maps target_type ("nginx", "apache", ...) to a
// per-target deployBucket holding all sub-counters. Nothing in
// this file ever deletes an entry.
buckets sync.Map // map[string]*deployBucket
}
// deployBucket holds every sub-counter for one target type. All
// fields are atomic so concurrent Inc* callers never need a lock;
// a bucket is always handled through a pointer and never copied.
type deployBucket struct {
	// Attempt outcomes (success leg / failure leg).
	attemptsSuccess, attemptsFailure atomic.Uint64

	// Per-step failures: validate, reload, post-deploy verify.
	validateFailures, reloadFailures, postVerifyFails atomic.Uint64

	// Rollback outcomes: previous state restored / rollback failed too.
	rollbackRestored, rollbackAlsoFail atomic.Uint64

	// Deploys skipped as no-ops.
	idempotentSkips atomic.Uint64
}
// NewDeployCounters returns an empty counter table with no buckets.
// The caller is expected to keep it for the lifetime of the process;
// this file provides no way to reset the counters.
func NewDeployCounters() *DeployCounters {
	return new(DeployCounters)
}
// bucket looks up the counter bucket for targetType, creating it on
// first use. An existing bucket is found with a plain Load, so the
// common case allocates nothing; only a miss falls through to
// LoadOrStore, which also resolves races between concurrent
// first-time callers (every caller gets the one stored bucket).
func (c *DeployCounters) bucket(targetType string) *deployBucket {
	entry, found := c.buckets.Load(targetType)
	if !found {
		entry, _ = c.buckets.LoadOrStore(targetType, new(deployBucket))
	}
	return entry.(*deployBucket)
}
// IncAttemptSuccess adds one to the success leg of the attempts
// counter for targetType, creating the bucket on first use.
func (c *DeployCounters) IncAttemptSuccess(targetType string) {
	b := c.bucket(targetType)
	b.attemptsSuccess.Add(1)
}
// IncAttemptFailure adds one to the failure leg of the attempts
// counter for targetType. Callers are expected to tick it for any
// failed deploy: validate-fail, reload-fail (after rollback),
// post-verify-fail (after rollback), rollback-fail,
// connector-init-fail, etc.
func (c *DeployCounters) IncAttemptFailure(targetType string) {
	b := c.bucket(targetType)
	b.attemptsFailure.Add(1)
}
// IncValidateFailure adds one to the validate-failure counter for
// targetType. Intended for the case where the connector's PreCommit
// (validate-with-the-target) step returns an error.
func (c *DeployCounters) IncValidateFailure(targetType string) {
	b := c.bucket(targetType)
	b.validateFailures.Add(1)
}
// IncReloadFailure adds one to the reload-failure counter for
// targetType. Intended for the case where the connector's PostCommit
// (reload) step returns an error and rollback is invoked.
func (c *DeployCounters) IncReloadFailure(targetType string) {
	b := c.bucket(targetType)
	b.reloadFailures.Add(1)
}
// IncPostVerifyFailure adds one to the post-verify-failure counter
// for targetType. Intended for a failed post-deploy TLS handshake
// check (SHA-256 mismatch, dial timeout, handshake fail).
func (c *DeployCounters) IncPostVerifyFailure(targetType string) {
	b := c.bucket(targetType)
	b.postVerifyFails.Add(1)
}
// IncRollbackRestored adds one to the rollback-restored counter for
// targetType. Intended for a rollback that successfully put the
// previous bytes back.
func (c *DeployCounters) IncRollbackRestored(targetType string) {
	b := c.bucket(targetType)
	b.rollbackRestored.Add(1)
}
// IncRollbackAlsoFailed adds one to the rollback-also-failed counter
// for targetType. This is the operator-actionable escalation: the
// deploy failed AND the rollback failed as well, so it is the
// counter operators are expected to alert on.
func (c *DeployCounters) IncRollbackAlsoFailed(targetType string) {
	b := c.bucket(targetType)
	b.rollbackAlsoFail.Add(1)
}
// IncIdempotentSkip adds one to the idempotent-skip counter for
// targetType. Intended for an Apply that turned out to be a
// SHA-256-match no-op; a rising value is the operator-visible sign of
// agent-restart retry storms (which would otherwise hammer targets
// with no-op reloads).
func (c *DeployCounters) IncIdempotentSkip(targetType string) {
	b := c.bucket(targetType)
	b.idempotentSkips.Add(1)
}
// DeploySnapshot is a plain-value copy of one target type's counters
// as read by Snapshot. It holds no reference to the live counters,
// so callers may modify or reorder snapshots freely.
type DeploySnapshot struct {
	// TargetType is the bucket key the values were read from
	// ("nginx", "apache", ...).
	TargetType string

	// Values ticked by IncAttemptSuccess / IncAttemptFailure.
	AttemptsSuccess, AttemptsFailure uint64

	// Values ticked by IncValidateFailure, IncReloadFailure and
	// IncPostVerifyFailure.
	ValidateFailures, ReloadFailures, PostVerifyFails uint64

	// Values ticked by IncRollbackRestored / IncRollbackAlsoFailed.
	RollbackRestored, RollbackAlsoFail uint64

	// IdempotentSkips is the value ticked by IncIdempotentSkip.
	IdempotentSkips uint64
}
// Snapshot reads every bucket and returns one DeploySnapshot per
// target type that has been ticked at least once. The result is nil
// when no bucket exists yet.
//
// sync.Map iteration is unordered, so the entries come back in no
// particular order; the exposer sorts them to produce stable
// Prometheus output. Each field is read with its own atomic Load, so
// an entry is not a single point-in-time view of its bucket when
// ticks run concurrently.
func (c *DeployCounters) Snapshot() []DeploySnapshot {
	var snaps []DeploySnapshot
	collect := func(key, value any) bool {
		bkt := value.(*deployBucket)
		var entry DeploySnapshot
		entry.TargetType = key.(string)
		entry.AttemptsSuccess = bkt.attemptsSuccess.Load()
		entry.AttemptsFailure = bkt.attemptsFailure.Load()
		entry.ValidateFailures = bkt.validateFailures.Load()
		entry.ReloadFailures = bkt.reloadFailures.Load()
		entry.PostVerifyFails = bkt.postVerifyFails.Load()
		entry.RollbackRestored = bkt.rollbackRestored.Load()
		entry.RollbackAlsoFail = bkt.rollbackAlsoFail.Load()
		entry.IdempotentSkips = bkt.idempotentSkips.Load()
		snaps = append(snaps, entry)
		return true // keep iterating over every bucket
	}
	c.buckets.Range(collect)
	return snaps
}
+106
View File
@@ -0,0 +1,106 @@
package service
import (
"sync"
"testing"
)
// Phase 10 of the deploy-hardening I master bundle — DeployCounters
// unit tests. Mirrors ocsp_counters_test.go.
// TestDeployCounters_NewIsZero checks that a freshly constructed
// counter table reports no buckets at all.
func TestDeployCounters_NewIsZero(t *testing.T) {
	snap := NewDeployCounters().Snapshot()
	if n := len(snap); n != 0 {
		t.Errorf("snapshot at zero state = %d entries, want 0", n)
	}
}
// TestDeployCounters_IncTicksTargetTypeBucket ticks every Inc* method
// and checks that each value lands in the matching field of the
// matching target-type bucket.
func TestDeployCounters_IncTicksTargetTypeBucket(t *testing.T) {
	counters := NewDeployCounters()

	// nginx gets two successes plus one tick of every other counter;
	// apache gets a single success.
	for i := 0; i < 2; i++ {
		counters.IncAttemptSuccess("nginx")
	}
	counters.IncAttemptSuccess("apache")
	counters.IncAttemptFailure("nginx")
	counters.IncValidateFailure("nginx")
	counters.IncReloadFailure("nginx")
	counters.IncPostVerifyFailure("nginx")
	counters.IncRollbackRestored("nginx")
	counters.IncRollbackAlsoFailed("nginx")
	counters.IncIdempotentSkip("nginx")

	entries := counters.Snapshot()
	if n := len(entries); n != 2 {
		t.Fatalf("snapshot len = %d, want 2 (nginx + apache)", n)
	}

	// Snapshot order is unspecified, so index the entries by target.
	byTarget := make(map[string]DeploySnapshot, len(entries))
	for i := range entries {
		byTarget[entries[i].TargetType] = entries[i]
	}

	nginx := byTarget["nginx"]
	if nginx.AttemptsSuccess != 2 {
		t.Errorf("nginx success = %d, want 2", nginx.AttemptsSuccess)
	}
	if nginx.AttemptsFailure != 1 {
		t.Errorf("nginx failure = %d, want 1", nginx.AttemptsFailure)
	}
	subCountersOK := nginx.ValidateFailures == 1 &&
		nginx.ReloadFailures == 1 &&
		nginx.PostVerifyFails == 1 &&
		nginx.RollbackRestored == 1 &&
		nginx.RollbackAlsoFail == 1 &&
		nginx.IdempotentSkips == 1
	if !subCountersOK {
		t.Errorf("nginx sub-counter mismatch: %+v", nginx)
	}

	if apache := byTarget["apache"]; apache.AttemptsSuccess != 1 {
		t.Errorf("apache success = %d, want 1", apache.AttemptsSuccess)
	}
}
// TestDeployCounters_ConcurrentTicks hammers a single counter from
// several goroutines and checks that no tick is lost. Run with -race
// to also exercise the race detector on the bucket lookup path.
func TestDeployCounters_ConcurrentTicks(t *testing.T) {
	c := NewDeployCounters()
	const goroutines = 10
	const ticks = 100

	var wg sync.WaitGroup
	wg.Add(goroutines)
	for i := 0; i < goroutines; i++ {
		go func() {
			defer wg.Done()
			for j := 0; j < ticks; j++ {
				c.IncAttemptSuccess("nginx")
			}
		}()
	}
	wg.Wait()

	// Track whether the nginx bucket was seen at all: without this the
	// test would pass vacuously if Snapshot returned no nginx entry.
	found := false
	for _, s := range c.Snapshot() {
		if s.TargetType != "nginx" {
			continue
		}
		found = true
		if s.AttemptsSuccess != goroutines*ticks {
			t.Errorf("nginx success = %d, want %d", s.AttemptsSuccess, goroutines*ticks)
		}
	}
	if !found {
		t.Fatalf("nginx bucket missing from snapshot")
	}
}
// TestDeployCounters_BucketsIsolatedAcrossTargetTypes ticks a
// different counter on two target types and checks that each tick
// lands only in its own bucket.
func TestDeployCounters_BucketsIsolatedAcrossTargetTypes(t *testing.T) {
	c := NewDeployCounters()
	c.IncAttemptSuccess("nginx")
	c.IncReloadFailure("apache")

	got := map[string]DeploySnapshot{}
	for _, s := range c.Snapshot() {
		got[s.TargetType] = s
	}

	// Require both buckets to exist: a missing key would read as a
	// zero-value DeploySnapshot and make the zero checks below pass
	// without proving anything.
	nginx, ok := got["nginx"]
	if !ok {
		t.Fatalf("nginx bucket missing from snapshot")
	}
	apache, ok := got["apache"]
	if !ok {
		t.Fatalf("apache bucket missing from snapshot")
	}

	// The ticks must have landed where they were aimed...
	if nginx.AttemptsSuccess != 1 {
		t.Errorf("nginx AttemptsSuccess = %d, want 1", nginx.AttemptsSuccess)
	}
	if apache.ReloadFailures != 1 {
		t.Errorf("apache ReloadFailures = %d, want 1", apache.ReloadFailures)
	}

	// ...and nowhere else.
	if nginx.ReloadFailures != 0 {
		t.Errorf("nginx ReloadFailures bled across: got %d", nginx.ReloadFailures)
	}
	if apache.AttemptsSuccess != 0 {
		t.Errorf("apache AttemptsSuccess bled across: got %d", apache.AttemptsSuccess)
	}
}
// TestDeployCounters_StableSnapshot checks that Snapshot returns a
// copy: mutating the returned slice must NOT affect the underlying
// counters.
func TestDeployCounters_StableSnapshot(t *testing.T) {
	c := NewDeployCounters()
	c.IncAttemptSuccess("nginx")

	snap := c.Snapshot()
	// Guard the index below so a regression fails this test cleanly
	// instead of panicking and aborting the whole test binary.
	if len(snap) != 1 {
		t.Fatalf("snapshot len = %d, want 1 (nginx)", len(snap))
	}
	snap[0].AttemptsSuccess = 999

	again := c.Snapshot()
	if len(again) != 1 {
		t.Fatalf("second snapshot len = %d, want 1 (nginx)", len(again))
	}
	if again[0].AttemptsSuccess != 1 {
		t.Errorf("counter mutated through snapshot: got %d", again[0].AttemptsSuccess)
	}
}