mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 16:21:30 +00:00
c4ed3da30b
Sprint 6 push (commits 43836ac+663b14b) tripped three CI guards. Fixing all three in this single follow-up — each is a small, mechanical correction that doesn't change behavior: 1. staticcheck ST1021: AuditChainSnapshot doc comment was on the wrong type. internal/service/audit_chain_metric.go:91 had: // Snapshot returns the current counter state for the Prometheus // exposer. Reads use atomic loads — no mutex. type AuditChainSnapshot struct { ... } The comment described Snapshot() (the method on AuditChainCounter) but sat directly above the AuditChainSnapshot struct. staticcheck ST1021 requires exported-type comments to start with the type's name + optional leading article. Rewrote to lead with "AuditChainSnapshot is the point-in-time view ...". 2. multi-tenant-query-coverage: baseline drifted 31 → 32 because Sprint 6 COMP-002-RETENTION added UserRepository.ListDeactivatedBefore at internal/repository/postgres/user.go:191 — legitimately tenant-spanning by design. The retention policy is control-plane-wide (one CERTCTL_USER_RETENTION_WINDOW for the whole deployment, not per-tenant). The scheduler's userRetentionLoop walks every tenant's deactivated users on the same tick. A per-tenant tenant_id filter would require the scheduler to iterate every tenant — more code for equivalent semantics. Per the guard's own documentation (option b), legitimately tenant-spanning queries get an inline rationale comment + a baseline lift. Both delivered: - Inline comment block on the SELECT in user.go::ListDeactivatedBefore. - BASELINE_COUNT 31 → 32 in scripts/ci-guards/multi-tenant-query-coverage.sh, with the Sprint 6 rebase entry added to the rebase-history comment. 3. skip-inventory-drift: docs/testing/skip-inventory.md was stale. COMP-001-HASH added three new t.Skip sites in internal/repository/postgres/audit_chain_test.go (the three testing.Short() gates on the testcontainers integration tests). 
Re-ran ./scripts/skip-inventory.sh to regenerate the doc — totals went from 144 → 147 sites + 78 → 82 short-mode guards. Verified locally: bash scripts/ci-guards/multi-tenant-query-coverage.sh (clean) bash scripts/ci-guards/skip-inventory-drift.sh (clean) go vet ./... (clean) staticcheck ./internal/service/... (clean) Closes the three Sprint 6 CI failures. The next CI run should green out.
120 lines
4.4 KiB
Go
120 lines
4.4 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package service
|
|
|
|
import (
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
// AuditChainCounter is the metric-side companion to the Sprint 6
// COMP-001-HASH chain verifier. The scheduler's auditChainVerifyLoop
// calls RecordSuccess on every clean walk and RecordBreak on
// detection; the Prometheus metrics handler reads the state back
// through Snapshot.
//
// Wire shape:
//
//	scheduler.AuditChainVerifier      → *postgres.AuditRepository
//	                                    (calls audit_events_verify_chain SQL func)
//	scheduler.AuditChainBreakRecorder → *AuditChainCounter (this file)
//	handler.MetricsHandler            → reads Snapshot()
//
// Three metrics get surfaced (matching the existing
// /api/v1/metrics/prometheus naming conventions):
//
//	certctl_audit_chain_break_detected_total  counter (cumulative)
//	certctl_audit_chain_verify_total          counter (every walk)
//	certctl_audit_chain_rows                  gauge   (last walk's row count)
//
// Plus three info-label fields (broken_at_id, broken_at_pos,
// last_verified_at_unix) so operators can render a
// "last walk: clean, 1.2M rows, T-37m" panel.
//
// Every field is an atomic, so writes from the scheduler goroutine and
// reads from the HTTP handler goroutine need no mutex: the numeric
// fields use the typed atomic.Uint64 / atomic.Int64, and the string
// field brokenAtID lives in an atomic.Value that must only ever hold a
// string. Each field is individually atomic; the struct as a whole is
// not updated or read as a single transaction.
//
// Build instances with NewAuditChainCounter, which seeds the sticky
// break fields, and share them by pointer — the embedded atomics must
// not be copied after first use.
type AuditChainCounter struct {
	breaksDetected atomic.Uint64 // walks that reported a break (cumulative)
	walksCompleted atomic.Uint64 // every walk, clean or broken
	lastRowCount   atomic.Uint64 // row count of the most recent clean walk
	lastVerifiedAt atomic.Int64  // unix seconds; 0 = never

	// brokenAtID / brokenAtPos are sticky — they record the *first*
	// detected break, not the most recent walk's data. Operators
	// reset by restarting the process (or a future Phase 2 reset
	// endpoint behind auth.audit.admin).
	brokenAtID  atomic.Value // string; "" = no break recorded
	brokenAtPos atomic.Int64
}
|
|
|
|
// NewAuditChainCounter returns a zero-state counter. Wire from
|
|
// cmd/server/main.go and pass to both the scheduler
|
|
// (SetAuditChainBreakRecorder) and the metrics handler
|
|
// (SetAuditChainCounter).
|
|
func NewAuditChainCounter() *AuditChainCounter {
|
|
c := &AuditChainCounter{}
|
|
c.brokenAtID.Store("")
|
|
c.brokenAtPos.Store(-1)
|
|
return c
|
|
}
|
|
|
|
// RecordSuccess marks a clean walk. The scheduler calls this on every
|
|
// tick where VerifyHashChain returned brokenAtID == "".
|
|
func (c *AuditChainCounter) RecordSuccess(rowCount int) {
|
|
c.walksCompleted.Add(1)
|
|
if rowCount < 0 {
|
|
rowCount = 0
|
|
}
|
|
c.lastRowCount.Store(uint64(rowCount))
|
|
c.lastVerifiedAt.Store(time.Now().Unix())
|
|
}
|
|
|
|
// RecordBreak marks a detected break. Sticky: subsequent breaks do not
|
|
// overwrite the (brokenAtID, brokenAtPos) fields — the first detection
|
|
// is the actionable signal. The breaksDetected counter still
|
|
// increments on every observation so operators can tell whether the
|
|
// tampering is ongoing or one-shot.
|
|
func (c *AuditChainCounter) RecordBreak(brokenAtID string, brokenAtPos int) {
|
|
c.breaksDetected.Add(1)
|
|
c.walksCompleted.Add(1)
|
|
c.lastVerifiedAt.Store(time.Now().Unix())
|
|
// Sticky-first-detection — only record if the field is still empty.
|
|
if cur, _ := c.brokenAtID.Load().(string); cur == "" {
|
|
c.brokenAtID.Store(brokenAtID)
|
|
c.brokenAtPos.Store(int64(brokenAtPos))
|
|
}
|
|
}
|
|
|
|
// AuditChainSnapshot is a plain-value copy of an AuditChainCounter's
// state at the moment Snapshot() was called. The metrics handler
// renders each field into Prometheus exposition format. Because it
// holds only ordinary values, it can be copied and compared freely.
type AuditChainSnapshot struct {
	// BreaksDetected is the cumulative number of walks that reported
	// a break.
	BreaksDetected uint64
	// WalksCompleted is the cumulative number of walks, clean or
	// broken.
	WalksCompleted uint64
	// LastRowCount is the row count stored by the most recent clean
	// walk.
	LastRowCount uint64
	// LastVerifiedAtUnix is 0 if the loop has never run; otherwise the
	// unix-epoch second of the most recent walk (clean or break).
	LastVerifiedAtUnix int64
	// BrokenAtID is "" if no break has ever been recorded.
	BrokenAtID string
	// BrokenAtPos is the position reported with the first recorded
	// break.
	BrokenAtPos int64
}
|
|
|
|
// Snapshot returns a point-in-time view of every counter. The metrics
|
|
// handler renders this into Prometheus exposition format.
|
|
func (c *AuditChainCounter) Snapshot() AuditChainSnapshot {
|
|
id, _ := c.brokenAtID.Load().(string)
|
|
return AuditChainSnapshot{
|
|
BreaksDetected: c.breaksDetected.Load(),
|
|
WalksCompleted: c.walksCompleted.Load(),
|
|
LastRowCount: c.lastRowCount.Load(),
|
|
LastVerifiedAtUnix: c.lastVerifiedAt.Load(),
|
|
BrokenAtID: id,
|
|
BrokenAtPos: c.brokenAtPos.Load(),
|
|
}
|
|
}
|