mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 18:01:37 +00:00
8191b1ee64
Phase 6 of the certctl architecture diligence remediation. Five
findings across the same scheduler-and-DB-pool surface.
SCALE-M1 (Med) — DB pool default bumped 25 → 50
internal/config/config.go line 1972:
MaxConnections: getEnvInt("CERTCTL_DATABASE_MAX_CONNS", 50)
Postgres default max_connections is 100; 50 leaves headroom for
pg_dump + ad-hoc psql + a server replica without exhausting the
DB-side cap. Operator override env var unchanged. Operator-tune
ladder for larger fleets (5K / 50K certs) lives in
docs/operator/scale.md as starter values pending Phase 8 load
tests — explicitly marked TBD.
SCALE-M3 (Med) — async-CA poll budget operator-configurable
This was already partially shipped: all 4 async-CA
connectors (digicert, entrust, globalsign, sectigo) already have
per-connector CERTCTL_<NAME>_POLL_MAX_WAIT_SECONDS (Audit fix #5
closed pre-Phase-6). What was missing: a global package-default
override. Shipped:
- internal/connector/issuer/asyncpoll/asyncpoll.go gains
SetDefaultMaxWait(d) + effectiveDefaultMaxWait var + the
currentDefaultMaxWait() priority resolver.
- cmd/server/main.go reads CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS
at boot and calls SetDefaultMaxWait.
- deploy/ENVIRONMENTS.md documents the new env var (G-3 guard
green).
Naming deviation from the prompt's CERTCTL_ASYNC_POLL_MAX_ATTEMPTS:
the live code tracks wall-clock time (MaxWait), not attempt count.
Matched the existing per-connector nomenclature (_POLL_MAX_WAIT_SECONDS)
so the priority chain reads naturally.
SCALE-M5 (Med) — JitteredTicker wrapper for all 15 scheduler loops
internal/scheduler/jitter.go ships NewJitteredTicker(interval,
jitterPct) + DefaultSchedulerJitter (±10%). All 15 sites in
internal/scheduler/scheduler.go migrated from bare time.NewTicker
to NewJitteredTicker(interval, DefaultSchedulerJitter). Base
intervals unchanged; only the per-tick envelope adds ±10%
randomized delay so multiple loops with the same nominal cadence
don't co-fire and spike CPU + DB at wall-clock boundaries.
internal/scheduler/jitter_test.go pins:
- Bounded envelope (each tick within ±jitterPct of interval)
- Mean drift < 30% of nominal (sign-bug detector)
- Stop() releases the goroutine + closes C
- Stop() idempotent (no panic on repeat)
- Zero-jitter behaves like time.NewTicker
- Negative and >=1 jitterPct values clamped defensively
CI guard scripts/ci-guards/no-bare-newticker-in-scheduler.sh blocks
any future bare time.NewTicker in scheduler.go.
SCALE-L1 (Low) — renewal-sweep semaphore behavior documented
docs/operator/scale.md "Scheduler tick budgets" section explains
the per-tick concurrency semaphore (CERTCTL_RENEWAL_CONCURRENCY=25
default), the ctx-cancellation drain on tick-budget overrun, and
operator tuning advice (raise concurrency + DB pool together).
No code change — the behavior is defensible as-is per the audit.
SCALE-L2 (Low) — ETag middleware for top-5 read endpoints
internal/api/middleware/etag.go computes SHA-256 ETag over the
buffered response body, respects If-None-Match, short-circuits
to 304 Not Modified on match. GET/HEAD only; non-2xx responses
pass through unchanged. 64 KiB buffer cap degrades gracefully on
oversized responses (no caching, body still flushes intact).
Wired around the top-5 read endpoints via etagged() helper in
internal/api/router/router.go:
GET /api/v1/certificates
GET /api/v1/agents
GET /api/v1/jobs
GET /api/v1/audit
GET /api/v1/discovered-certificates
internal/api/middleware/etag_test.go pins 11 behaviors including
304-on-repeat, 200-after-mutation-with-new-ETag, POST bypass,
4xx/5xx pass-through, oversized-response degradation, wildcard
match, HEAD-treated-like-GET, byte-equal pass-through.
Cross-cutting fixes:
- internal/config/config_test.go::TestLoad_DefaultValues updated
to assert the new 50 default (was 25).
- deploy/helm/certctl/values.yaml comment corrected — agent
pollInterval is hardcoded 30s, not env-configurable; the
Phase 4 comment mistakenly referenced CERTCTL_AGENT_POLL_INTERVAL
which G-3 caught as a phantom env var.
- asyncpoll.go reformatted by gofmt; functionally unchanged.
Verification (all pass):
grep -nE 'SetMaxOpenConns' internal/repository/postgres/db.go # finds 1 site
grep -nE 'CERTCTL_DATABASE_MAX_CONNS.*50' internal/config/config.go # config default is 50
grep -rnE 'CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS' internal/ deploy/ENVIRONMENTS.md # wired
grep -cE 'time\.NewTicker\(' internal/scheduler/scheduler.go # 0 (all migrated)
grep -cE 'JitteredTicker' internal/scheduler/scheduler.go # 15
ls internal/scheduler/jitter.go internal/api/middleware/etag.go # both exist
ls docs/operator/scale.md # exists
bash scripts/ci-guards/no-bare-newticker-in-scheduler.sh # clean
bash scripts/ci-guards/G-3-env-docs-drift.sh # clean
go test ./internal/scheduler/ ./internal/api/middleware/ \
./internal/connector/issuer/asyncpoll/ ./internal/config/ # 4/4 packages green
Closes: cowork/certctl-architecture-diligence-audit.html#fix-SCALE-M1
cowork/certctl-architecture-diligence-audit.html#fix-SCALE-M3
cowork/certctl-architecture-diligence-audit.html#fix-SCALE-M5
cowork/certctl-architecture-diligence-audit.html#fix-SCALE-L1
cowork/certctl-architecture-diligence-audit.html#fix-SCALE-L2
199 lines
5.7 KiB
Go
199 lines
5.7 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
||
// SPDX-License-Identifier: BUSL-1.1
|
||
|
||
package scheduler
|
||
|
||
import (
|
||
"math"
|
||
"testing"
|
||
"time"
|
||
)
|
||
|
||
// Phase 6 SCALE-M5 contract pin (2026-05-14): JitteredTicker fires
|
||
// ~interval per tick with a bounded ±jitterPct envelope. The tests
|
||
// below are timing-sensitive but use generous tolerances + averaging
|
||
// across many ticks to stay stable under CI load.
|
||
|
||
func TestJitteredTicker_BoundedEnvelope(t *testing.T) {
|
||
const (
|
||
interval = 20 * time.Millisecond
|
||
jitterPct = 0.20 // ±20%
|
||
ticks = 30
|
||
)
|
||
|
||
jt := NewJitteredTicker(interval, jitterPct)
|
||
defer jt.Stop()
|
||
|
||
last := time.Now()
|
||
for i := 0; i < ticks; i++ {
|
||
select {
|
||
case now := <-jt.C:
|
||
gap := now.Sub(last)
|
||
last = now
|
||
|
||
// Bounded envelope: every tick should fall within
|
||
// [interval × (1-jitter), interval × (1+jitter)] plus a
|
||
// generous scheduling-slop tolerance for the test
|
||
// runtime. The first tick is allowed wider slop since
|
||
// goroutine startup may eat into the first interval.
|
||
minGap := time.Duration(float64(interval) * (1 - jitterPct))
|
||
maxGap := time.Duration(float64(interval)*(1+jitterPct)) + 50*time.Millisecond
|
||
if i == 0 {
|
||
minGap = 0 // first tick can land arbitrarily fast under CI scheduling pressure
|
||
}
|
||
|
||
if gap < minGap || gap > maxGap {
|
||
t.Errorf("tick %d gap=%v outside envelope [%v, %v]", i, gap, minGap, maxGap)
|
||
}
|
||
case <-time.After(5 * interval):
|
||
t.Fatalf("tick %d timed out (>5×interval); JitteredTicker stuck", i)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestJitteredTicker_MeanCloseToInterval(t *testing.T) {
|
||
// Statistical pin: across many ticks the mean gap should be
|
||
// reasonably close to the nominal interval. Larger deviations
|
||
// indicate the jitter draw is biased (e.g. only producing
|
||
// positive deltas because of a sign bug — mean would drift to
|
||
// interval × 1.3 instead of staying near interval × 1.0).
|
||
//
|
||
// The 50ms interval + 50-tick sample is chosen so per-scheduler-
|
||
// quantum jitter (~1ms on Linux) is < 2% of the interval; the
|
||
// 30% bound below is generous enough for CI scheduling noise
|
||
// while still catching sign bugs (which would push mean drift
|
||
// past 30% trivially).
|
||
const (
|
||
interval = 50 * time.Millisecond
|
||
jitterPct = 0.30
|
||
ticks = 50
|
||
)
|
||
|
||
jt := NewJitteredTicker(interval, jitterPct)
|
||
defer jt.Stop()
|
||
|
||
gaps := make([]time.Duration, 0, ticks)
|
||
last := time.Now()
|
||
|
||
for i := 0; i < ticks; i++ {
|
||
select {
|
||
case now := <-jt.C:
|
||
if i > 0 { // skip first gap (goroutine warmup)
|
||
gaps = append(gaps, now.Sub(last))
|
||
}
|
||
last = now
|
||
case <-time.After(5 * interval):
|
||
t.Fatalf("tick %d timed out", i)
|
||
}
|
||
}
|
||
|
||
var sum time.Duration
|
||
for _, g := range gaps {
|
||
sum += g
|
||
}
|
||
mean := sum / time.Duration(len(gaps))
|
||
|
||
// Sign-bug threshold: a healthy jittered ticker should produce
|
||
// mean ≈ interval (mean drift < 10%). A sign bug (e.g.
|
||
// always-positive jitter) shifts mean to interval × (1 +
|
||
// jitterPct / 2) = +15%. 30% bound catches that while
|
||
// tolerating CI scheduling noise + the (1 - x) vs (1 + x)
|
||
// asymmetry of multiplicative jitter.
|
||
driftPct := math.Abs(float64(mean-interval)) / float64(interval)
|
||
if driftPct > 0.30 {
|
||
t.Errorf("mean gap %v drifts %.1f%% from nominal interval %v (>30%% threshold)", mean, driftPct*100, interval)
|
||
}
|
||
}
|
||
|
||
func TestJitteredTicker_Stop_ReleasesGoroutine(t *testing.T) {
|
||
jt := NewJitteredTicker(50*time.Millisecond, 0.10)
|
||
|
||
// Stop immediately, before any tick fires.
|
||
jt.Stop()
|
||
|
||
// C should close within one tick interval. If it doesn't, the
|
||
// goroutine is stuck (which would leak in production).
|
||
select {
|
||
case _, ok := <-jt.C:
|
||
if ok {
|
||
// A tick fired before C closed — also acceptable, but
|
||
// drain it and re-check that close follows.
|
||
select {
|
||
case _, ok2 := <-jt.C:
|
||
if ok2 {
|
||
t.Errorf("JitteredTicker.C still emitting after Stop()")
|
||
}
|
||
case <-time.After(200 * time.Millisecond):
|
||
t.Errorf("JitteredTicker.C did not close after Stop()")
|
||
}
|
||
}
|
||
case <-time.After(200 * time.Millisecond):
|
||
t.Errorf("JitteredTicker.C did not close within 200ms of Stop()")
|
||
}
|
||
}
|
||
|
||
func TestJitteredTicker_Stop_Idempotent(t *testing.T) {
|
||
jt := NewJitteredTicker(50*time.Millisecond, 0.10)
|
||
|
||
// Multiple Stop() calls must not panic.
|
||
jt.Stop()
|
||
jt.Stop()
|
||
jt.Stop()
|
||
}
|
||
|
||
func TestJitteredTicker_ZeroJitter_BehavesLikeTicker(t *testing.T) {
|
||
// jitterPct=0 reduces to a deterministic ticker. The mean
|
||
// should be exactly the interval (modulo scheduling noise).
|
||
const (
|
||
interval = 20 * time.Millisecond
|
||
ticks = 10
|
||
)
|
||
|
||
jt := NewJitteredTicker(interval, 0)
|
||
defer jt.Stop()
|
||
|
||
last := time.Now()
|
||
for i := 0; i < ticks; i++ {
|
||
select {
|
||
case now := <-jt.C:
|
||
gap := now.Sub(last)
|
||
last = now
|
||
// Allow generous slop for CI scheduling.
|
||
if i > 0 && (gap < interval/2 || gap > interval*3) {
|
||
t.Errorf("zero-jitter tick %d gap=%v far from interval=%v", i, gap, interval)
|
||
}
|
||
case <-time.After(5 * interval):
|
||
t.Fatalf("zero-jitter tick %d timed out", i)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestJitteredTicker_NegativeJitter_TreatedAsZero(t *testing.T) {
|
||
// Defensive: negative jitterPct should not produce
|
||
// negative-duration timers (which would panic time.NewTimer).
|
||
jt := NewJitteredTicker(10*time.Millisecond, -0.5)
|
||
defer jt.Stop()
|
||
|
||
// Just confirm at least one tick fires without panic.
|
||
select {
|
||
case <-jt.C:
|
||
// ok
|
||
case <-time.After(100 * time.Millisecond):
|
||
t.Errorf("negative-jitter ticker produced no tick within 100ms")
|
||
}
|
||
}
|
||
|
||
func TestJitteredTicker_LargeJitter_ClampedBelowOne(t *testing.T) {
|
||
// Defensive: jitterPct≥1 would otherwise allow next=0 and panic
|
||
// time.NewTimer. Confirm the ticker still fires.
|
||
jt := NewJitteredTicker(10*time.Millisecond, 1.5)
|
||
defer jt.Stop()
|
||
|
||
select {
|
||
case <-jt.C:
|
||
// ok
|
||
case <-time.After(100 * time.Millisecond):
|
||
t.Errorf("over-clamped-jitter ticker produced no tick within 100ms")
|
||
}
|
||
}
|