From 183c56f6c585ff63942815ab611f15fa87adc891 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sat, 16 May 2026 04:01:59 +0000 Subject: [PATCH] =?UTF-8?q?fix(agent):=20SCALE-006=20=E2=80=94=20startup?= =?UTF-8?q?=20+=20recurring=20jitter=20on=20heartbeat=20and=20poll=20loops?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sprint 2 unified-master-audit closure. Before this fix, the agent started its heartbeat + poll loops on bare time.NewTicker cadence with no startup jitter: heartbeatTicker := time.NewTicker(a.heartbeatInterval) pollTicker := time.NewTicker(a.pollInterval) a.sendHeartbeat(ctx) // fires immediately, in lockstep a.pollForWork(ctx) // ditto A mass restart (rolling K8s deploy, control-plane reboot, scheduled fleet bounce) produced a thundering herd — 5K agents booting in a 10-second window all hit /heartbeat in lockstep, then /poll, every interval forever afterward. Fix: - Per-agent startup jitter ∈ [0, interval) drawn fresh from math/rand/v2 (no cryptographic strength needed) before the first heartbeat and first poll. The two waits run back-to-back (heartbeat wait, heartbeat, poll wait, poll), so the first poll lands after the sum of both jitters. Heartbeat and poll jitters are drawn independently so a single seed doesn't create a secondary correlation pattern. - time.NewTicker swapped for the existing in-tree internal/scheduler.JitteredTicker primitive (±10% per-tick envelope, fresh draw per tick to prevent drift compounding). Same pattern as every server-side scheduler.go loop. - Startup-jitter waits are ctx-aware (select on ctx.Done() vs. time.After) so a SIGINT during startup exits cleanly rather than hanging. The select cases that read heartbeatTicker.C / pollTicker.C are unchanged — JitteredTicker.C is a chan time.Time, identical shape to time.Ticker.C. Discovery ticker is left as bare time.NewTicker (audit didn't cite it; changing it would expand scope). Closes SCALE-006. 
--- cmd/agent/main.go | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/cmd/agent/main.go b/cmd/agent/main.go index efaa270..114b938 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -14,6 +14,7 @@ import ( "fmt" "io" "log/slog" + "math/rand/v2" "net" "net/http" "net/url" @@ -24,6 +25,8 @@ import ( "sync" "syscall" "time" + + "github.com/certctl-io/certctl/internal/scheduler" ) // AgentConfig represents the agent-side configuration. @@ -231,15 +234,49 @@ func (a *Agent) Run(ctx context.Context) error { a.logger.Warn("failed to enforce key directory permissions", "path", a.config.KeyDir, "error", err) } - // Create ticker channels for heartbeat, polling, and discovery - heartbeatTicker := time.NewTicker(a.heartbeatInterval) + // SCALE-006 closure (Sprint 2, 2026-05-16). Pre-fix the agent + // started its heartbeat + poll loops on fixed time.NewTicker + // cadence with an unjittered immediate first invocation. Mass + // restarts (rolling K8s deploy, control-plane reboot, scheduled + // fleet bounce) produced a thundering herd — 5K agents booting + // in a 10-second window all hit /heartbeat in lockstep, then + // /poll, every interval forever afterward. + // + // Fix: (1) sleep a random startup-jitter ∈ [0, interval) before + // the first heartbeat + first poll to spread the initial cohort, + // and (2) use scheduler.JitteredTicker (±10% per-tick envelope) + // for the recurring ticks so the cohort stays spread across + // every tick boundary. Both legs use the existing in-tree + // JitteredTicker primitive (internal/scheduler/jitter.go) — + // pattern already exercised by every scheduler.go loop on the + // server side. 
+ heartbeatTicker := scheduler.NewJitteredTicker(a.heartbeatInterval, scheduler.DefaultSchedulerJitter) defer heartbeatTicker.Stop() - - pollTicker := time.NewTicker(a.pollInterval) + pollTicker := scheduler.NewJitteredTicker(a.pollInterval, scheduler.DefaultSchedulerJitter) defer pollTicker.Stop() - // Run initial heartbeat and poll + // Startup jitter — run-first delay drawn fresh per-agent so a + // 5K-agent rolling-restart spreads out across (max interval). + // Bounded by ctx so a sigint-during-startup exits cleanly rather + // than hanging on the Sleep. Heartbeat and poll are drawn + // independently so a single random seed doesn't create a + // secondary correlation pattern. + hbJitter := time.Duration(rand.Int64N(int64(a.heartbeatInterval))) + pollJitter := time.Duration(rand.Int64N(int64(a.pollInterval))) + a.logger.Info("startup jitter applied", + "heartbeat_jitter", hbJitter.String(), + "poll_jitter", pollJitter.String()) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(hbJitter): + } a.sendHeartbeat(ctx) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(pollJitter): + } a.pollForWork(ctx) // Discovery: run initial scan if directories configured, then on interval