mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 14:21:37 +00:00
fix(agent): SCALE-006 — startup + recurring jitter on heartbeat and poll loops
Sprint 2 unified-master-audit closure. Before this fix, the agent
started its heartbeat and poll loops on a bare time.NewTicker cadence
with no startup jitter:
heartbeatTicker := time.NewTicker(a.heartbeatInterval)
pollTicker := time.NewTicker(a.pollInterval)
a.sendHeartbeat(ctx) // fires immediately, in lockstep
a.pollForWork(ctx) // ditto
A mass restart (rolling K8s deploy, control-plane reboot, scheduled
fleet bounce) produced a thundering herd — 5K agents booting in a
10-second window all hit /heartbeat in lockstep, then /poll, every
interval forever afterward.
Fix:
- Per-agent startup jitter ∈ [0, interval) drawn fresh from
math/rand/v2 (no cryptographic strength needed) before the first
heartbeat and first poll. Heartbeat and poll jitters are drawn
independently so a single seed doesn't create a secondary
correlation pattern.
- time.NewTicker swapped for the existing in-tree
internal/scheduler.JitteredTicker primitive (±10% per-tick
envelope, fresh draw per tick to prevent drift compounding).
Same pattern as every server-side scheduler.go loop.
- Startup-jitter waits are ctx-aware (select on ctx.Done() vs.
time.After), so a SIGINT during startup exits cleanly rather than hanging.
The select cases that read heartbeatTicker.C / pollTicker.C are
unchanged — JitteredTicker.C is a chan time.Time, identical shape
to time.Ticker.C.
Discovery ticker is left as bare time.NewTicker (audit didn't cite
it; changing it would expand scope).
Closes SCALE-006.
This commit is contained in:
+42
-5
@@ -14,6 +14,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"math/rand/v2"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
@@ -24,6 +25,8 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/scheduler"
|
||||||
)
|
)
|
||||||
|
|
||||||
// AgentConfig represents the agent-side configuration.
|
// AgentConfig represents the agent-side configuration.
|
||||||
@@ -231,15 +234,49 @@ func (a *Agent) Run(ctx context.Context) error {
|
|||||||
a.logger.Warn("failed to enforce key directory permissions", "path", a.config.KeyDir, "error", err)
|
a.logger.Warn("failed to enforce key directory permissions", "path", a.config.KeyDir, "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create ticker channels for heartbeat, polling, and discovery
|
// SCALE-006 closure (Sprint 2, 2026-05-16). Pre-fix the agent
|
||||||
heartbeatTicker := time.NewTicker(a.heartbeatInterval)
|
// started its heartbeat + poll loops on fixed time.NewTicker
|
||||||
|
// cadence with an unjittered immediate first invocation. Mass
|
||||||
|
// restarts (rolling K8s deploy, control-plane reboot, scheduled
|
||||||
|
// fleet bounce) produced a thundering herd — 5K agents booting
|
||||||
|
// in a 10-second window all hit /heartbeat in lockstep, then
|
||||||
|
// /poll, every interval forever afterward.
|
||||||
|
//
|
||||||
|
// Fix: (1) sleep a random startup-jitter ∈ [0, interval) before
|
||||||
|
// the first heartbeat + first poll to spread the initial cohort,
|
||||||
|
// and (2) use scheduler.JitteredTicker (±10% per-tick envelope)
|
||||||
|
// for the recurring ticks so the cohort stays spread across
|
||||||
|
// every tick boundary. The recurring leg uses the existing in-tree
|
||||||
|
// JitteredTicker primitive (internal/scheduler/jitter.go) —
|
||||||
|
// pattern already exercised by every scheduler.go loop on the
|
||||||
|
// server side.
|
||||||
|
heartbeatTicker := scheduler.NewJitteredTicker(a.heartbeatInterval, scheduler.DefaultSchedulerJitter)
|
||||||
defer heartbeatTicker.Stop()
|
defer heartbeatTicker.Stop()
|
||||||
|
pollTicker := scheduler.NewJitteredTicker(a.pollInterval, scheduler.DefaultSchedulerJitter)
|
||||||
pollTicker := time.NewTicker(a.pollInterval)
|
|
||||||
defer pollTicker.Stop()
|
defer pollTicker.Stop()
|
||||||
|
|
||||||
// Run initial heartbeat and poll
|
// Startup jitter — run-first delay drawn fresh per-agent so a
|
||||||
|
// 5K-agent rolling restart spreads out over up to one full interval.
|
||||||
|
// Bounded by ctx so a sigint-during-startup exits cleanly rather
|
||||||
|
// than hanging on the wait. Heartbeat and poll are drawn
|
||||||
|
// independently so a single random seed doesn't create a
|
||||||
|
// secondary correlation pattern.
|
||||||
|
hbJitter := time.Duration(rand.Int64N(int64(a.heartbeatInterval)))
|
||||||
|
pollJitter := time.Duration(rand.Int64N(int64(a.pollInterval)))
|
||||||
|
a.logger.Info("startup jitter applied",
|
||||||
|
"heartbeat_jitter", hbJitter.String(),
|
||||||
|
"poll_jitter", pollJitter.String())
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(hbJitter):
|
||||||
|
}
|
||||||
a.sendHeartbeat(ctx)
|
a.sendHeartbeat(ctx)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(pollJitter):
|
||||||
|
}
|
||||||
a.pollForWork(ctx)
|
a.pollForWork(ctx)
|
||||||
|
|
||||||
// Discovery: run initial scan if directories configured, then on interval
|
// Discovery: run initial scan if directories configured, then on interval
|
||||||
|
|||||||
Reference in New Issue
Block a user