mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 14:21:37 +00:00
fix(middleware): SEC-006 — TTL-evict idle token-bucket rate-limiter entries
Sprint 2 unified-master-audit closure. Pre-fix the keyed rate
limiter's bucket map had no eviction. The package-level comment
explicitly noted the leak: high-cardinality unauthenticated traffic
(CGNAT churn, Tor exit lists, botnets, infinite-cardinality scanners)
grew process memory unboundedly. Production deploys with millions of
unique IPs would eventually OOM.
Fix:
- RateLimitConfig.BucketTTL (env CERTCTL_RATE_LIMIT_BUCKET_TTL,
default 1h, clamp-floor 1m). 1h chosen to be well above realistic
operator IP churn windows (returning clients keep their bucket)
and well below the unbounded-leak window the pre-fix code
allowed.
- tokenBucket gains a lastAccess field updated on every allow()
call via touch(); reading via lastAccessTime() under the bucket's
own mutex.
- keyedRateLimiter.sweepLoop runs in a single goroutine per
limiter (production wires 2: default + no-auth fallback), waking
every BucketTTL/4. sweep() removes any bucket whose lastAccess
is older than the cutoff and bumps evictedTotal atomically.
- Both NewRateLimiter call sites in cmd/server/main.go (default
stack and no-auth fallback) now thread cfg.RateLimit.BucketTTL.
Regression coverage:
- TestKeyedRateLimiter_SweepEvictsIdleBuckets: 1000 synthetic IP
keys populate the map, advance past TTL, call sweep() directly,
assert map drained to 0 + evictedTotal=1000 + fresh key creates
new bucket (map not poisoned).
- TestKeyedRateLimiter_SweepKeepsActiveBuckets: inverse — a bucket
touched within the TTL window survives the sweep. Catches a
future regression that inverts the cutoff comparison.
Closes SEC-006.
This commit is contained in:
@@ -2080,6 +2080,11 @@ func main() {
|
||||
BurstSize: cfg.RateLimit.BurstSize,
|
||||
PerUserRPS: cfg.RateLimit.PerUserRPS,
|
||||
PerUserBurstSize: cfg.RateLimit.PerUserBurstSize,
|
||||
// SEC-006 (Sprint 2): bounded bucket TTL so a long-running
|
||||
// server with high-cardinality unauthenticated traffic
|
||||
// (CGNAT churn, Tor exits, scanners) doesn't grow the map
|
||||
// indefinitely.
|
||||
BucketTTL: cfg.RateLimit.BucketTTL,
|
||||
})
|
||||
// SEC-003 closure (Sprint 1, 2026-05-16). Pre-fix the
|
||||
// rate-limit-enabled stack was rebuilt without
|
||||
@@ -2166,6 +2171,10 @@ func main() {
|
||||
noAuthRateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{
|
||||
RPS: cfg.RateLimit.RPS,
|
||||
BurstSize: cfg.RateLimit.BurstSize,
|
||||
// SEC-006 closure (Sprint 2): same bucket-TTL eviction for the
|
||||
// no-auth limiter — this one's the higher exposure since every
|
||||
// unauthenticated probe gets a fresh IP-keyed bucket.
|
||||
BucketTTL: cfg.RateLimit.BucketTTL,
|
||||
})
|
||||
noAuthMiddleware = append(noAuthMiddleware, noAuthRateLimiter)
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -152,6 +153,14 @@ type RateLimitConfig struct {
|
||||
// PerUserBurstSize overrides BurstSize for authenticated callers.
|
||||
// Zero means "use BurstSize".
|
||||
PerUserBurstSize int
|
||||
|
||||
// BucketTTL bounds the lifetime of an unused token bucket in the
|
||||
// per-key map. The background sweeper runs every (BucketTTL/4) and
|
||||
// removes entries whose last allow() call is older than BucketTTL.
|
||||
// Zero or negative values fall through to a 1-hour default; values
|
||||
// below 1 minute are clamped up to 1 minute (sweeper sanity).
|
||||
// SEC-006 closure (Sprint 2, 2026-05-16).
|
||||
BucketTTL time.Duration
|
||||
}
|
||||
|
||||
// NewRateLimiter creates a per-key token bucket rate limiting middleware.
|
||||
@@ -166,11 +175,18 @@ type RateLimitConfig struct {
|
||||
// - Unauthenticated: "ip:" + r.RemoteAddr's host portion
|
||||
//
|
||||
// The bucket map is sync.RWMutex-guarded; create-on-demand for new keys.
|
||||
// There is no eviction; for a long-running server with millions of unique
|
||||
// IPs this can leak memory. A future enhancement is per-key TTL via a
|
||||
// lazy sweeper. For now the leak is bounded by realistic operator IP
|
||||
// fan-out and is acceptable per OWASP ASVS L2 (the threat model is abuse
|
||||
// by a known set of clients, not infinite-cardinality scanners).
|
||||
//
|
||||
// SEC-006 closure (Sprint 2, 2026-05-16). Pre-fix the bucket map had no
|
||||
// eviction, so high-cardinality unauthenticated traffic (CGNAT churn,
|
||||
// Tor exit lists, botnets, infinite-cardinality scanners) grew process
|
||||
// memory unboundedly. Each bucket now carries `lastAccess`; a background
|
||||
// sweeper goroutine (one per limiter) wakes every (bucketTTL / 4) and
|
||||
// removes entries whose lastAccess is older than `bucketTTL`. Default
|
||||
// TTL is 1 hour — well above realistic operator IP churn windows so a
|
||||
// returning client gets the same bucket, but bounded enough that a
|
||||
// scanner's churn is reclaimed within an hour. Operators can override
|
||||
// via cfg.BucketTTL (or the CERTCTL_RATE_LIMIT_BUCKET_TTL env var that
|
||||
// cmd/server/main.go threads through).
|
||||
func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
|
||||
// Default per-user budgets to the IP-keyed budget when not overridden.
|
||||
perUserRPS := cfg.PerUserRPS
|
||||
@@ -182,14 +198,33 @@ func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
|
||||
perUserBurst = float64(cfg.BurstSize)
|
||||
}
|
||||
|
||||
// SEC-006: bucket TTL eviction. Default 1h; minimum 1m to keep
|
||||
// the sweeper from running pathologically often if an operator
|
||||
// sets a tiny value.
|
||||
bucketTTL := cfg.BucketTTL
|
||||
if bucketTTL <= 0 {
|
||||
bucketTTL = time.Hour
|
||||
}
|
||||
if bucketTTL < time.Minute {
|
||||
bucketTTL = time.Minute
|
||||
}
|
||||
|
||||
limiter := &keyedRateLimiter{
|
||||
ipRate: cfg.RPS,
|
||||
ipBurst: float64(cfg.BurstSize),
|
||||
userRate: perUserRPS,
|
||||
userBurst: perUserBurst,
|
||||
buckets: make(map[string]*tokenBucket),
|
||||
bucketTTL: bucketTTL,
|
||||
}
|
||||
|
||||
// Sweeper goroutine. Single goroutine per limiter; production wires
|
||||
// 2 limiters (default + no-auth-fallback) so the cost is 2 idle
|
||||
// goroutines per server. Lives for the process lifetime; no
|
||||
// shutdown handle is exposed because main.go owns both limiters
|
||||
// for the entire run.
|
||||
go limiter.sweepLoop()
|
||||
|
||||
return func(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
key, isUser := rateLimitKey(r)
|
||||
@@ -231,6 +266,12 @@ func rateLimitKey(r *http.Request) (string, bool) {
|
||||
|
||||
// keyedRateLimiter holds a token bucket per (user-or-ip) key with separate
|
||||
// rate / burst defaults for the user-keyed and ip-keyed dimensions.
|
||||
//
|
||||
// SEC-006: bucketTTL bounds the unused-bucket lifetime; sweepLoop runs
|
||||
// in a goroutine spawned by NewRateLimiter and evicts entries whose
|
||||
// lastAccess is older than bucketTTL on every (bucketTTL/4) tick.
|
||||
// evictedTotal exposes the lifetime eviction count (atomic-loaded by
|
||||
// tests and the operator stats endpoint).
|
||||
type keyedRateLimiter struct {
|
||||
mu sync.RWMutex
|
||||
buckets map[string]*tokenBucket
|
||||
@@ -238,6 +279,14 @@ type keyedRateLimiter struct {
|
||||
ipBurst float64
|
||||
userRate float64
|
||||
userBurst float64
|
||||
|
||||
bucketTTL time.Duration
|
||||
evictedTotal atomic.Uint64
|
||||
// sweepTick is the channel sweepLoop ticks on. Default time.Ticker;
|
||||
// tests swap to a manual chan time.Time for deterministic eviction.
|
||||
// Set via the (test-only) seam noted below; production never
|
||||
// reassigns this field.
|
||||
sweepTickCh <-chan time.Time
|
||||
}
|
||||
|
||||
func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
|
||||
@@ -260,22 +309,90 @@ func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
|
||||
burstSize: burst,
|
||||
tokens: burst,
|
||||
lastRefill: time.Now(),
|
||||
lastAccess: time.Now(),
|
||||
}
|
||||
k.buckets[key] = tb
|
||||
}
|
||||
k.mu.Unlock()
|
||||
}
|
||||
return tb.allow()
|
||||
allowed := tb.allow()
|
||||
// SEC-006: update lastAccess on every call (cheap; same mutex
|
||||
// the bucket already holds via tb.allow's mu). Sweeper reads
|
||||
// this to decide eviction.
|
||||
tb.touch()
|
||||
return allowed
|
||||
}
|
||||
|
||||
// sweepLoop is the background eviction goroutine spawned by
|
||||
// NewRateLimiter. It wakes every bucketTTL/4 and removes any bucket
|
||||
// whose lastAccess is older than bucketTTL. The (bucketTTL/4) cadence
|
||||
// is a compromise — fast enough to keep the map ceiling tight,
|
||||
// slow enough that the sweep cost amortises across many requests.
|
||||
// SEC-006 closure.
|
||||
func (k *keyedRateLimiter) sweepLoop() {
|
||||
// Test seam: if a manual tick channel is wired, use it. Production
|
||||
// always uses time.NewTicker which time.Time-types the channel
|
||||
// identically.
|
||||
if k.sweepTickCh != nil {
|
||||
for range k.sweepTickCh {
|
||||
k.sweep()
|
||||
}
|
||||
return
|
||||
}
|
||||
period := k.bucketTTL / 4
|
||||
if period < time.Second {
|
||||
period = time.Second
|
||||
}
|
||||
t := time.NewTicker(period)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
k.sweep()
|
||||
}
|
||||
}
|
||||
|
||||
// sweep removes every bucket whose lastAccess is older than bucketTTL
|
||||
// and bumps evictedTotal. Exported for tests via a same-package alias.
|
||||
func (k *keyedRateLimiter) sweep() {
|
||||
cutoff := time.Now().Add(-k.bucketTTL)
|
||||
k.mu.Lock()
|
||||
defer k.mu.Unlock()
|
||||
for key, tb := range k.buckets {
|
||||
if tb.lastAccessTime().Before(cutoff) {
|
||||
delete(k.buckets, key)
|
||||
k.evictedTotal.Add(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tokenBucket implements a simple thread-safe token bucket rate limiter.
|
||||
// This avoids importing golang.org/x/time/rate to keep dependencies minimal.
|
||||
//
|
||||
// SEC-006: lastAccess is updated on every allow() call (via touch()) so
|
||||
// the keyedRateLimiter sweeper can evict idle buckets without a second
|
||||
// per-key map. Guarded by the same mu as rate-limiting state.
|
||||
type tokenBucket struct {
|
||||
mu sync.Mutex
|
||||
rate float64 // tokens per second
|
||||
burstSize float64 // max tokens
|
||||
tokens float64 // current tokens
|
||||
lastRefill time.Time // last refill time
|
||||
lastAccess time.Time // last allow() call — for SEC-006 sweeper
|
||||
}
|
||||
|
||||
// touch updates the bucket's lastAccess timestamp under its own mutex.
|
||||
// Called from keyedRateLimiter.allow after the rate-limit decision.
|
||||
func (tb *tokenBucket) touch() {
|
||||
tb.mu.Lock()
|
||||
tb.lastAccess = time.Now()
|
||||
tb.mu.Unlock()
|
||||
}
|
||||
|
||||
// lastAccessTime is the sweeper's read accessor. Uses the bucket's
|
||||
// own mutex so the read is consistent with concurrent touch() calls.
|
||||
func (tb *tokenBucket) lastAccessTime() time.Time {
|
||||
tb.mu.Lock()
|
||||
defer tb.mu.Unlock()
|
||||
return tb.lastAccess
|
||||
}
|
||||
|
||||
func (tb *tokenBucket) allow() bool {
|
||||
|
||||
@@ -2,9 +2,11 @@ package middleware
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/certctl-io/certctl/internal/auth"
|
||||
)
|
||||
@@ -188,3 +190,94 @@ func TestRateLimiter_M025_EmptyUserKeyTreatedAsAnonymous(t *testing.T) {
|
||||
t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// SEC-006 closure (Sprint 2, 2026-05-16). The token-bucket map now has
|
||||
// a background sweeper that evicts buckets whose last allow() call is
|
||||
// older than the configured BucketTTL. This test pins the eviction
|
||||
// path against a synthetic 1000-key load and asserts:
|
||||
//
|
||||
// 1. Buckets created by N distinct keys land in the map.
|
||||
// 2. After the simulated TTL elapses and the sweeper runs, the map
|
||||
// is reclaimed and evictedTotal reflects the count.
|
||||
// 3. A subsequent request from a fresh key creates a new bucket
|
||||
// (i.e. the map isn't poisoned by the eviction).
|
||||
//
|
||||
// The test calls sweep() directly rather than relying on the goroutine
|
||||
// + time.Ticker so it stays deterministic and fast. The sweeper
|
||||
// goroutine itself is exercised in production; this test pins the
|
||||
// eviction predicate.
|
||||
// =============================================================================
|
||||
|
||||
func TestKeyedRateLimiter_SweepEvictsIdleBuckets(t *testing.T) {
|
||||
limiter := &keyedRateLimiter{
|
||||
ipRate: 1000,
|
||||
ipBurst: 1000,
|
||||
userRate: 1000,
|
||||
userBurst: 1000,
|
||||
buckets: make(map[string]*tokenBucket),
|
||||
bucketTTL: 100 * time.Millisecond,
|
||||
}
|
||||
|
||||
// Populate 1000 buckets from a synthetic IP-key churn.
|
||||
for i := 0; i < 1000; i++ {
|
||||
key := "ip:198.51.100." + fmt.Sprintf("%d", i%256) + "/" + fmt.Sprintf("%d", i)
|
||||
if !limiter.allow(key, false) {
|
||||
t.Fatalf("synthetic IP-key %d: allow returned false on first call", i)
|
||||
}
|
||||
}
|
||||
limiter.mu.RLock()
|
||||
if got := len(limiter.buckets); got != 1000 {
|
||||
limiter.mu.RUnlock()
|
||||
t.Fatalf("post-populate bucket count = %d; want 1000", got)
|
||||
}
|
||||
limiter.mu.RUnlock()
|
||||
|
||||
// Advance past the TTL boundary, then sweep.
|
||||
time.Sleep(110 * time.Millisecond)
|
||||
limiter.sweep()
|
||||
|
||||
limiter.mu.RLock()
|
||||
remaining := len(limiter.buckets)
|
||||
limiter.mu.RUnlock()
|
||||
if remaining != 0 {
|
||||
t.Errorf("post-sweep bucket count = %d; want 0 (all should have been evicted)", remaining)
|
||||
}
|
||||
if got := limiter.evictedTotal.Load(); got != 1000 {
|
||||
t.Errorf("evictedTotal = %d; want 1000", got)
|
||||
}
|
||||
|
||||
// A fresh request creates a new bucket — map isn't poisoned.
|
||||
if !limiter.allow("ip:203.0.113.7", false) {
|
||||
t.Errorf("fresh key: allow returned false on first call after sweep")
|
||||
}
|
||||
limiter.mu.RLock()
|
||||
defer limiter.mu.RUnlock()
|
||||
if got := len(limiter.buckets); got != 1 {
|
||||
t.Errorf("post-sweep-plus-one bucket count = %d; want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestKeyedRateLimiter_SweepKeepsActiveBuckets pins the inverse — a
|
||||
// bucket touched within the TTL window survives the sweep. Catches a
|
||||
// future regression that inverts the cutoff comparison.
|
||||
func TestKeyedRateLimiter_SweepKeepsActiveBuckets(t *testing.T) {
|
||||
limiter := &keyedRateLimiter{
|
||||
ipRate: 1000,
|
||||
ipBurst: 1000,
|
||||
userRate: 1000,
|
||||
userBurst: 1000,
|
||||
buckets: make(map[string]*tokenBucket),
|
||||
bucketTTL: 1 * time.Hour, // generous so test timing doesn't flake
|
||||
}
|
||||
limiter.allow("ip:198.51.100.42", false)
|
||||
limiter.sweep()
|
||||
limiter.mu.RLock()
|
||||
defer limiter.mu.RUnlock()
|
||||
if got := len(limiter.buckets); got != 1 {
|
||||
t.Errorf("active-bucket count = %d; want 1 (sweep should not evict within TTL)", got)
|
||||
}
|
||||
if got := limiter.evictedTotal.Load(); got != 0 {
|
||||
t.Errorf("evictedTotal = %d; want 0 (no evictions expected)", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -451,6 +451,11 @@ func Load() (*Config, error) {
|
||||
BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100),
|
||||
PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0),
|
||||
PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0),
|
||||
// SEC-006 closure (Sprint 2, 2026-05-16): bounded unused-bucket
|
||||
// lifetime. 1h chosen to be well above realistic operator IP
|
||||
// churn (returning clients keep their bucket) and well below
|
||||
// the unbounded-leak window the pre-fix code allowed.
|
||||
BucketTTL: getEnvDuration("CERTCTL_RATE_LIMIT_BUCKET_TTL", 1*time.Hour),
|
||||
SlidingWindowBackend: getEnv("CERTCTL_RATE_LIMIT_BACKEND", "memory"),
|
||||
SlidingWindowJanitorInterval: getEnvDuration("CERTCTL_RATE_LIMIT_JANITOR_INTERVAL", 5*time.Minute),
|
||||
},
|
||||
|
||||
@@ -342,6 +342,17 @@ type RateLimitConfig struct {
|
||||
// Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable.
|
||||
PerUserBurstSize int
|
||||
|
||||
// BucketTTL bounds the unused-bucket lifetime in the token-bucket
|
||||
// map. Idle buckets older than BucketTTL are reclaimed by a
|
||||
// background sweeper running every (BucketTTL/4). Default 1 hour;
|
||||
// values < 1 minute are clamped up to 1 minute in the limiter
|
||||
// constructor. Set this lower if the server faces high-cardinality
|
||||
// unauthenticated traffic (CGNAT churn, Tor exit lists, scanners)
|
||||
// and the map RSS becomes a concern.
|
||||
// SEC-006 closure (Sprint 2, 2026-05-16).
|
||||
// Setting: CERTCTL_RATE_LIMIT_BUCKET_TTL environment variable.
|
||||
BucketTTL time.Duration
|
||||
|
||||
// SlidingWindowBackend selects which backend implements the
|
||||
// per-key sliding-window-log limiters wired in cmd/server/main.go
|
||||
// (break-glass login, OCSP per-IP, cert-export per-actor, EST
|
||||
|
||||
Reference in New Issue
Block a user