fix(middleware): SEC-006 — TTL-evict idle token-bucket rate-limiter entries

Sprint 2 unified-master-audit closure. Pre-fix the keyed rate
limiter's bucket map had no eviction. The package-level comment
explicitly noted the leak: high-cardinality unauthenticated traffic
(CGNAT churn, Tor exit lists, botnets, infinite-cardinality scanners)
grew process memory unboundedly. Production deploys with millions of
unique IPs would eventually OOM.

Fix:
  - RateLimitConfig.BucketTTL (env CERTCTL_RATE_LIMIT_BUCKET_TTL,
    default 1h, clamp-floor 1m). 1h chosen to be well above realistic
    operator IP churn windows (returning clients keep their bucket)
    and well below the unbounded-leak window the pre-fix code
    allowed.
  - tokenBucket gains a lastAccess field updated on every allow()
    call via touch(); reading via lastAccessTime() under the bucket's
    own mutex.
  - keyedRateLimiter.sweepLoop runs in a single goroutine per
    limiter (production wires 2: default + no-auth fallback), waking
    every BucketTTL/4. sweep() removes any bucket whose lastAccess
    is older than the cutoff and bumps evictedTotal atomically.
  - Both NewRateLimiter call sites in cmd/server/main.go (default
    stack and no-auth fallback) now thread cfg.RateLimit.BucketTTL.

Regression coverage:
  - TestKeyedRateLimiter_SweepEvictsIdleBuckets: 1000 synthetic IP
    keys populate the map, advance past TTL, call sweep() directly,
    assert map drained to 0 + evictedTotal=1000 + fresh key creates
    new bucket (map not poisoned).
  - TestKeyedRateLimiter_SweepKeepsActiveBuckets: inverse — a bucket
    touched within the TTL window survives the sweep. Catches a
    future regression that inverts the cutoff comparison.

Closes SEC-006.
This commit is contained in:
shankar0123
2026-05-16 04:01:18 +00:00
parent 037876fa0f
commit 8f2e5771db
5 changed files with 246 additions and 11 deletions
+9
View File
@@ -2080,6 +2080,11 @@ func main() {
BurstSize: cfg.RateLimit.BurstSize,
PerUserRPS: cfg.RateLimit.PerUserRPS,
PerUserBurstSize: cfg.RateLimit.PerUserBurstSize,
// SEC-006 (Sprint 2): bounded bucket TTL so a long-running
// server with high-cardinality unauthenticated traffic
// (CGNAT churn, Tor exits, scanners) doesn't grow the map
// indefinitely.
BucketTTL: cfg.RateLimit.BucketTTL,
})
// SEC-003 closure (Sprint 1, 2026-05-16). Pre-fix the
// rate-limit-enabled stack was rebuilt without
@@ -2166,6 +2171,10 @@ func main() {
noAuthRateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{
RPS: cfg.RateLimit.RPS,
BurstSize: cfg.RateLimit.BurstSize,
// SEC-006 closure (Sprint 2): same bucket-TTL eviction for the
// no-auth limiter — this one's the higher exposure since every
// unauthenticated probe gets a fresh IP-keyed bucket.
BucketTTL: cfg.RateLimit.BucketTTL,
})
noAuthMiddleware = append(noAuthMiddleware, noAuthRateLimiter)
}
+123 -6
View File
@@ -11,6 +11,7 @@ import (
"net/http"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
@@ -152,6 +153,14 @@ type RateLimitConfig struct {
// PerUserBurstSize overrides BurstSize for authenticated callers.
// Zero means "use BurstSize".
PerUserBurstSize int
// BucketTTL bounds the lifetime of an unused token bucket in the
// per-key map. The background sweeper runs every (BucketTTL/4) and
// removes entries whose last allow() call is older than BucketTTL.
// Zero or negative values fall through to a 1-hour default; values
// below 1 minute are clamped up to 1 minute (sweeper sanity).
// SEC-006 closure (Sprint 2, 2026-05-16).
BucketTTL time.Duration
}
// NewRateLimiter creates a per-key token bucket rate limiting middleware.
@@ -166,11 +175,18 @@ type RateLimitConfig struct {
// - Unauthenticated: "ip:" + r.RemoteAddr's host portion
//
// The bucket map is sync.RWMutex-guarded; create-on-demand for new keys.
// There is no eviction; for a long-running server with millions of unique
// IPs this can leak memory. A future enhancement is per-key TTL via a
// lazy sweeper. For now the leak is bounded by realistic operator IP
// fan-out and is acceptable per OWASP ASVS L2 (the threat model is abuse
// by a known set of clients, not infinite-cardinality scanners).
//
// SEC-006 closure (Sprint 2, 2026-05-16). Pre-fix the bucket map had no
// eviction, so high-cardinality unauthenticated traffic (CGNAT churn,
// Tor exit lists, botnets, infinite-cardinality scanners) grew process
// memory unboundedly. Each bucket now carries `lastAccess`; a background
// sweeper goroutine (one per limiter) wakes every (bucketTTL / 4) and
// removes entries whose lastAccess is older than `bucketTTL`. Default
// TTL is 1 hour — well above realistic operator IP churn windows so a
// returning client gets the same bucket, but bounded enough that a
// scanner's churn is reclaimed within an hour. Operators can override
// via cfg.BucketTTL (or the CERTCTL_RATE_LIMIT_BUCKET_TTL env var that
// cmd/server/main.go threads through).
func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
// Default per-user budgets to the IP-keyed budget when not overridden.
perUserRPS := cfg.PerUserRPS
@@ -182,14 +198,33 @@ func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
perUserBurst = float64(cfg.BurstSize)
}
// SEC-006: bucket TTL eviction. Default 1h; minimum 1m to keep
// the sweeper from running pathologically often if an operator
// sets a tiny value.
bucketTTL := cfg.BucketTTL
if bucketTTL <= 0 {
bucketTTL = time.Hour
}
if bucketTTL < time.Minute {
bucketTTL = time.Minute
}
limiter := &keyedRateLimiter{
ipRate: cfg.RPS,
ipBurst: float64(cfg.BurstSize),
userRate: perUserRPS,
userBurst: perUserBurst,
buckets: make(map[string]*tokenBucket),
bucketTTL: bucketTTL,
}
// Sweeper goroutine. Single goroutine per limiter; production wires
// 2 limiters (default + no-auth-fallback) so the cost is 2 idle
// goroutines per server. Lives for the process lifetime; no
// shutdown handle is exposed because main.go owns both limiters
// for the entire run.
go limiter.sweepLoop()
return func(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
key, isUser := rateLimitKey(r)
@@ -231,6 +266,12 @@ func rateLimitKey(r *http.Request) (string, bool) {
// keyedRateLimiter holds a token bucket per (user-or-ip) key with separate
// rate / burst defaults for the user-keyed and ip-keyed dimensions.
//
// SEC-006: bucketTTL bounds the unused-bucket lifetime; sweepLoop runs
// in a goroutine spawned by NewRateLimiter and evicts entries whose
// lastAccess is older than bucketTTL on every (bucketTTL/4) tick.
// evictedTotal exposes the lifetime eviction count (atomic-loaded by
// tests and the operator stats endpoint).
type keyedRateLimiter struct {
mu sync.RWMutex
buckets map[string]*tokenBucket
@@ -238,6 +279,14 @@ type keyedRateLimiter struct {
ipBurst float64
userRate float64
userBurst float64
bucketTTL time.Duration
evictedTotal atomic.Uint64
// sweepTick is the channel sweepLoop ticks on. Default time.Ticker;
// tests swap to a manual chan time.Time for deterministic eviction.
// Set via the (test-only) seam noted below; production never
// reassigns this field.
sweepTickCh <-chan time.Time
}
func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
@@ -260,22 +309,90 @@ func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
burstSize: burst,
tokens: burst,
lastRefill: time.Now(),
lastAccess: time.Now(),
}
k.buckets[key] = tb
}
k.mu.Unlock()
}
return tb.allow()
allowed := tb.allow()
// SEC-006: update lastAccess on every call (cheap; same mutex
// the bucket already holds via tb.allow's mu). Sweeper reads
// this to decide eviction.
tb.touch()
return allowed
}
// sweepLoop is the background eviction goroutine spawned by
// NewRateLimiter. It wakes every bucketTTL/4 and removes any bucket
// whose lastAccess is older than bucketTTL. The (bucketTTL/4) cadence
// is a compromise — fast enough to keep the map ceiling tight,
// slow enough that the sweep cost amortises across many requests.
// SEC-006 closure.
func (k *keyedRateLimiter) sweepLoop() {
// Test seam: if a manual tick channel is wired, use it. Production
// always uses time.NewTicker which time.Time-types the channel
// identically.
if k.sweepTickCh != nil {
for range k.sweepTickCh {
k.sweep()
}
return
}
period := k.bucketTTL / 4
if period < time.Second {
period = time.Second
}
t := time.NewTicker(period)
defer t.Stop()
for range t.C {
k.sweep()
}
}
// sweep removes every bucket whose lastAccess is older than bucketTTL
// and bumps evictedTotal. Exported for tests via a same-package alias.
func (k *keyedRateLimiter) sweep() {
cutoff := time.Now().Add(-k.bucketTTL)
k.mu.Lock()
defer k.mu.Unlock()
for key, tb := range k.buckets {
if tb.lastAccessTime().Before(cutoff) {
delete(k.buckets, key)
k.evictedTotal.Add(1)
}
}
}
// tokenBucket implements a simple thread-safe token bucket rate limiter.
// This avoids importing golang.org/x/time/rate to keep dependencies minimal.
//
// SEC-006: lastAccess is updated on every allow() call (via touch()) so
// the keyedRateLimiter sweeper can evict idle buckets without a second
// per-key map. Guarded by the same mu as rate-limiting state.
type tokenBucket struct {
mu sync.Mutex
rate float64 // tokens per second
burstSize float64 // max tokens
tokens float64 // current tokens
lastRefill time.Time // last refill time
lastAccess time.Time // last allow() call — for SEC-006 sweeper
}
// touch updates the bucket's lastAccess timestamp under its own mutex.
// Called from keyedRateLimiter.allow after the rate-limit decision.
func (tb *tokenBucket) touch() {
tb.mu.Lock()
tb.lastAccess = time.Now()
tb.mu.Unlock()
}
// lastAccessTime is the sweeper's read accessor. Uses the bucket's
// own mutex so the read is consistent with concurrent touch() calls.
func (tb *tokenBucket) lastAccessTime() time.Time {
tb.mu.Lock()
defer tb.mu.Unlock()
return tb.lastAccess
}
func (tb *tokenBucket) allow() bool {
@@ -2,9 +2,11 @@ package middleware
import (
"context"
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/certctl-io/certctl/internal/auth"
)
@@ -188,3 +190,94 @@ func TestRateLimiter_M025_EmptyUserKeyTreatedAsAnonymous(t *testing.T) {
t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code)
}
}
// =============================================================================
// SEC-006 closure (Sprint 2, 2026-05-16). The token-bucket map now has
// a background sweeper that evicts buckets whose last allow() call is
// older than the configured BucketTTL. This test pins the eviction
// path against a synthetic 1000-key load and asserts:
//
// 1. Buckets created by N distinct keys land in the map.
// 2. After the simulated TTL elapses and the sweeper runs, the map
// is reclaimed and evictedTotal reflects the count.
// 3. A subsequent request from a fresh key creates a new bucket
// (i.e. the map isn't poisoned by the eviction).
//
// The test calls sweep() directly rather than relying on the goroutine
// + time.Ticker so it stays deterministic and fast. The sweeper
// goroutine itself is exercised in production; this test pins the
// eviction predicate.
// =============================================================================
func TestKeyedRateLimiter_SweepEvictsIdleBuckets(t *testing.T) {
limiter := &keyedRateLimiter{
ipRate: 1000,
ipBurst: 1000,
userRate: 1000,
userBurst: 1000,
buckets: make(map[string]*tokenBucket),
bucketTTL: 100 * time.Millisecond,
}
// Populate 1000 buckets from a synthetic IP-key churn.
for i := 0; i < 1000; i++ {
key := "ip:198.51.100." + fmt.Sprintf("%d", i%256) + "/" + fmt.Sprintf("%d", i)
if !limiter.allow(key, false) {
t.Fatalf("synthetic IP-key %d: allow returned false on first call", i)
}
}
limiter.mu.RLock()
if got := len(limiter.buckets); got != 1000 {
limiter.mu.RUnlock()
t.Fatalf("post-populate bucket count = %d; want 1000", got)
}
limiter.mu.RUnlock()
// Advance past the TTL boundary, then sweep.
time.Sleep(110 * time.Millisecond)
limiter.sweep()
limiter.mu.RLock()
remaining := len(limiter.buckets)
limiter.mu.RUnlock()
if remaining != 0 {
t.Errorf("post-sweep bucket count = %d; want 0 (all should have been evicted)", remaining)
}
if got := limiter.evictedTotal.Load(); got != 1000 {
t.Errorf("evictedTotal = %d; want 1000", got)
}
// A fresh request creates a new bucket — map isn't poisoned.
if !limiter.allow("ip:203.0.113.7", false) {
t.Errorf("fresh key: allow returned false on first call after sweep")
}
limiter.mu.RLock()
defer limiter.mu.RUnlock()
if got := len(limiter.buckets); got != 1 {
t.Errorf("post-sweep-plus-one bucket count = %d; want 1", got)
}
}
// TestKeyedRateLimiter_SweepKeepsActiveBuckets pins the inverse — a
// bucket touched within the TTL window survives the sweep. Catches a
// future regression that inverts the cutoff comparison.
func TestKeyedRateLimiter_SweepKeepsActiveBuckets(t *testing.T) {
limiter := &keyedRateLimiter{
ipRate: 1000,
ipBurst: 1000,
userRate: 1000,
userBurst: 1000,
buckets: make(map[string]*tokenBucket),
bucketTTL: 1 * time.Hour, // generous so test timing doesn't flake
}
limiter.allow("ip:198.51.100.42", false)
limiter.sweep()
limiter.mu.RLock()
defer limiter.mu.RUnlock()
if got := len(limiter.buckets); got != 1 {
t.Errorf("active-bucket count = %d; want 1 (sweep should not evict within TTL)", got)
}
if got := limiter.evictedTotal.Load(); got != 0 {
t.Errorf("evictedTotal = %d; want 0 (no evictions expected)", got)
}
}
+5
View File
@@ -451,6 +451,11 @@ func Load() (*Config, error) {
BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100),
PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0),
PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0),
// SEC-006 closure (Sprint 2, 2026-05-16): bounded unused-bucket
// lifetime. 1h chosen to be well above realistic operator IP
// churn (returning clients keep their bucket) and well below
// the unbounded-leak window the pre-fix code allowed.
BucketTTL: getEnvDuration("CERTCTL_RATE_LIMIT_BUCKET_TTL", 1*time.Hour),
SlidingWindowBackend: getEnv("CERTCTL_RATE_LIMIT_BACKEND", "memory"),
SlidingWindowJanitorInterval: getEnvDuration("CERTCTL_RATE_LIMIT_JANITOR_INTERVAL", 5*time.Minute),
},
+11
View File
@@ -342,6 +342,17 @@ type RateLimitConfig struct {
// Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable.
PerUserBurstSize int
// BucketTTL bounds the unused-bucket lifetime in the token-bucket
// map. Idle buckets older than BucketTTL are reclaimed by a
// background sweeper running every (BucketTTL/4). Default 1 hour;
// values < 1 minute are clamped up to 1 minute in the limiter
// constructor. Set this lower if the server faces high-cardinality
// unauthenticated traffic (CGNAT churn, Tor exit lists, scanners)
// and the map RSS becomes a concern.
// SEC-006 closure (Sprint 2, 2026-05-16).
// Setting: CERTCTL_RATE_LIMIT_BUCKET_TTL environment variable.
BucketTTL time.Duration
// SlidingWindowBackend selects which backend implements the
// per-key sliding-window-log limiters wired in cmd/server/main.go
// (break-glass login, OCSP per-IP, cert-export per-actor, EST