diff --git a/cmd/server/main.go b/cmd/server/main.go index 2ee96b8..40cc387 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -2080,6 +2080,11 @@ func main() { BurstSize: cfg.RateLimit.BurstSize, PerUserRPS: cfg.RateLimit.PerUserRPS, PerUserBurstSize: cfg.RateLimit.PerUserBurstSize, + // SEC-006 (Sprint 2): bounded bucket TTL so a long-running + // server with high-cardinality unauthenticated traffic + // (CGNAT churn, Tor exits, scanners) doesn't grow the map + // indefinitely. + BucketTTL: cfg.RateLimit.BucketTTL, }) // SEC-003 closure (Sprint 1, 2026-05-16). Pre-fix the // rate-limit-enabled stack was rebuilt without @@ -2166,6 +2171,10 @@ func main() { noAuthRateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{ RPS: cfg.RateLimit.RPS, BurstSize: cfg.RateLimit.BurstSize, + // SEC-006 closure (Sprint 2): same bucket-TTL eviction for the + // no-auth limiter — this one's the higher exposure since every + // unauthenticated probe gets a fresh IP-keyed bucket. + BucketTTL: cfg.RateLimit.BucketTTL, }) noAuthMiddleware = append(noAuthMiddleware, noAuthRateLimiter) } diff --git a/internal/api/middleware/middleware.go b/internal/api/middleware/middleware.go index 2d7f6dc..3691b40 100644 --- a/internal/api/middleware/middleware.go +++ b/internal/api/middleware/middleware.go @@ -11,6 +11,7 @@ import ( "net/http" "strings" "sync" + "sync/atomic" "time" "github.com/google/uuid" @@ -152,6 +153,14 @@ type RateLimitConfig struct { // PerUserBurstSize overrides BurstSize for authenticated callers. // Zero means "use BurstSize". PerUserBurstSize int + + // BucketTTL bounds the lifetime of an unused token bucket in the + // per-key map. The background sweeper runs every (BucketTTL/4) and + // removes entries whose last allow() call is older than BucketTTL. + // Zero or negative values fall through to a 1-hour default; values + // below 1 minute are clamped up to 1 minute (sweeper sanity). + // SEC-006 closure (Sprint 2, 2026-05-16). + BucketTTL time.Duration } // NewRateLimiter creates a per-key token bucket rate limiting middleware. @@ -166,11 +175,18 @@ type RateLimitConfig struct { // - Unauthenticated: "ip:" + r.RemoteAddr's host portion // // The bucket map is sync.RWMutex-guarded; create-on-demand for new keys. -// There is no eviction; for a long-running server with millions of unique -// IPs this can leak memory. A future enhancement is per-key TTL via a -// lazy sweeper. For now the leak is bounded by realistic operator IP -// fan-out and is acceptable per OWASP ASVS L2 (the threat model is abuse -// by a known set of clients, not infinite-cardinality scanners). +// +// SEC-006 closure (Sprint 2, 2026-05-16). Pre-fix the bucket map had no +// eviction, so high-cardinality unauthenticated traffic (CGNAT churn, +// Tor exit lists, botnets, infinite-cardinality scanners) grew process +// memory unboundedly. Each bucket now carries `lastAccess`; a background +// sweeper goroutine (one per limiter) wakes every (bucketTTL / 4) and +// removes entries whose lastAccess is older than `bucketTTL`. Default +// TTL is 1 hour — well above realistic operator IP churn windows so a +// returning client gets the same bucket, but bounded enough that a +// scanner's churn is reclaimed within an hour. Operators can override +// via cfg.BucketTTL (or the CERTCTL_RATE_LIMIT_BUCKET_TTL env var that +// cmd/server/main.go threads through). func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler { // Default per-user budgets to the IP-keyed budget when not overridden. perUserRPS := cfg.PerUserRPS @@ -182,14 +198,33 @@ func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler { perUserBurst = float64(cfg.BurstSize) } + // SEC-006: bucket TTL eviction. Default 1h; minimum 1m to keep + // the sweeper from running pathologically often if an operator + // sets a tiny value. + bucketTTL := cfg.BucketTTL + if bucketTTL <= 0 { + bucketTTL = time.Hour + } + if bucketTTL < time.Minute { + bucketTTL = time.Minute + } + limiter := &keyedRateLimiter{ ipRate: cfg.RPS, ipBurst: float64(cfg.BurstSize), userRate: perUserRPS, userBurst: perUserBurst, buckets: make(map[string]*tokenBucket), + bucketTTL: bucketTTL, } + // Sweeper goroutine. Single goroutine per limiter; production wires + // 2 limiters (default + no-auth-fallback) so the cost is 2 idle + // goroutines per server. Lives for the process lifetime; no + // shutdown handle is exposed because main.go owns both limiters + // for the entire run. + go limiter.sweepLoop() + return func(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { key, isUser := rateLimitKey(r) @@ -231,6 +266,12 @@ func rateLimitKey(r *http.Request) (string, bool) { // keyedRateLimiter holds a token bucket per (user-or-ip) key with separate // rate / burst defaults for the user-keyed and ip-keyed dimensions. +// +// SEC-006: bucketTTL bounds the unused-bucket lifetime; sweepLoop runs +// in a goroutine spawned by NewRateLimiter and evicts entries whose +// lastAccess is older than bucketTTL on every (bucketTTL/4) tick. +// evictedTotal exposes the lifetime eviction count (atomic-loaded by +// tests and the operator stats endpoint). type keyedRateLimiter struct { mu sync.RWMutex buckets map[string]*tokenBucket @@ -238,6 +279,14 @@ type keyedRateLimiter struct { ipBurst float64 userRate float64 userBurst float64 + + bucketTTL time.Duration + evictedTotal atomic.Uint64 + // sweepTick is the channel sweepLoop ticks on. Default time.Ticker; + // tests swap to a manual chan time.Time for deterministic eviction. + // Set via the (test-only) seam noted below; production never + // reassigns this field. + sweepTickCh <-chan time.Time } func (k *keyedRateLimiter) allow(key string, isUser bool) bool { @@ -260,22 +309,90 @@ func (k *keyedRateLimiter) allow(key string, isUser bool) bool { burstSize: burst, tokens: burst, lastRefill: time.Now(), + lastAccess: time.Now(), } k.buckets[key] = tb } k.mu.Unlock() } - return tb.allow() + allowed := tb.allow() + // SEC-006: update lastAccess on every call (cheap; same mutex + // the bucket already holds via tb.allow's mu). Sweeper reads + // this to decide eviction. + tb.touch() + return allowed +} + +// sweepLoop is the background eviction goroutine spawned by +// NewRateLimiter. It wakes every bucketTTL/4 and removes any bucket +// whose lastAccess is older than bucketTTL. The (bucketTTL/4) cadence +// is a compromise — fast enough to keep the map ceiling tight, +// slow enough that the sweep cost amortises across many requests. +// SEC-006 closure. +func (k *keyedRateLimiter) sweepLoop() { + // Test seam: if a manual tick channel is wired, use it. Production + // always uses time.NewTicker which time.Time-types the channel + // identically. + if k.sweepTickCh != nil { + for range k.sweepTickCh { + k.sweep() + } + return + } + period := k.bucketTTL / 4 + if period < time.Second { + period = time.Second + } + t := time.NewTicker(period) + defer t.Stop() + for range t.C { + k.sweep() + } +} + +// sweep removes every bucket whose lastAccess is older than bucketTTL +// and bumps evictedTotal. Exported for tests via a same-package alias. +func (k *keyedRateLimiter) sweep() { + cutoff := time.Now().Add(-k.bucketTTL) + k.mu.Lock() + defer k.mu.Unlock() + for key, tb := range k.buckets { + if tb.lastAccessTime().Before(cutoff) { + delete(k.buckets, key) + k.evictedTotal.Add(1) + } + } } // tokenBucket implements a simple thread-safe token bucket rate limiter. // This avoids importing golang.org/x/time/rate to keep dependencies minimal. +// +// SEC-006: lastAccess is updated on every allow() call (via touch()) so +// the keyedRateLimiter sweeper can evict idle buckets without a second +// per-key map. Guarded by the same mu as rate-limiting state. type tokenBucket struct { mu sync.Mutex rate float64 // tokens per second burstSize float64 // max tokens tokens float64 // current tokens lastRefill time.Time // last refill time + lastAccess time.Time // last allow() call — for SEC-006 sweeper +} + +// touch updates the bucket's lastAccess timestamp under its own mutex. +// Called from keyedRateLimiter.allow after the rate-limit decision. +func (tb *tokenBucket) touch() { + tb.mu.Lock() + tb.lastAccess = time.Now() + tb.mu.Unlock() +} + +// lastAccessTime is the sweeper's read accessor. Uses the bucket's +// own mutex so the read is consistent with concurrent touch() calls. +func (tb *tokenBucket) lastAccessTime() time.Time { + tb.mu.Lock() + defer tb.mu.Unlock() + return tb.lastAccess } func (tb *tokenBucket) allow() bool { diff --git a/internal/api/middleware/ratelimit_keyed_test.go b/internal/api/middleware/ratelimit_keyed_test.go index e9971c0..f5a081b 100644 --- a/internal/api/middleware/ratelimit_keyed_test.go +++ b/internal/api/middleware/ratelimit_keyed_test.go @@ -2,9 +2,11 @@ package middleware import ( "context" + "fmt" "net/http" "net/http/httptest" "testing" + "time" "github.com/certctl-io/certctl/internal/auth" ) @@ -188,3 +190,94 @@ func TestRateLimiter_M025_EmptyUserKeyTreatedAsAnonymous(t *testing.T) { t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code) } } + +// ============================================================================= +// SEC-006 closure (Sprint 2, 2026-05-16). The token-bucket map now has +// a background sweeper that evicts buckets whose last allow() call is +// older than the configured BucketTTL. This test pins the eviction +// path against a synthetic 1000-key load and asserts: +// +// 1. Buckets created by N distinct keys land in the map. +// 2. After the simulated TTL elapses and the sweeper runs, the map +// is reclaimed and evictedTotal reflects the count. +// 3. A subsequent request from a fresh key creates a new bucket +// (i.e. the map isn't poisoned by the eviction). +// +// The test calls sweep() directly rather than relying on the goroutine +// + time.Ticker so it stays deterministic and fast. The sweeper +// goroutine itself is exercised in production; this test pins the +// eviction predicate. +// ============================================================================= + +func TestKeyedRateLimiter_SweepEvictsIdleBuckets(t *testing.T) { + limiter := &keyedRateLimiter{ + ipRate: 1000, + ipBurst: 1000, + userRate: 1000, + userBurst: 1000, + buckets: make(map[string]*tokenBucket), + bucketTTL: 100 * time.Millisecond, + } + + // Populate 1000 buckets from a synthetic IP-key churn. + for i := 0; i < 1000; i++ { + key := "ip:198.51.100." + fmt.Sprintf("%d", i%256) + "/" + fmt.Sprintf("%d", i) + if !limiter.allow(key, false) { + t.Fatalf("synthetic IP-key %d: allow returned false on first call", i) + } + } + limiter.mu.RLock() + if got := len(limiter.buckets); got != 1000 { + limiter.mu.RUnlock() + t.Fatalf("post-populate bucket count = %d; want 1000", got) + } + limiter.mu.RUnlock() + + // Advance past the TTL boundary, then sweep. + time.Sleep(110 * time.Millisecond) + limiter.sweep() + + limiter.mu.RLock() + remaining := len(limiter.buckets) + limiter.mu.RUnlock() + if remaining != 0 { + t.Errorf("post-sweep bucket count = %d; want 0 (all should have been evicted)", remaining) + } + if got := limiter.evictedTotal.Load(); got != 1000 { + t.Errorf("evictedTotal = %d; want 1000", got) + } + + // A fresh request creates a new bucket — map isn't poisoned. + if !limiter.allow("ip:203.0.113.7", false) { + t.Errorf("fresh key: allow returned false on first call after sweep") + } + limiter.mu.RLock() + defer limiter.mu.RUnlock() + if got := len(limiter.buckets); got != 1 { + t.Errorf("post-sweep-plus-one bucket count = %d; want 1", got) + } +} + +// TestKeyedRateLimiter_SweepKeepsActiveBuckets pins the inverse — a +// bucket touched within the TTL window survives the sweep. Catches a +// future regression that inverts the cutoff comparison. +func TestKeyedRateLimiter_SweepKeepsActiveBuckets(t *testing.T) { + limiter := &keyedRateLimiter{ + ipRate: 1000, + ipBurst: 1000, + userRate: 1000, + userBurst: 1000, + buckets: make(map[string]*tokenBucket), + bucketTTL: 1 * time.Hour, // generous so test timing doesn't flake + } + limiter.allow("ip:198.51.100.42", false) + limiter.sweep() + limiter.mu.RLock() + defer limiter.mu.RUnlock() + if got := len(limiter.buckets); got != 1 { + t.Errorf("active-bucket count = %d; want 1 (sweep should not evict within TTL)", got) + } + if got := limiter.evictedTotal.Load(); got != 0 { + t.Errorf("evictedTotal = %d; want 0 (no evictions expected)", got) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 9fd8ac3..35a6938 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -446,11 +446,16 @@ func Load() (*Config, error) { }, }, RateLimit: RateLimitConfig{ - Enabled: getEnvBool("CERTCTL_RATE_LIMIT_ENABLED", true), - RPS: getEnvFloat("CERTCTL_RATE_LIMIT_RPS", 50), - BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100), - PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0), - PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0), + Enabled: getEnvBool("CERTCTL_RATE_LIMIT_ENABLED", true), + RPS: getEnvFloat("CERTCTL_RATE_LIMIT_RPS", 50), + BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100), + PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0), + PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0), + // SEC-006 closure (Sprint 2, 2026-05-16): bounded unused-bucket + // lifetime. 1h chosen to be well above realistic operator IP + // churn (returning clients keep their bucket) and well below + // the unbounded-leak window the pre-fix code allowed. + BucketTTL: getEnvDuration("CERTCTL_RATE_LIMIT_BUCKET_TTL", 1*time.Hour), SlidingWindowBackend: getEnv("CERTCTL_RATE_LIMIT_BACKEND", "memory"), SlidingWindowJanitorInterval: getEnvDuration("CERTCTL_RATE_LIMIT_JANITOR_INTERVAL", 5*time.Minute), }, diff --git a/internal/config/server.go b/internal/config/server.go index 5f1bcde..b778ae6 100644 --- a/internal/config/server.go +++ b/internal/config/server.go @@ -342,6 +342,17 @@ type RateLimitConfig struct { // Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable. PerUserBurstSize int + // BucketTTL bounds the unused-bucket lifetime in the token-bucket + // map. Idle buckets older than BucketTTL are reclaimed by a + // background sweeper running every (BucketTTL/4). Default 1 hour; + // values < 1 minute are clamped up to 1 minute in the limiter + // constructor. Set this lower if the server faces high-cardinality + // unauthenticated traffic (CGNAT churn, Tor exit lists, scanners) + // and the map RSS becomes a concern. + // SEC-006 closure (Sprint 2, 2026-05-16). + // Setting: CERTCTL_RATE_LIMIT_BUCKET_TTL environment variable. + BucketTTL time.Duration + // SlidingWindowBackend selects which backend implements the // per-key sliding-window-log limiters wired in cmd/server/main.go // (break-glass login, OCSP per-IP, cert-export per-actor, EST