fix(middleware): SEC-006 — TTL-evict idle token-bucket rate-limiter entries

Sprint 2 unified-master-audit closure. Pre-fix, the keyed rate limiter's bucket map had no eviction. The package-level comment explicitly noted the leak: high-cardinality unauthenticated traffic (CGNAT churn, Tor exit lists, botnets, infinite-cardinality scanners) grew process memory unboundedly. Production deploys with millions of unique IPs would eventually OOM. Fix: - RateLimitConfig.BucketTTL (env CERTCTL_RATE_LIMIT_BUCKET_TTL, default 1h, clamp-floor 1m). 1h chosen to be well above realistic operator IP churn windows (returning clients keep their bucket) and well below the unbounded-leak window the pre-fix code allowed. - tokenBucket gains a lastAccess field that is updated on every allow() call via touch() and read via lastAccessTime() under the bucket's own mutex. - keyedRateLimiter.sweepLoop runs in a single goroutine per limiter (production wires 2: default + no-auth fallback), waking every BucketTTL/4. sweep() removes any bucket whose lastAccess is older than the cutoff and bumps evictedTotal atomically. - Both NewRateLimiter call sites in cmd/server/main.go (default stack and no-auth fallback) now thread cfg.RateLimit.BucketTTL. Regression coverage: - TestKeyedRateLimiter_SweepEvictsIdleBuckets: 1000 synthetic IP keys populate the map, advance past TTL, call sweep() directly, assert map drained to 0 + evictedTotal=1000 + fresh key creates new bucket (map not poisoned). - TestKeyedRateLimiter_SweepKeepsActiveBuckets: inverse — a bucket touched within the TTL window survives the sweep. Catches a future regression that inverts the cutoff comparison. Closes SEC-006.
2026-06-11 08:18:54 +00:00 · 2026-05-16 04:01:18 +00:00
parent 037876fa0f
commit 8f2e5771db
5 changed files with 246 additions and 11 deletions
@@ -2,9 +2,11 @@ package middleware

 import (
 	"context"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"

 	"github.com/certctl-io/certctl/internal/auth"
 )
@@ -188,3 +190,94 @@ func TestRateLimiter_M025_EmptyUserKeyTreatedAsAnonymous(t *testing.T) {
 		t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code)
 	}
 }
+
+// =============================================================================
+// SEC-006 closure (Sprint 2, 2026-05-16). The token-bucket map now has
+// a background sweeper that evicts buckets whose last allow() call is
+// older than the configured BucketTTL. This test pins the eviction
+// path against a synthetic 1000-key load and asserts:
+//
+//   1. Buckets created by N distinct keys land in the map.
+//   2. After the simulated TTL elapses and the sweeper runs, the map
+//      is reclaimed and evictedTotal reflects the count.
+//   3. A subsequent request from a fresh key creates a new bucket
+//      (i.e. the map isn't poisoned by the eviction).
+//
+// The test calls sweep() directly rather than relying on the goroutine
+// + time.Ticker so it stays deterministic and fast. The sweeper
+// goroutine itself is exercised in production; this test pins the
+// eviction predicate.
+// =============================================================================
+
+func TestKeyedRateLimiter_SweepEvictsIdleBuckets(t *testing.T) {
+	limiter := &keyedRateLimiter{
+		ipRate:    1000,
+		ipBurst:   1000,
+		userRate:  1000,
+		userBurst: 1000,
+		buckets:   make(map[string]*tokenBucket),
+		bucketTTL: 100 * time.Millisecond,
+	}
+
+	// Populate 1000 buckets from a synthetic IP-key churn.
+	for i := 0; i < 1000; i++ {
+		key := "ip:198.51.100." + fmt.Sprintf("%d", i%256) + "/" + fmt.Sprintf("%d", i)
+		if !limiter.allow(key, false) {
+			t.Fatalf("synthetic IP-key %d: allow returned false on first call", i)
+		}
+	}
+	limiter.mu.RLock()
+	if got := len(limiter.buckets); got != 1000 {
+		limiter.mu.RUnlock()
+		t.Fatalf("post-populate bucket count = %d; want 1000", got)
+	}
+	limiter.mu.RUnlock()
+
+	// Advance past the TTL boundary, then sweep.
+	time.Sleep(110 * time.Millisecond)
+	limiter.sweep()
+
+	limiter.mu.RLock()
+	remaining := len(limiter.buckets)
+	limiter.mu.RUnlock()
+	if remaining != 0 {
+		t.Errorf("post-sweep bucket count = %d; want 0 (all should have been evicted)", remaining)
+	}
+	if got := limiter.evictedTotal.Load(); got != 1000 {
+		t.Errorf("evictedTotal = %d; want 1000", got)
+	}
+
+	// A fresh request creates a new bucket — map isn't poisoned.
+	if !limiter.allow("ip:203.0.113.7", false) {
+		t.Errorf("fresh key: allow returned false on first call after sweep")
+	}
+	limiter.mu.RLock()
+	defer limiter.mu.RUnlock()
+	if got := len(limiter.buckets); got != 1 {
+		t.Errorf("post-sweep-plus-one bucket count = %d; want 1", got)
+	}
+}
+
+// TestKeyedRateLimiter_SweepKeepsActiveBuckets pins the inverse — a
+// bucket touched within the TTL window survives the sweep. Catches a
+// future regression that inverts the cutoff comparison.
+//
+// The test first asserts that the single request is admitted, so a
+// rejected request fails at its source rather than at the bucket-count
+// or eviction checks further down.
+func TestKeyedRateLimiter_SweepKeepsActiveBuckets(t *testing.T) {
+	limiter := &keyedRateLimiter{
+		ipRate:    1000,
+		ipBurst:   1000,
+		userRate:  1000,
+		userBurst: 1000,
+		buckets:   make(map[string]*tokenBucket),
+		bucketTTL: 1 * time.Hour, // generous so test timing doesn't flake
+	}
+
+	// Touch exactly one key; with a 1000-token burst the first call is
+	// expected to pass, mirroring the assertion in the eviction test.
+	if !limiter.allow("ip:198.51.100.42", false) {
+		t.Fatalf("active key: allow returned false on first call")
+	}
+
+	// Sweep immediately: the bucket was touched just now, well inside
+	// the one-hour TTL, so it must not be evicted.
+	limiter.sweep()
+
+	limiter.mu.RLock()
+	defer limiter.mu.RUnlock()
+	if got := len(limiter.buckets); got != 1 {
+		t.Errorf("active-bucket count = %d; want 1 (sweep should not evict within TTL)", got)
+	}
+	if got := limiter.evictedTotal.Load(); got != 0 {
+		t.Errorf("evictedTotal = %d; want 0 (no evictions expected)", got)
+	}
+}