diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57e7c94..cba0aab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,6 +132,18 @@ jobs: run: | go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out + - name: Multi-replica rate-limit integration test (Phase 13 Sprint 13.2/13.3 — ARCH-M1 closure proof) + # The falsifiable proof that CERTCTL_RATE_LIMIT_BACKEND=postgres + # enforces caps cluster-wide. testcontainers-go spins one + # Postgres container; 3 *PostgresSlidingWindowLimiter instances + # share it; 100 concurrent Allow("test-key") with cap=10 must + # see exactly 10 succeed + 90 ErrRateLimited. Failure here = + # the row-lock arbitration broke; ARCH-M1 closure is invalid. + run: | + go test -tags=integration -race -count=1 -timeout=300s \ + -run TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas \ + ./internal/integration/... + - name: Check Coverage Thresholds # ci-pipeline-cleanup Phase 2: per-package floors moved to # .github/coverage-thresholds.yml. Each entry has `floor:` + diff --git a/cmd/server/main.go b/cmd/server/main.go index 7ac5ec7..d387644 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -577,7 +577,7 @@ func main() { // AuthExemptRouterRoutes path. The service-layer Argon2id lockout // state machine remains the second line of defense. breakglassHandler.SetLoginRateLimiter( - ratelimit.NewSlidingWindowLimiter(5, time.Minute, 50_000), + ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 5, time.Minute, 50_000), ) if cfg.Auth.Breakglass.Enabled { logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.", @@ -1000,7 +1000,7 @@ func main() { // Production hardening II Phase 3: per-source-IP OCSP rate limit. // Window 1m so the cap counts requests per minute. Map cap 50k // matches the SCEP/Intune replay cache cap. Zero disables. - ocspLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000) + ocspLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000) certificateHandler.SetOCSPRateLimiter(ocspLimiter) issuerHandler := handler.NewIssuerHandler(issuerService) targetHandler := handler.NewTargetHandler(targetService) @@ -1065,7 +1065,7 @@ func main() { exportHandler := handler.NewExportHandler(exportService) // Production hardening II Phase 3: per-actor cert-export rate limit. // Window 1h so the cap counts exports per hour. Zero disables. - exportLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000) + exportLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000) exportHandler.SetExportRateLimiter(exportLimiter) bulkRevocationHandler := handler.NewBulkRevocationHandler(bulkRevocationService) @@ -1209,6 +1209,29 @@ func main() { sched.SetSessionGarbageCollector(sessionService) sched.SetBCLReplayGarbageCollector(bclReplayRepo) // Audit 2026-05-10 HIGH-3. sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval) + + // Phase 13 Sprint 13.3 closure (ARCH-M1): when the operator selected + // CERTCTL_RATE_LIMIT_BACKEND=postgres, wire the bucket janitor so + // stale rows from rate_limit_buckets get swept on the configured + // interval. The in-memory backend's prune-on-Allow path keeps + // buckets short-lived without a separate sweep, so we skip the + // loop entirely for backend=memory. + // + // maxWindow = 24h: the EST per-principal limiter is the longest + // window any current caller configures (the breakglass / OCSP / + // export / EST failed-basic limiters use shorter windows). Bump + // this if a new caller introduces a longer window — rows pruned + // inside their window aren't deletable. + if cfg.RateLimit.SlidingWindowBackend == "postgres" { + rateLimitGC := ratelimit.NewPostgresGC(db, 24*time.Hour) + sched.SetRateLimitGarbageCollector(rateLimitGC) + sched.SetRateLimitGCInterval(cfg.RateLimit.SlidingWindowJanitorInterval) + logger.Info("rate-limit GC sweep enabled (postgres backend)", + "interval", cfg.RateLimit.SlidingWindowJanitorInterval.String(), + "max_window", "24h") + } else { + logger.Info("rate-limit backend = memory; postgres GC sweep not wired (in-memory backend self-prunes)") + } logger.Info("session GC sweep enabled", "interval", cfg.Auth.Session.GCInterval.String(), "absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(), @@ -1532,7 +1555,7 @@ func main() { // release. The shared SlidingWindowLimiter applies the same // math the SCEP/Intune limiter uses — extracted in Phase 4.1 // of this bundle so both call sites share the implementation. - failed := ratelimit.NewSlidingWindowLimiter(10, time.Hour, 50_000) + failed := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 10, time.Hour, 50_000) estHandler.SetSourceIPRateLimiter(failed) } // Phase 2.1: mTLS sibling route. When MTLSEnabled=true, build a @@ -1588,7 +1611,7 @@ func main() { mtlsHandler.SetChannelBindingRequired(profile.ChannelBindingRequired) mtlsHandler.SetServerKeygenEnabled(profile.ServerKeygenEnabled) if profile.RateLimitPerPrincipal24h > 0 { - perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000) + perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000) mtlsHandler.SetPerPrincipalRateLimiter(perPrincipal) } estMTLSHandlers[profile.PathID] = mtlsHandler @@ -1610,7 +1633,7 @@ func main() { // when configured). The mTLS handler above gets its own // limiter instance so the two routes don't share a bucket. if profile.RateLimitPerPrincipal24h > 0 { - perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000) + perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000) estHandler.SetPerPrincipalRateLimiter(perPrincipal) } estHandlers[profile.PathID] = estHandler diff --git a/deploy/helm/certctl/templates/server-configmap.yaml b/deploy/helm/certctl/templates/server-configmap.yaml index a04fdb9..fee00c9 100644 --- a/deploy/helm/certctl/templates/server-configmap.yaml +++ b/deploy/helm/certctl/templates/server-configmap.yaml @@ -12,6 +12,8 @@ data: keygen-mode: {{ .Values.server.keygen.mode | quote }} rate-limit-rps: {{ .Values.server.rateLimiting.rps | quote }} rate-limit-burst: {{ .Values.server.rateLimiting.burst | quote }} + rate-limit-backend: {{ .Values.server.rateLimiting.backend | default "memory" | quote }} + rate-limit-janitor-interval: {{ .Values.server.rateLimiting.janitorInterval | default "5m" | quote }} {{- if .Values.server.cors.origins }} cors-origins: {{ .Values.server.cors.origins | quote }} {{- end }} diff --git a/deploy/helm/certctl/templates/server-deployment.yaml b/deploy/helm/certctl/templates/server-deployment.yaml index 41e790d..e32eb62 100644 --- a/deploy/helm/certctl/templates/server-deployment.yaml +++ b/deploy/helm/certctl/templates/server-deployment.yaml @@ -108,6 +108,19 @@ spec: configMapKeyRef: name: {{ include "certctl.fullname" . }}-server key: rate-limit-burst + # Phase 13 Sprint 13.3 (ARCH-M1) — cross-replica-consistent + # sliding-window rate limiter. Default memory; flip to + # postgres when server.replicas > 1. + - name: CERTCTL_RATE_LIMIT_BACKEND + valueFrom: + configMapKeyRef: + name: {{ include "certctl.fullname" . }}-server + key: rate-limit-backend + - name: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL + valueFrom: + configMapKeyRef: + name: {{ include "certctl.fullname" . }}-server + key: rate-limit-janitor-interval {{- if .Values.server.cors.origins }} - name: CERTCTL_CORS_ORIGINS valueFrom: diff --git a/deploy/helm/certctl/values.yaml b/deploy/helm/certctl/values.yaml index ebfab05..ad5db0e 100644 --- a/deploy/helm/certctl/values.yaml +++ b/deploy/helm/certctl/values.yaml @@ -211,8 +211,25 @@ server: # Rate limiting configuration rateLimiting: - rps: 100 # Requests per second - burst: 200 # Burst capacity + rps: 100 # Requests per second (token-bucket middleware) + burst: 200 # Burst capacity (token-bucket middleware) + + # Sliding-window-log rate-limit backend (Phase 13 Sprint 13.2/13.3 + # ARCH-M1 closure). Selects the implementation backing the + # break-glass / OCSP / cert-export / EST limiters. See + # docs/operator/observability.md for the operator decision tree. + # + # memory — per-process (default; single-replica deploys). + # postgres — cross-replica-consistent via rate_limit_buckets. + # REQUIRED when server.replicas > 1 for accurate + # cluster-wide enforcement. + backend: memory + + # Scheduler janitor interval for the postgres backend's + # rate_limit_buckets sweep. Ignored when backend=memory (the + # in-memory backend self-prunes on every Allow call). + # Default 5m; minimum 1m. + janitorInterval: "5m" # Network scanning configuration networkScan: diff --git a/docs/operator/observability.md b/docs/operator/observability.md index 387111b..04dd7ed 100644 --- a/docs/operator/observability.md +++ b/docs/operator/observability.md @@ -121,52 +121,142 @@ explicitly scrubs the password before it reaches the audit subsystem (see [`docs/operator/auth-threat-model.md`](auth-threat-model.md) § "Break-glass token leak"). -## Rate-limit behavior under restarts and replicas +## Rate-limit behavior — configurable backend (memory or postgres) -Where rate limits exist, they are **per-process, in-memory, -reset-on-restart, and not shared across replicas**. This matters for -multi-replica deployments and for any compliance posture that asks -"what limits apply globally vs per-pod." +The sliding-window-log rate limiters used across certctl's +authenticated-but-shared-credential code paths (break-glass login, +OCSP per-IP, cert-export per-actor, EST per-principal, EST +failed-basic source-IP) carry a **configurable backend**. The +operator picks between two implementations via +`CERTCTL_RATE_LIMIT_BACKEND`: + +| Value | When to use | +|------------|------------------------------------------------------| +| `memory` | Default. Single-replica deploys; sketchpad / dev. | +| `postgres` | HA deploys (`server.replicas > 1`). Cross-replica-consistent. | + +Phase 13 Sprint 13.2/13.3 (architecture diligence audit ARCH-M1 +closure) replaced the prior single-process limitation with a +substantive close: when the operator opts into `postgres`, all +replicas share the same +`rate_limit_buckets` table (migration 000046) and per-key access is +arbitrated via `SELECT FOR UPDATE` row locks. A 3-replica cluster +hitting one rate-limited endpoint concurrently sees exactly the +configured cap succeed across the cluster — not 3× the cap as the +old per-process backend would have allowed. + +### Operator decision tree + +``` +Single replica (server.replicas = 1, the helm chart default)? + └─ Use CERTCTL_RATE_LIMIT_BACKEND=memory (the default; no action + required). Bucket lookups stay in-process; zero DB round-trips + on the hot path. + +Two or more replicas? + └─ Use CERTCTL_RATE_LIMIT_BACKEND=postgres. Two extra DB round-trips + per Allow call (BEGIN ... SELECT FOR UPDATE ... UPDATE ... COMMIT); + acceptable on the gated hot path. The Sprint 13.2 multi-replica + integration test pins exactly-cap enforcement across N replicas + as the closure proof. +``` ### Inventory -| Limiter | Scope | Window | Cap | Survives restart? | Shared across replicas? | -|---|---|---|---|---|---| -| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts | No | No | -| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) | No | No | -| EST per-principal CSR enrollment | `internal/est/` | 60s | configurable | No | No | -| EST HTTP-Basic source-IP failed-auth | `internal/est/` | 60s | configurable | No | No | -| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable | No | No | +| Limiter | Scope | Window | Cap | +|---|---|---|---| +| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts | +| OCSP query (per source-IP) | `internal/api/handler/certificates.go` | 60s | configurable (`CERTCTL_OCSP_RATE_LIMIT_PER_IP_MIN`) | +| Cert export (per actor) | `internal/api/handler/export.go` | 1h | configurable (`CERTCTL_CERT_EXPORT_RATE_LIMIT_PER_ACTOR_HR`) | +| EST per-principal CSR enrollment | `internal/api/handler/est.go` | 24h | configurable (per-profile `RateLimitPerPrincipal24h`) | +| EST HTTP-Basic source-IP failed-auth | `internal/api/handler/est.go` | 60m | 10 attempts | +| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) | +| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable | -All five use the shared `internal/ratelimit/sliding_window.go` -primitive. Buckets live in a single per-process map guarded by a -mutex; the package-level cap prevents unbounded growth under -adversarial key cardinality (default 100,000 keys; oldest-by-newest- -timestamp evicted under pressure). +The `CERTCTL_RATE_LIMIT_BACKEND` selector applies to the first five +(the cmd/server-wired limiters). The SCEP/Intune wrapper + the ACME +per-account limiter ride their own internal accounting today; both +are tracked as follow-ups in WORKSPACE-ROADMAP.md. -### Implications for multi-replica deployments +### Backend internals -- **Effective per-replica cap is the documented cap.** A 2-replica - deployment lets through up to 2× the per-key window cap before - either replica rejects. -- **Restart resets the bucket.** A `kubectl rollout restart` empties - the in-memory windows on every replica. An attacker who notices - this could in principle re-issue burst attempts after every roll; - the threat model accepts this because rollouts are operator-driven - and the relevant endpoints already require credentials. -- **No cross-replica fan-out.** Rate-limit decisions on replica A - are not visible to replica B. Sticky-session ingress routing (with - `service.spec.sessionAffinity: ClientIP` on Kubernetes or the - equivalent on your load balancer) tightens the effective cap to - per-replica + per-source-IP rather than per-replica + per-source-IP - for whichever pod the request happened to land on. +Both backends share the algorithm: sliding-window log + per-key +bucket + prune-on-Allow. -If your threat model requires globally-enforced rate limits across -replicas, the implementation surface is roughly: swap the per-process -map for a database-backed sliding window (or a Redis-backed equivalent -if you already run Redis). This is on the -[WORKSPACE-ROADMAP.md](../../WORKSPACE-ROADMAP.md) as a v3 item; -nothing in the certctl threat model today requires it. +**Memory backend (`memory`)** — per-process map keyed by bucket key; +mutex-guarded; package-level LRU cap prevents unbounded growth under +adversarial key cardinality (default 100,000 keys per limiter +instance; oldest-by-newest-timestamp evicted under pressure). +Implemented at `internal/ratelimit/sliding_window.go`. + +**Postgres backend (`postgres`)** — same algorithm against the +`rate_limit_buckets` table: + +```sql +CREATE TABLE rate_limit_buckets ( + bucket_key TEXT PRIMARY KEY, + timestamps TIMESTAMPTZ[] NOT NULL DEFAULT '{}', + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); +``` + +`Allow(key, now)` opens a transaction, ensures the row exists +(`INSERT ... ON CONFLICT DO NOTHING`), acquires the row lock +(`SELECT ... FOR UPDATE`), prunes timestamps older than `now-window`, +compares the post-prune count against `maxN`, conditionally appends +`now`, persists, and commits. The row lock is what arbitrates across +replicas: replicas A and B firing simultaneous `Allow("k")` never +race because Postgres serializes the per-key row update across the +cluster. Implemented at +`internal/ratelimit/postgres_sliding_window.go`. + +### Janitor sweep (postgres backend only) + +The scheduler runs a `rate_limit_buckets` janitor every +`CERTCTL_RATE_LIMIT_JANITOR_INTERVAL` (default 5m, minimum 1m). The +sweep deletes rows whose `updated_at` is older than the longest +configured window any limiter uses (24h today, matching the EST +per-principal limiter). Idempotent; repeated sweeps find zero rows. +The memory backend's prune-on-Allow path keeps buckets short-lived +without a separate sweep, so the loop is a no-op when +`backend=memory`. + +### Falsifiable closure proof + +The Phase 13 Sprint 13.2 integration test +`internal/integration/ratelimit_multi_replica_test.go` +(`//go:build integration`) fires 100 concurrent `Allow("test-key")` +calls round-robined across 3 independent `PostgresSlidingWindowLimiter` +instances sharing one Postgres database (`cap=10`, `window=1m`) and +asserts exactly 10 succeed + 90 return `ErrRateLimited`. If the +cross-replica row lock weren't arbitrating, each replica would +independently let through ~3-4 requests, giving 12-15 successes +total. Re-run: + +``` +go test -tags=integration -count=1 -run TestRateLimit_MultiReplica \ + ./internal/integration/... +``` + +### Helm chart wiring + +The helm chart at `deploy/helm/certctl/` exposes the backend via +`server.rateLimiting.backend` (default `memory`). To opt into the +postgres backend for an HA deploy: + +``` +helm upgrade --install certctl deploy/helm/certctl \ + --set server.replicas=3 \ + --set server.rateLimiting.backend=postgres \ + --set server.rateLimiting.janitorInterval=5m +``` + +`server.replicas > 1` without flipping `backend` to `postgres` works +fine — the limits stay per-process — but the operator gets a 2× / +3× / Nx effective cap depending on replica count. The chart does NOT +auto-flip on `replicas > 1` because some HA deploys deliberately want +per-process limits (sticky-session ingress + tight per-replica caps +to detect bot traffic at the edge before it hits the application). ### Where these numbers live diff --git a/internal/api/handler/auth_breakglass.go b/internal/api/handler/auth_breakglass.go index 350aea6..e693847 100644 --- a/internal/api/handler/auth_breakglass.go +++ b/internal/api/handler/auth_breakglass.go @@ -78,7 +78,7 @@ type AuthBreakglassHandler struct { // nil-safe: when unset, the handler skips the limiter check and // relies on the service-layer Argon2id lockout. Production deploys // MUST set this via SetLoginRateLimiter. - loginLimiter *ratelimit.SlidingWindowLimiter + loginLimiter ratelimit.Limiter } // NewAuthBreakglassHandler constructs the handler. @@ -89,7 +89,7 @@ func NewAuthBreakglassHandler(svc BreakglassService, cookieAttrs SessionCookieAt // SetLoginRateLimiter wires the per-source-IP rate limiter the Login // handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler // type docstring for the full rationale. -func (h *AuthBreakglassHandler) SetLoginRateLimiter(l *ratelimit.SlidingWindowLimiter) { +func (h *AuthBreakglassHandler) SetLoginRateLimiter(l ratelimit.Limiter) { h.loginLimiter = l } diff --git a/internal/api/handler/certificates.go b/internal/api/handler/certificates.go index 94be2b6..2d45b6e 100644 --- a/internal/api/handler/certificates.go +++ b/internal/api/handler/certificates.go @@ -52,7 +52,7 @@ type CertificateService interface { // CertificateHandler handles HTTP requests for certificate operations. type CertificateHandler struct { svc CertificateService - ocspLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3 — per-source-IP cap on OCSP + ocspLimiter ratelimit.Limiter // production hardening II Phase 3 — per-source-IP cap on OCSP } // NewCertificateHandler creates a new CertificateHandler with a service dependency. @@ -65,7 +65,7 @@ func NewCertificateHandler(svc CertificateService) CertificateHandler { // cmd/server/main.go): 1000 req/min/IP. Setting to nil disables the // limit; the limiter's own NewSlidingWindowLimiter(maxN<=0, ...) // also produces a no-op limiter, so the env-var-zero case is safe. -func (h *CertificateHandler) SetOCSPRateLimiter(l *ratelimit.SlidingWindowLimiter) { +func (h *CertificateHandler) SetOCSPRateLimiter(l ratelimit.Limiter) { h.ocspLimiter = l } diff --git a/internal/api/handler/est.go b/internal/api/handler/est.go index a4f8686..207ffe4 100644 --- a/internal/api/handler/est.go +++ b/internal/api/handler/est.go @@ -100,13 +100,13 @@ type ESTHandler struct { // EST RFC 7030 hardening Phase 3.3: per-handler source-IP rate // limiter for FAILED HTTP Basic auth attempts. Keyed by sourceIP so // a hostile network segment can't burn through the password. - failedBasicLimiter *ratelimit.SlidingWindowLimiter + failedBasicLimiter ratelimit.Limiter // EST RFC 7030 hardening Phase 4.2: per-handler per-principal sliding- // window rate limit. Keyed by (CSR-CN, sourceIP) so a stolen // bootstrap cert AND a known device CN can't be used to flood the // issuer. Disabled when nil; configured per-profile. - perPrincipalLimiter *ratelimit.SlidingWindowLimiter + perPrincipalLimiter ratelimit.Limiter // labelForLog gives observability code a per-profile string to // include in audit log lines / Prometheus labels. Defaults to @@ -170,7 +170,7 @@ func (h *ESTHandler) SetEnrollmentPassword(pw string) { h.basicPassword = pw } // rate limiter. Phase 3.3. Disabled when nil — but Validate() at // startup refuses an enabled basic-auth profile without a configured // limiter, so a real deploy always wires one. -func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) { +func (h *ESTHandler) SetSourceIPRateLimiter(l ratelimit.Limiter) { h.failedBasicLimiter = l } @@ -179,7 +179,7 @@ func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) { // every successful enrollment, NOT just failures — the goal is to // bound enrollment-flooding from a compromised credential, not just // failed-auth brute force. -func (h *ESTHandler) SetPerPrincipalRateLimiter(l *ratelimit.SlidingWindowLimiter) { +func (h *ESTHandler) SetPerPrincipalRateLimiter(l ratelimit.Limiter) { h.perPrincipalLimiter = l } diff --git a/internal/api/handler/export.go b/internal/api/handler/export.go index d6fc481..1575e1f 100644 --- a/internal/api/handler/export.go +++ b/internal/api/handler/export.go @@ -28,7 +28,7 @@ type ExportService interface { // ExportHandler handles HTTP requests for certificate export operations. type ExportHandler struct { svc ExportService - exportLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3 + exportLimiter ratelimit.Limiter // production hardening II Phase 3 } // NewExportHandler creates a new ExportHandler with a service dependency. @@ -40,7 +40,7 @@ func NewExportHandler(svc ExportService) ExportHandler { // Production hardening II Phase 3. Default cap (when set in // cmd/server/main.go): 50 exports/hr/operator. Setting to nil // disables the limit. -func (h *ExportHandler) SetExportRateLimiter(l *ratelimit.SlidingWindowLimiter) { +func (h *ExportHandler) SetExportRateLimiter(l ratelimit.Limiter) { h.exportLimiter = l } diff --git a/internal/config/config.go b/internal/config/config.go index ec9ab6d..5e02dd4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -441,11 +441,13 @@ func Load() (*Config, error) { }, }, RateLimit: RateLimitConfig{ - Enabled: getEnvBool("CERTCTL_RATE_LIMIT_ENABLED", true), - RPS: getEnvFloat("CERTCTL_RATE_LIMIT_RPS", 50), - BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100), - PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0), - PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0), + Enabled: getEnvBool("CERTCTL_RATE_LIMIT_ENABLED", true), + RPS: getEnvFloat("CERTCTL_RATE_LIMIT_RPS", 50), + BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100), + PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0), + PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0), + SlidingWindowBackend: getEnv("CERTCTL_RATE_LIMIT_BACKEND", "memory"), + SlidingWindowJanitorInterval: getEnvDuration("CERTCTL_RATE_LIMIT_JANITOR_INTERVAL", 5*time.Minute), }, CORS: CORSConfig{ AllowedOrigins: getEnvList("CERTCTL_CORS_ORIGINS", nil), @@ -764,6 +766,36 @@ func (c *Config) Validate() error { ) } + // Phase 13 Sprint 13.3 closure (ARCH-M1): validate + // CERTCTL_RATE_LIMIT_BACKEND is one of the two supported values. + // Fail-closed on any other input so a typo doesn't silently fall + // back to the wrong backend (the operator picked "postgress" and + // got memory rate-limits in a 3-replica cluster). + switch c.RateLimit.SlidingWindowBackend { + case "", "memory", "postgres": + // "" is treated as "memory" — test-built Configs (which + // construct the struct literal directly without going + // through Load()) don't get the default; Load() always + // fills "memory". Either path lands the runtime on the + // in-memory backend. + default: + return fmt.Errorf( + "invalid CERTCTL_RATE_LIMIT_BACKEND=%q — refuse to start: must be \"memory\" (default, per-process limits; for single-replica deploys) or \"postgres\" (cross-replica-consistent via the rate_limit_buckets table; required for HA deploys). See docs/operator/observability.md.", + c.RateLimit.SlidingWindowBackend, + ) + } + // Janitor interval lower bound — 1 minute. Below this the sweep + // cost outweighs the row-cleanup benefit; above this still + // matches the operator's bound (5 minutes default; can be raised + // indefinitely). + if c.RateLimit.SlidingWindowJanitorInterval > 0 && + c.RateLimit.SlidingWindowJanitorInterval < time.Minute { + return fmt.Errorf( + "invalid CERTCTL_RATE_LIMIT_JANITOR_INTERVAL=%v — refuse to start: must be ≥ 1 minute (default 5m).", + c.RateLimit.SlidingWindowJanitorInterval, + ) + } + // Validate database configuration if c.Database.URL == "" { return fmt.Errorf("database URL is required") diff --git a/internal/config/server.go b/internal/config/server.go index 2aede91..085760e 100644 --- a/internal/config/server.go +++ b/internal/config/server.go @@ -321,6 +321,46 @@ type RateLimitConfig struct { // zero, BurstSize is used. Default: 0 (use BurstSize). // Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable. PerUserBurstSize int + + // SlidingWindowBackend selects which backend implements the + // per-key sliding-window-log limiters wired in cmd/server/main.go + // (break-glass login, OCSP per-IP, cert-export per-actor, EST + // per-principal, EST failed-basic source-IP). Distinct from the + // token-bucket fields above — those are middleware RPS limits + // applied across every request via the http handler chain; this + // field controls the sliding-window-log primitive used by + // authenticated-but-shared-credential code paths. + // + // Valid values: + // "memory" — per-process, sync.Mutex-guarded map (historical + // default; perfect for single-replica deploys). + // "postgres" — cross-replica-consistent via the + // rate_limit_buckets table (migration 000046). + // SELECT FOR UPDATE arbitrates per-key access + // across the cluster. Adds ~2 DB round-trips per + // Allow call; acceptable on the gated hot path. + // + // Default: "memory". HA deploys with server.replicas > 1 should + // flip to "postgres" so a 2-replica deployment doesn't effectively + // double the per-key cap. + // + // Phase 13 Sprint 13.2/13.3 closure (architecture diligence audit + // ARCH-M1). See docs/operator/observability.md. + // + // Setting: CERTCTL_RATE_LIMIT_BACKEND environment variable. + SlidingWindowBackend string + + // SlidingWindowJanitorInterval is how often the scheduler sweeps + // stale rows from rate_limit_buckets. A row is stale when its + // updated_at is older than the longest configured window any + // caller uses (currently 24h for the EST per-principal limiter). + // Default: 5 minutes. Minimum: 1 minute. No-op when + // SlidingWindowBackend = "memory" (the in-memory backend's + // prune-on-Allow path keeps buckets short-lived without a + // separate sweep). + // + // Setting: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL environment variable. + SlidingWindowJanitorInterval time.Duration } // CORSConfig contains CORS configuration. diff --git a/internal/ratelimit/factory.go b/internal/ratelimit/factory.go new file mode 100644 index 0000000..b522432 --- /dev/null +++ b/internal/ratelimit/factory.go @@ -0,0 +1,65 @@ +// Copyright 2026 certctl LLC. All rights reserved. +// SPDX-License-Identifier: BUSL-1.1 + +package ratelimit + +import ( + "database/sql" + "fmt" + "time" +) + +// Phase 13 Sprint 13.3 (2026-05-14, architecture diligence audit +// ARCH-M1): the backend-selector factory. Wires every +// `ratelimit.NewSlidingWindowLimiter(...)` call site in +// cmd/server/main.go through here so the operator-chosen backend +// (CERTCTL_RATE_LIMIT_BACKEND={memory,postgres}) gates the limiter +// type without each call site replicating the switch. +// +// Caller-visible behavior contract: NewLimiter(backend="memory", ...) +// returns a *SlidingWindowLimiter identical to a direct +// NewSlidingWindowLimiter call. NewLimiter(backend="postgres", ...) +// returns a *PostgresSlidingWindowLimiter with the same Allow(key, now) +// signature + the same ErrRateLimited sentinel + the same maxN<=0 +// disabled semantics. Sprint 13.3's "no signature change" rule is +// what makes the swap drop-in. +// +// The mapCap argument is the in-memory backend's per-instance +// key-cap (LRU-evicted under pressure). Postgres backend has no +// equivalent — the table grows until the scheduler janitor sweeps +// stale rows; mapCap is accepted + ignored for that backend so the +// factory signature stays drop-in identical to NewSlidingWindowLimiter. + +// NewLimiter returns a Limiter backed by either the in-memory +// SlidingWindowLimiter (backend="memory") or the +// PostgresSlidingWindowLimiter (backend="postgres"). +// +// `backend` is validated by config.Validate() at startup; any other +// value here panics — config validation is the SoT, this is just +// defensive in case the call site somehow bypasses startup +// validation. +// +// `db` is required when backend="postgres" and ignored when +// backend="memory". The factory does not nil-check db for the +// memory branch because requiring a meaningful db handle for the +// memory path would couple every limiter call site to the database +// pool unnecessarily. +// +// `maxN <= 0` disables the limiter (both backends honor the +// opt-out — all Allow calls return nil). +func NewLimiter(backend string, db *sql.DB, maxN int, window time.Duration, mapCap int) Limiter { + switch backend { + case "memory": + return NewSlidingWindowLimiter(maxN, window, mapCap) + case "postgres": + if db == nil { + panic("ratelimit.NewLimiter: backend=postgres requires a non-nil *sql.DB (config.Validate should have caught this earlier)") + } + return NewPostgresSlidingWindowLimiter(db, maxN, window) + default: + // Defensive — config.Validate() rejects anything else at + // startup. Reaching this branch implies a coding error in a + // future call site that bypasses validation. + panic(fmt.Sprintf("ratelimit.NewLimiter: unknown backend %q (must be memory or postgres)", backend)) + } +} diff --git a/internal/ratelimit/postgres_gc.go b/internal/ratelimit/postgres_gc.go new file mode 100644 index 0000000..23f49ae --- /dev/null +++ b/internal/ratelimit/postgres_gc.go @@ -0,0 +1,71 @@ +// Copyright 2026 certctl LLC. All rights reserved. +// SPDX-License-Identifier: BUSL-1.1 + +package ratelimit + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +// Phase 13 Sprint 13.3 closure (2026-05-14, architecture diligence audit +// ARCH-M1): the scheduler-invoked janitor for the postgres-backed +// rate-limit bucket table. Sweeps rows whose updated_at is older than +// the longest configured window any caller uses — these rows can +// never be at-cap (every timestamp inside has aged past the window), +// so dropping them entirely is safe. +// +// The in-memory backend's prune-on-Allow path keeps buckets short- +// lived without a separate sweep; this file is postgres-only. + +// PostgresGC drives the rate_limit_buckets sweep. Constructed from the +// same *sql.DB the limiters use; the scheduler holds it as a value +// satisfying the ratelimit.GarbageCollector interface (mirrors the +// shape of acme.GarbageCollector + sessions.GarbageCollector). +type PostgresGC struct { + db *sql.DB + maxWindow time.Duration +} + +// NewPostgresGC returns a janitor that sweeps rows whose updated_at +// is older than `maxWindow` ago. Pass the longest window any caller +// in the deployment configures (the EST per-principal limiter uses +// 24h today; bump if a new caller introduces a longer window). +// +// maxWindow <= 0 disables the sweep — GarbageCollect becomes a +// no-op. Operator opt-out for sketchpad / single-replica deploys +// that still want the postgres backend (rare; the memory backend is +// the better fit). +func NewPostgresGC(db *sql.DB, maxWindow time.Duration) *PostgresGC { + return &PostgresGC{db: db, maxWindow: maxWindow} +} + +// GarbageCollect deletes every rate_limit_buckets row whose +// updated_at is older than now-maxWindow. Returns the number of +// rows deleted + any error from the DELETE. +// +// Single statement, single round-trip — operates on the +// rate_limit_buckets_updated_at_idx index introduced in migration +// 000046. Idempotent: repeated calls find 0 rows. +func (g *PostgresGC) GarbageCollect(ctx context.Context) (int64, error) { + if g.maxWindow <= 0 { + return 0, nil + } + cutoff := time.Now().Add(-g.maxWindow) + res, err := g.db.ExecContext(ctx, ` + DELETE FROM rate_limit_buckets + WHERE updated_at < $1 + `, cutoff) + if err != nil { + return 0, fmt.Errorf("ratelimit-gc: delete stale buckets: %w", err) + } + n, err := res.RowsAffected() + if err != nil { + // Driver doesn't expose RowsAffected; rare. Don't fail the + // sweep — the delete already ran. + return 0, nil + } + return n, nil +} diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 6f44d66..703d36c 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -103,6 +103,21 @@ type BCLReplayGarbageCollector interface { SweepExpired(ctx context.Context, now time.Time) (int, error) } +// RateLimitGarbageCollector sweeps stale rows from the +// rate_limit_buckets table introduced in migration 000046. Phase 13 +// Sprint 13.3 (ARCH-M1 closure completion) — wired only when +// CERTCTL_RATE_LIMIT_BACKEND=postgres. Concrete impl is +// *ratelimit.PostgresGC. Mirrors the ACMEGarbageCollector + +// SessionGarbageCollector contracts so the scheduler reuses the same +// atomic.Bool + WithTimeout + ticker pattern as the existing GC loops. +// +// Returns the row count to surface via observability logs (matches +// SessionGarbageCollector's shape — the operator wants to see +// "how many buckets did the sweep delete" in steady-state monitoring). +type RateLimitGarbageCollector interface { + GarbageCollect(ctx context.Context) (int64, error) +} + // JobReaperService defines the interface for job timeout reaping used by the scheduler. type JobReaperService interface { ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error @@ -130,6 +145,7 @@ type Scheduler struct { acmeGC ACMEGarbageCollector sessionGC SessionGarbageCollector bclReplayGC BCLReplayGarbageCollector + rateLimitGC RateLimitGarbageCollector jobReaper JobReaperService logger *slog.Logger @@ -149,6 +165,7 @@ type Scheduler struct { jobTimeoutInterval time.Duration acmeGCInterval time.Duration sessionGCInterval time.Duration + rateLimitGCInterval time.Duration // agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose // owning agent has been silent. Bundle C / Audit M-016. Defaults below. agentOfflineJobTTL time.Duration @@ -171,6 +188,7 @@ type Scheduler struct { jobTimeoutRunning atomic.Bool acmeGCRunning atomic.Bool sessionGCRunning atomic.Bool + rateLimitGCRunning atomic.Bool // Graceful shutdown: wait for in-flight work to complete wg sync.WaitGroup @@ -209,6 +227,7 @@ func NewScheduler( jobTimeoutInterval: 10 * time.Minute, acmeGCInterval: 1 * time.Minute, sessionGCInterval: 1 * time.Hour, + rateLimitGCInterval: 5 * time.Minute, // 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent // must miss multiple heartbeats before its in-flight jobs are reaped. agentOfflineJobTTL: 5 * time.Minute, @@ -365,6 +384,29 @@ func (s *Scheduler) SetSessionGCInterval(d time.Duration) { s.sessionGCInterval = d } +// SetRateLimitGarbageCollector wires the Phase 13 Sprint 13.3 rate- +// limit bucket GC. Optional; nil disables the loop (which is the +// correct behavior when CERTCTL_RATE_LIMIT_BACKEND=memory — the +// in-memory backend's prune-on-Allow path keeps buckets short-lived +// without a separate sweep). +// +// Concrete impl is *ratelimit.PostgresGC, constructed in +// cmd/server/main.go only when the postgres backend is selected. +func (s *Scheduler) SetRateLimitGarbageCollector(gc RateLimitGarbageCollector) { + s.rateLimitGC = gc +} + +// SetRateLimitGCInterval configures the interval at which the rate- +// limit GC sweep runs. Default 5m. Wire: +// CERTCTL_RATE_LIMIT_JANITOR_INTERVAL. Zero or negative values are +// ignored. +func (s *Scheduler) SetRateLimitGCInterval(d time.Duration) { + if d <= 0 { + return + } + s.rateLimitGCInterval = d +} + // SetAgentOfflineJobTTL sets the threshold past which a Running job whose // owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016. // Zero or negative values are ignored (the default of 5 minutes is kept). @@ -426,6 +468,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { if s.sessionGC != nil { loopCount++ } + if s.rateLimitGC != nil { + loopCount++ + } s.wg.Add(loopCount) go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }() @@ -457,6 +502,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { if s.sessionGC != nil { go func() { defer s.wg.Done(); s.sessionGCLoop(ctx) }() } + if s.rateLimitGC != nil { + go func() { defer s.wg.Done(); s.rateLimitGCLoop(ctx) }() + } // Signal that all loops are launched close(startedChan) @@ -1247,3 +1295,45 @@ func (s *Scheduler) sessionGCLoop(ctx context.Context) { } } } + +// rateLimitGCLoop runs every rateLimitGCInterval and invokes +// RateLimitGarbageCollector.GarbageCollect, which sweeps stale rows +// from the rate_limit_buckets table introduced in Phase 13 Sprint +// 13.2's migration 000046. +// +// Wired only when CERTCTL_RATE_LIMIT_BACKEND=postgres (the in-memory +// backend's prune-on-Allow path keeps buckets short-lived without a +// separate sweep — cmd/server/main.go skips SetRateLimitGarbageCollector +// for that case so this loop never launches). +// +// Phase 13 Sprint 13.3 closure. The atomic.Bool guard + per-tick +// context.WithTimeout match every other GC loop's pattern. +func (s *Scheduler) rateLimitGCLoop(ctx context.Context) { + ticker := NewJitteredTicker(s.rateLimitGCInterval, DefaultSchedulerJitter) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if !s.rateLimitGCRunning.CompareAndSwap(false, true) { + s.logger.Warn("rate-limit GC sweep still running, skipping tick") + continue + } + s.wg.Add(1) + go func() { + defer s.wg.Done() + defer s.rateLimitGCRunning.Store(false) + // 1-minute timeout matches acme + session GC loops. + opCtx, cancel := context.WithTimeout(ctx, time.Minute) + defer cancel() + if n, err := s.rateLimitGC.GarbageCollect(opCtx); err != nil { + s.logger.Warn("rate-limit gc sweep failed (next tick will retry)", "error", err) + } else if n > 0 { + s.logger.Debug("rate-limit gc swept stale buckets", "rows", n) + } + }() + } + } +}