mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 17:31:30 +00:00
bee47f0318
Closes the production-readiness loop on the ACME surface. After this
commit, certctl ships per-account rate limits + a GC sweeper for
expired ACME state + a kind-driven cert-manager 1.15 integration test
+ a lego-driven RFC conformance harness + a k6 loadtest scenario for
the unauthenticated ACME path.
Architecture:
- Rate limits live in-memory + per-replica. Restart wipes the
counters; orders/hour caps are eventually consistent anyway. A
3-replica certctl-server fleet behind an LB effectively has 3x
the configured throughput per account; persistent rate limiting
is a follow-up if production telemetry shows abuse patterns we
can't catch in a single restart cycle. Per-key + per-action
isolation: ActionNewOrder/acc-1, ActionKeyChange/acc-1, and
ActionChallengeRespond/<challenge-id> are independent buckets.
- GC loop follows the existing scheduler-loop pattern (atomic.Bool
+ sync.WaitGroup; see crlGenerationLoop for shape). Three
independent SQL sweeps per tick (DELETE expired nonces; UPDATE
pending authzs whose expires_at < now() to expired; UPDATE
pending/ready/processing orders whose expires_at < now() to
invalid). Each sweep is a single statement; failures are logged-
and-continued so a failing nonces sweep doesn't block authzs.
Per-sweep 1m timeout bounds a stuck Postgres.
- cert-manager integration test is gated on KIND_AVAILABLE so CI
skips it cleanly (kind is too heavy for per-PR). Operators run
locally via 'make acme-cert-manager-test'; the harness brings up
a fresh cluster each run + tears it down on Cleanup.
- lego conformance harness drives a real ACME client through
register → run → cert-PEM-landed against a hermetic certctl
stack. Catches RFC-shape regressions third-party clients would
hit before they ship.
- k6 ACME-flow scenario hammers the unauthenticated surface
(directory + new-nonce + ARI synthetic-id) at 100 VUs × 5m. JWS-
signed flows are out of scope for k6 (no JWS support); they're
covered by the lego harness above.
What ships:
- internal/api/acme/ratelimit.go (+ ratelimit_test.go: 8 cases —
disable-when-perHour-zero, disable-when-perHour-negative, capacity,
per-key isolation, per-action isolation, refill-over-time, RetryAfter,
concurrent-access with -race + 200 goroutines × 200 calls).
- internal/repository/postgres/acme.go: 4 new methods —
CountActiveOrdersByAccount + GCExpiredNonces + GCExpireAuthorizations
+ GCInvalidateExpiredOrders. Each a single SQL statement.
- internal/service/acme.go: SetRateLimiter + GarbageCollect +
rate-limit gates at 3 entry points (CreateOrder + RotateAccountKey
+ RespondToChallenge) + concurrent-orders gate at CreateOrder.
2 new sentinels (ErrACMERateLimited, ErrACMEConcurrentOrdersExceeded);
5 new GC metrics (gc_runs / gc_run_failures / gc_nonces_reaped /
gc_authzs_expired / gc_orders_invalidated).
- internal/scheduler/scheduler.go: ACMEGarbageCollector interface +
acmeGCRunning atomic.Bool + acmeGCInterval + 2 setters (SetACME-
GarbageCollector + SetACMEGCInterval) + acmeGCLoop following the
crlGenerationLoop shape.
- internal/api/handler/acme.go: writeServiceError gains rateLimited
(429 + RFC 8555 §6.7) + concurrent-orders-exceeded mappings.
- internal/config/config.go: 5 new env vars
(CERTCTL_ACME_SERVER_RATE_LIMIT_ORDERS_PER_HOUR=100,
CERTCTL_ACME_SERVER_RATE_LIMIT_CONCURRENT_ORDERS=5,
CERTCTL_ACME_SERVER_RATE_LIMIT_KEY_CHANGE_PER_HOUR=5,
CERTCTL_ACME_SERVER_RATE_LIMIT_CHALLENGE_RESPONDS_PER_HOUR=60,
CERTCTL_ACME_SERVER_GC_INTERVAL=1m).
- cmd/server/main.go: NewRateLimiter() + SetRateLimiter() at
startup; conditional SetACMEGarbageCollector(acmeService) +
SetACMEGCInterval(cfg.ACMEServer.GCInterval) when Enabled+
GCInterval > 0.
- deploy/test/acme-integration/: kind-config.yaml + cert-manager-
install.sh + clusterissuer-trust-authenticated.yaml +
clusterissuer-challenge.yaml + certificate-test.yaml + conformance-
lego.sh + certmanager_test.go (//go:build integration + KIND_AVAILABLE
gate).
- deploy/test/loadtest/k6/acme_flow.js + README ACME-flows section.
- Makefile: 2 new PHONY targets (acme-cert-manager-test +
acme-rfc-conformance-test).
- docs/acme-server.md: status flipped to Phase 5; Configuration
table grows 5 rows; new 'Phase 5 — operational guidance' section
explaining rate-limit math + GC sweeper semantics + cert-manager
integration + lego conformance + k6 baseline.
Tests:
- 'go vet ./...' clean across the repo.
- 'go test -short -count=1 ./internal/...' green across every
affected package (service / acme / handler / scheduler / repo /
config).
- 'go vet -tags=integration ./deploy/test/acme-integration/' clean
(the integration test compiles cleanly with the build tag).
- The kind/cert-manager harness is gated behind KIND_AVAILABLE so
CI skips by default; operators run locally via 'make acme-cert-
manager-test'.
Engineering history: cowork/WORKSPACE-CHANGELOG.md 'ACME-Server-5'.
160 lines
4.5 KiB
Go
160 lines
4.5 KiB
Go
// Copyright (c) certctl
|
||
// SPDX-License-Identifier: BSL-1.1
|
||
|
||
package acme
|
||
|
||
import (
|
||
"sync"
|
||
"testing"
|
||
"time"
|
||
)
|
||
|
||
// Phase 5 — RateLimiter unit tests.
|
||
|
||
func TestRateLimiter_DisabledWhenPerHourZero(t *testing.T) {
|
||
r := NewRateLimiter()
|
||
for i := 0; i < 10000; i++ {
|
||
if !r.Allow(ActionNewOrder, "acc-1", 0) {
|
||
t.Fatalf("Allow returned false on call %d with perHour=0", i)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_DisabledWhenPerHourNegative(t *testing.T) {
|
||
r := NewRateLimiter()
|
||
if !r.Allow(ActionNewOrder, "acc-1", -5) {
|
||
t.Errorf("Allow returned false with perHour=-5; expected always-allow")
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_BucketCapacity(t *testing.T) {
|
||
// Frozen clock: a fresh bucket has perHour tokens. Drain exactly
|
||
// that many; the next call must return false.
|
||
now := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return now })
|
||
|
||
for i := 0; i < 100; i++ {
|
||
if !r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Fatalf("Allow returned false on call %d (within capacity)", i)
|
||
}
|
||
}
|
||
if r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Errorf("Allow returned true on the 101st call; expected limit hit")
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_PerKeyIsolation(t *testing.T) {
|
||
// Frozen clock — drain acc-1 to zero, then acc-2 should still have
|
||
// a full bucket (separate key).
|
||
now := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return now })
|
||
|
||
for i := 0; i < 100; i++ {
|
||
_ = r.Allow(ActionNewOrder, "acc-1", 100)
|
||
}
|
||
if r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Errorf("acc-1 should be rate-limited")
|
||
}
|
||
if !r.Allow(ActionNewOrder, "acc-2", 100) {
|
||
t.Errorf("acc-2 should be unaffected by acc-1's bucket; expected allow")
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_PerActionIsolation(t *testing.T) {
|
||
// Same key but different actions get different buckets.
|
||
now := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return now })
|
||
|
||
for i := 0; i < 5; i++ {
|
||
_ = r.Allow(ActionKeyChange, "acc-1", 5)
|
||
}
|
||
if r.Allow(ActionKeyChange, "acc-1", 5) {
|
||
t.Errorf("ActionKeyChange should be rate-limited")
|
||
}
|
||
// ActionNewOrder for the same key has its own (empty) bucket.
|
||
if !r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Errorf("ActionNewOrder for same key should be allowed (different bucket)")
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_RefillOverTime(t *testing.T) {
|
||
// Drain bucket; advance the clock; expect tokens replenished.
|
||
current := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return current })
|
||
|
||
for i := 0; i < 100; i++ {
|
||
_ = r.Allow(ActionNewOrder, "acc-1", 100)
|
||
}
|
||
if r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Fatalf("expected limit hit after draining bucket")
|
||
}
|
||
// Advance by 36 seconds: at 100/hour = 100/3600 tokens/sec ≈
|
||
// 0.0278/sec. 36 * 0.0278 = 1.00 tokens — exactly enough for 1
|
||
// more call.
|
||
current = current.Add(36 * time.Second)
|
||
if !r.Allow(ActionNewOrder, "acc-1", 100) {
|
||
t.Errorf("Allow returned false after 36s elapsed; expected ≥1 token replenished")
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_RetryAfter(t *testing.T) {
|
||
now := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return now })
|
||
|
||
// Drain to zero.
|
||
for i := 0; i < 100; i++ {
|
||
_ = r.Allow(ActionNewOrder, "acc-1", 100)
|
||
}
|
||
d := r.RetryAfter(ActionNewOrder, "acc-1", 100)
|
||
// 1 token at 100/hour = 36 seconds.
|
||
if d < 35*time.Second || d > 37*time.Second {
|
||
t.Errorf("RetryAfter = %v, expected ~36s", d)
|
||
}
|
||
// Allow above capacity — RetryAfter returns 0 on a fresh bucket.
|
||
if zero := r.RetryAfter(ActionNewOrder, "acc-fresh", 100); zero != 0 {
|
||
t.Errorf("RetryAfter for fresh bucket = %v, expected 0", zero)
|
||
}
|
||
}
|
||
|
||
func TestRateLimiter_ConcurrentAccess(t *testing.T) {
|
||
// Hammer 200 goroutines × 200 calls each = 40000 calls against a
|
||
// 1000-token bucket; assert no panic, no data race (run with -race),
|
||
// and that no more than 1000 calls succeeded.
|
||
now := time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC)
|
||
r := NewRateLimiter()
|
||
r.SetClock(func() time.Time { return now })
|
||
|
||
var (
|
||
wg sync.WaitGroup
|
||
success int64
|
||
mu sync.Mutex
|
||
)
|
||
for g := 0; g < 200; g++ {
|
||
wg.Add(1)
|
||
go func() {
|
||
defer wg.Done()
|
||
local := int64(0)
|
||
for i := 0; i < 200; i++ {
|
||
if r.Allow(ActionNewOrder, "shared-acc", 1000) {
|
||
local++
|
||
}
|
||
}
|
||
mu.Lock()
|
||
success += local
|
||
mu.Unlock()
|
||
}()
|
||
}
|
||
wg.Wait()
|
||
if success > 1000 {
|
||
t.Errorf("got %d successes, want ≤ 1000 (bucket capacity)", success)
|
||
}
|
||
if success < 1000 {
|
||
t.Errorf("got %d successes, want exactly 1000 (frozen clock, no refill)", success)
|
||
}
|
||
}
|