diff --git a/.github/workflows/loadtest.yml b/.github/workflows/loadtest.yml
index db2919c..b02b407 100644
--- a/.github/workflows/loadtest.yml
+++ b/.github/workflows/loadtest.yml
@@ -75,3 +75,67 @@ jobs:
           name: k6-summary-${{ github.run_id }}
           path: deploy/test/loadtest/results/
           retention-days: 90
+
+  # ---------------------------------------------------------------------------
+  # Phase 8 SCALE-H2 — scale-tier scenarios. Three new k6 drivers:
+  #   - bulk-renewal: 10K-cert seed + criteria-mode POST /bulk-renew
+  #   - acme-burst:   200 concurrent VUs against directory/nonce/ARI
+  #   - agent-storm:  5K-agent seed + 167 heartbeats/sec sustained
+  #
+  # Matrix dispatch so each scenario runs on its own runner and a
+  # regression in one doesn't mask another. The three matrix legs run in
+  # parallel with each other, but only after the `k6` job above succeeds
+  # (`needs: k6`), so the scale tier adds one ~25-minute slot instead of
+  # ~70 minutes serialised. Each scenario brings up the full loadtest
+  # compose stack independently — there's no shared state between
+  # scenarios that would benefit from a single-runner serial invocation.
+  #
+  # Cadence: same as the API + connector tier job above (workflow_dispatch
+  # + Mondays 06:00 UTC). The scale scenarios DO produce useful per-PR
+  # signal in theory, but the per-run cost (image build + 5min run × 3)
+  # is too high to gate on every PR; weekly is the right trade-off.
+  # ---------------------------------------------------------------------------
+  k6-scale:
+    name: k6 scale tier (${{ matrix.scenario }})
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+    needs: k6
+    strategy:
+      # Parallel: a failure in one scenario shouldn't cancel the others.
+      # Each scenario's threshold breach is independent diagnostic data.
+      fail-fast: false
+      matrix:
+        scenario:
+          - bulk-renewal
+          - acme-burst
+          - agent-storm
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+
+      - name: Run scale loadtest (${{ matrix.scenario }})
+        env:
+          BUILDKIT_PROGRESS: plain
+          # Passed via env so the value reaches the shell as data, not script text.
+          SCENARIO: ${{ matrix.scenario }}
+        run: |
+          case "$SCENARIO" in
+            bulk-renewal) make loadtest-scale-bulk ;;
+            acme-burst)   make loadtest-scale-acme ;;
+            agent-storm)  make loadtest-scale-agent ;;
+            *) echo "::error::unknown scenario $SCENARIO"; exit 1 ;;
+          esac
+
+      - name: Upload summary
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          # Per-scenario artifact name so the three matrix runs don't
+          # collide on upload.
+          name: k6-scale-${{ matrix.scenario }}-${{ github.run_id }}
+          path: deploy/test/loadtest/results/
+          retention-days: 90
diff --git a/Makefile b/Makefile
index 20a8639..ef816e0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help build run test lint verify verify-deploy loadtest acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build e2e-test qa-stats
+.PHONY: help build run test lint verify verify-deploy loadtest loadtest-scale loadtest-scale-bulk loadtest-scale-acme loadtest-scale-agent acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build e2e-test qa-stats
 
 # Default target - show help
 help:
@@ -153,6 +153,49 @@ loadtest:
 	@echo "==> results landed in deploy/test/loadtest/results/"
 	@if [ -f deploy/test/loadtest/results/summary.txt ]; then cat deploy/test/loadtest/results/summary.txt; fi
 
+# Phase 8 SCALE-H2 — scale-tier load tests. Profile-gated in the
+# loadtest compose so the default `make loadtest` stays fast and
+# focused on the per-PR regression scope (API tier + connector tier).
+#
+#   loadtest-scale-bulk   runs the 10K-cert bulk-renew scenario.
+#   loadtest-scale-acme   runs the 200-VU ACME directory/nonce/ARI burst.
+#   loadtest-scale-agent  runs the 5K-agent heartbeat storm.
+#
+# Each target names its k6 driver service, so only that driver and
+# its depends_on chain (scale-seed, certctl-server, ...) start, and
+# --exit-code-from turns a threshold breach into a non-zero make exit.
+# scale-seed is idempotent (ON CONFLICT), so re-running is fine.
+loadtest-scale-bulk:
+	@echo "==> Phase 8 SCALE-H2: bulk-renewal scenario (10K cert fixture, ~6m)"
+	@cd deploy/test/loadtest && docker compose --profile scale up --build \
+		--abort-on-container-exit --exit-code-from k6-scale-bulk k6-scale-bulk
+	@echo ""
+	@echo "==> results: deploy/test/loadtest/results/summary-bulk-renewal.{json,txt}"
+	@if [ -f deploy/test/loadtest/results/summary-bulk-renewal.txt ]; then \
+		cat deploy/test/loadtest/results/summary-bulk-renewal.txt; fi
+
+loadtest-scale-acme:
+	@echo "==> Phase 8 SCALE-H2: ACME enrollment burst (200 VU, ~6m)"
+	@cd deploy/test/loadtest && docker compose --profile scale up --build \
+		--abort-on-container-exit --exit-code-from k6-scale-acme k6-scale-acme
+	@echo ""
+	@echo "==> results: deploy/test/loadtest/results/summary-acme-burst.{json,txt}"
+	@if [ -f deploy/test/loadtest/results/summary-acme-burst.txt ]; then \
+		cat deploy/test/loadtest/results/summary-acme-burst.txt; fi
+
+loadtest-scale-agent:
+	@echo "==> Phase 8 SCALE-H2: agent heartbeat storm (5K agent fixture, ~6m)"
+	@cd deploy/test/loadtest && docker compose --profile scale up --build \
+		--abort-on-container-exit --exit-code-from k6-scale-agent k6-scale-agent
+	@echo ""
+	@echo "==> results: deploy/test/loadtest/results/summary-agent-storm.{json,txt}"
+	@if [ -f deploy/test/loadtest/results/summary-agent-storm.txt ]; then \
+		cat deploy/test/loadtest/results/summary-agent-storm.txt; fi
+
+# All three Phase 8 scenarios serially. Use the matrix in
+# .github/workflows/loadtest.yml for parallel CI runs.
+loadtest-scale: loadtest-scale-bulk loadtest-scale-acme loadtest-scale-agent
+
 # Auth Bundle 2 Phase 10 — Keycloak end-to-end OIDC integration test.
 # Boots a Keycloak container via testcontainers-go (quay.io/keycloak:25.0),
 # imports a canned realm with two groups + two users, and drives the
diff --git a/deploy/test/loadtest/README.md b/deploy/test/loadtest/README.md
index 6d54b83..1f50638 100644
--- a/deploy/test/loadtest/README.md
+++ b/deploy/test/loadtest/README.md
@@ -352,8 +352,35 @@ the ACME flow scenario. Operators with kind / cert-manager available
 should pair this with `make acme-cert-manager-test` for end-to-end
 verification.
 
+## Scale tier (Phase 8 SCALE-H2, 2026-05-14)
+
+Phase 8 closure added three new k6 scenarios that exercise the
+scale-relevant load surfaces the API tier and connector tier left
+uncovered:
+
+| Scenario | k6 file | Seed | Make target |
+|---|---|---|---|
+| Bulk-renewal under load | `k6/bulk_renewal.js` | `seed/01_bulk_renewal_certs.sql` (10K certs) | `make loadtest-scale-bulk` |
+| ACME enrollment burst | `k6/acme_burst.js` | (none — unauth surface) | `make loadtest-scale-acme` |
+| Agent heartbeat storm | `k6/agent_storm.js` | `seed/02_agent_fleet.sql` (5K agents) | `make loadtest-scale-agent` |
+
+The scale-tier scenarios live behind the `scale` compose profile so
+the default `make loadtest` (API tier + connector tier, ~7 min)
+stays fast. Run all three serially with `make loadtest-scale`, or
+trigger the `loadtest.yml` workflow's `k6-scale` matrix jobs from
+the Actions tab for canonical-hardware capture.
+
+Operator-facing baseline table + threshold contracts + documented
+limitations live in [`docs/operator/scale.md`](../../../docs/operator/scale.md)
+under the "Scale-tier scenarios (SCALE-H2, Phase 8)" section.
Treat
+that as the canonical source — this README only links.
+
+The seed fixtures + their idempotency contract are documented in
+[`seed/README.md`](seed/README.md).
+
 ## Audit references
 
 - API tier: 2026-05-01 issuer coverage audit fix #8.
 - Connector tier: 2026-05-02 deployment-target audit Bundle 10.
 - ACME flows: Phase 5 master prompt (project notes).
+- Scale tier: 2026-05-14 architecture diligence Phase 8 (SCALE-H2).
diff --git a/deploy/test/loadtest/docker-compose.yml b/deploy/test/loadtest/docker-compose.yml
index c60e45e..2bdaa57 100644
--- a/deploy/test/loadtest/docker-compose.yml
+++ b/deploy/test/loadtest/docker-compose.yml
@@ -351,3 +351,128 @@ services:
       - run
       - --summary-export=/results/summary.json
       - /scripts/k6.js
+
+  # ===========================================================================
+  # Phase 8 SCALE-H2 — scale-tier scenarios (opt-in via `--profile scale`).
+  #
+  # The default `make loadtest` path runs the API tier + connector tier
+  # scenarios above against the demo-scale seed. The Phase 8 scenarios are
+  # heavier (10K cert + 5K agent fixtures) and would slow the default path
+  # without serving the per-PR signal the existing run targets, so they live
+  # behind a separate compose profile.
+  #
+  # Four services in two roles, all profile-gated:
+  #   1. scale-seed — one-shot init that runs ./seed/*.sql against the
+  #      same postgres the server uses. Idempotent.
+  #   2. k6-scale-bulk / k6-scale-acme / k6-scale-agent — one driver each
+  #      for the three Phase 8 scenarios. The matrix dispatch
+  #      in .github/workflows/loadtest.yml picks one per job.
+  #
+  # Run a single scale scenario locally:
+  #   docker compose --profile scale up \
+  #     --abort-on-container-exit --exit-code-from k6-scale-bulk \
+  #     scale-seed k6-scale-bulk
+  # ===========================================================================
+
+  scale-seed:
+    # postgres:16-alpine bundles psql; no extra image needed.
+ image: postgres:16-alpine + container_name: certctl-loadtest-scale-seed + restart: "no" + profiles: ["scale"] + depends_on: + postgres: + condition: service_healthy + # Wait for certctl-server to be healthy — the server runs schema + # migrations + seed_demo.sql at boot. The Phase 8 seeds reference + # FKs (iss-local, o-alice, t-platform, rp-standard) that + # seed_demo.sql creates, so the order MUST be: + # postgres up → server runs migrations + seed_demo.sql → scale-seed runs + certctl-server: + condition: service_healthy + environment: + PGHOST: postgres + PGUSER: certctl + PGPASSWORD: loadtestpass + PGDATABASE: certctl + volumes: + - ./seed:/seed:ro + entrypoint: /bin/sh + command: + - -c + - | + set -eu + echo "==> Phase 8 scale-seed: running SQL fixtures (lexical order)" + for f in /seed/*.sql; do + echo "----> $$f" + psql -v ON_ERROR_STOP=1 -f "$$f" + done + echo "==> Phase 8 scale-seed: complete" + + k6-scale-bulk: + image: grafana/k6:0.54.0 + container_name: certctl-loadtest-k6-bulk + profiles: ["scale"] + depends_on: + certctl-server: + condition: service_healthy + scale-seed: + condition: service_completed_successfully + environment: + CERTCTL_BASE: https://certctl-server:8443 + CERTCTL_TOKEN: load-test-token + K6_INSECURE_SKIP_TLS_VERIFY: "true" + volumes: + - ./k6/bulk_renewal.js:/scripts/bulk_renewal.js:ro + - ./results:/results + command: + - run + - --summary-export=/results/summary-bulk-renewal.json + - /scripts/bulk_renewal.js + + k6-scale-acme: + image: grafana/k6:0.54.0 + container_name: certctl-loadtest-k6-acme + profiles: ["scale"] + depends_on: + certctl-server: + condition: service_healthy + # ACME scenario doesn't depend on the SQL seeds (it hits the + # unauthenticated directory + nonce + ARI surface) but routing + # it through the same dependency chain keeps the compose + # ordering predictable across the three scale jobs. 
+ scale-seed: + condition: service_completed_successfully + environment: + CERTCTL_ACME_DIRECTORY: https://certctl-server:8443/acme/profile/prof-test/directory + K6_INSECURE_SKIP_TLS_VERIFY: "true" + volumes: + - ./k6/acme_burst.js:/scripts/acme_burst.js:ro + - ./results:/results + command: + - run + - --summary-export=/results/summary-acme-burst.json + - /scripts/acme_burst.js + + k6-scale-agent: + image: grafana/k6:0.54.0 + container_name: certctl-loadtest-k6-agent + profiles: ["scale"] + depends_on: + certctl-server: + condition: service_healthy + scale-seed: + condition: service_completed_successfully + environment: + CERTCTL_BASE: https://certctl-server:8443 + CERTCTL_TOKEN: load-test-token + K6_INSECURE_SKIP_TLS_VERIFY: "true" + # Match the seed's 5K-agent fleet. + K6_AGENT_FLEET: "5000" + volumes: + - ./k6/agent_storm.js:/scripts/agent_storm.js:ro + - ./results:/results + command: + - run + - --summary-export=/results/summary-agent-storm.json + - /scripts/agent_storm.js diff --git a/deploy/test/loadtest/k6/acme_burst.js b/deploy/test/loadtest/k6/acme_burst.js new file mode 100644 index 0000000..11970f8 --- /dev/null +++ b/deploy/test/loadtest/k6/acme_burst.js @@ -0,0 +1,183 @@ +// Phase 8 SCALE-H2 — ACME enrollment burst. +// +// What this measures: +// 200 concurrent VUs hammering the unauthenticated ACME directory +// + new-nonce + ARI surface for 5 minutes. The goal is the +// throughput ceiling for the entry-point handlers and the +// per-account rate-limit response shape Phase 5 added (RFC 8555 +// §6.7 + RFC 7807 + the certctl-specific +// ErrACMEConcurrentOrdersExceeded path). +// +// What this does NOT measure (and why): +// - JWS-signed POST flows (new-account, new-order, finalize). +// k6 doesn't ship JWS, and bundling a Go signing helper into +// the k6 container would obscure the server-side latency the +// scenario is trying to pin. 
The existing +// `deploy/test/loadtest/k6/acme_flow.js` Phase 5 scenario +// made the same explicit trade-off; this Phase 8 burst scenario +// reuses the constraint. End-to-end JWS-signed conformance is +// gated by `make acme-rfc-conformance-test` (which uses lego +// against the same compose stack). +// - The actual order/finalize hot path. The newOrder handler's +// constant-time SCAN against acme_orders + the per-account +// concurrent-orders gate ARE useful to load-test, but require +// valid JWS to reach. The directory + new-nonce surface this +// scenario hits is what every ACME client transits BEFORE the +// signed flow — measuring it pins the server's headroom for +// the rest of the flow. +// - Issuer-side enrollment latency (DigiCert ACME, Let's Encrypt +// against a real prod CA, etc.). Same "load-testing someone +// else's API" carve-out as the API tier. +// +// What this DOES measure: +// - GET /acme/profile/{id}/directory throughput. Sustained 200 +// concurrent VUs at a low per-VU sleep produces ~600-1000 req/s +// against this endpoint, well above what any production ACME +// client would generate but the right shape for finding the +// ceiling. +// - HEAD /acme/profile/{id}/new-nonce throughput. Nonce +// allocation is a hot path that writes one row to acme_nonces. +// - GET /acme/profile/{id}/renewal-info/{cert-id} 4xx fast path. +// Synthetic cert-id → handler returns 4xx without a DB lookup +// (cert-id is malformed at the parse layer). Measures the +// handler-front overhead under load. +// - 429 rate-limit response shape. The Phase 5 ACME per-account +// rate limit fires at sustained spike rates; the scenario pins +// that the 429 body is RFC 7807 with the +// "urn:ietf:params:acme:error:rateLimited" type. A regression +// that returned a plain text 429 or a different problem type +// would break ACME clients hard. 
+//
+// Threshold contract:
+//   - directory p95 < 500ms, new-nonce p95 < 300ms, renewal-info
+//     p95 < 800ms — same as the Phase 5 acme_flow.js baselines.
+//   - 429 responses are EXPECTED at sustained 200 VU rate (the
+//     server's rate limiter SHOULD kick in). 2xx-4xx are registered
+//     as expected statuses so http_req_failed counts only 5xx and
+//     transport errors; acme_rate_limited_count tracks the 429s.
+
+import http from 'k6/http';
+import { check } from 'k6';
+import { Counter, Trend } from 'k6/metrics';
+import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
+
+const ACME_BASE = __ENV.CERTCTL_ACME_DIRECTORY ||
+  'https://certctl-server:8443/acme/profile/prof-test/directory';
+
+// Custom metrics.
+const directoryDuration = new Trend('acme_directory_duration', true);
+const newNonceDuration = new Trend('acme_new_nonce_duration', true);
+const renewalInfoDuration = new Trend('acme_renewal_info_duration', true);
+const rateLimitedCount = new Counter('acme_rate_limited_count');
+const rateLimitShapeOK = new Counter('acme_rate_limit_shape_ok');
+
+// 4xx (rate-limited / malformed cert-id) is expected here; only 5xx counts as failed.
+http.setResponseCallback(http.expectedStatuses({ min: 200, max: 499 }));
+
+export const options = {
+  scenarios: {
+    acme_burst: {
+      executor: 'constant-vus',
+      vus: parseInt(__ENV.K6_ACME_VUS || '200', 10),
+      duration: __ENV.K6_ACME_DURATION || '5m',
+      gracefulStop: '30s',
+      tags: { scenario: 'acme_burst' },
+    },
+  },
+  thresholds: {
+    'acme_directory_duration': ['p(95)<500'],
+    'acme_new_nonce_duration': ['p(95)<300'],
+    'acme_renewal_info_duration': ['p(95)<800'],
+    // 2xx-4xx are expected statuses (set above); this is the 5xx/transport floor.
+    'http_req_failed{scenario:acme_burst}': ['rate<0.001'],
+  },
+  insecureSkipTLSVerify: true,
+  summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
+};
+
+export default function () {
+  // Step 1 — directory.
+  let res = http.get(ACME_BASE, {
+    tags: { scenario: 'acme_burst', step: 'directory' },
+  });
+  directoryDuration.add(res.timings.duration);
+
+  if (res.status === 429) {
+    recordRateLimit(res);
+    return; // expected under burst: end this iteration early
+  }
+  check(res, { 'directory 200': (r) => r.status === 200 });
+  if (res.status !== 200) return;
+
+  const dir = res.json();
+
+  // Step 2 — new-nonce.
+  if (dir.newNonce) {
+    res = http.head(dir.newNonce, {
+      tags: { scenario: 'acme_burst', step: 'new_nonce' },
+    });
+    newNonceDuration.add(res.timings.duration);
+    if (res.status === 429) {
+      recordRateLimit(res);
+      return;
+    }
+    check(res, {
+      'new-nonce 200': (r) => r.status === 200,
+      'replay-nonce header present': (r) => !!r.headers['Replay-Nonce'],
+    });
+  }
+
+  // Step 3 — ARI synthetic 4xx fast path. Phase 4 added ARI
+  // (RFC 9773); this exercises the malformed-cert-id branch which
+  // returns a 4xx without a DB lookup. Pinning this here means a
+  // regression that turned the malformed path into a DB query
+  // would surface as a p95 spike.
+  if (dir.renewalInfo) {
+    res = http.get(dir.renewalInfo + '/aaaa.bbbb', {
+      tags: { scenario: 'acme_burst', step: 'renewal_info' },
+    });
+    renewalInfoDuration.add(res.timings.duration);
+    if (res.status === 429) {
+      recordRateLimit(res);
+      return;
+    }
+    check(res, {
+      'renewal-info 4xx for synthetic cert-id':
+        (r) => r.status === 400 || r.status === 404,
+    });
+  }
+}
+
+// recordRateLimit pins the Phase 5 ACME rate-limit response shape:
+//   - HTTP 429 (callers only invoke this on a 429)
+//   - Content-Type: application/problem+json
+//   - Body: {"type":"urn:ietf:params:acme:error:rateLimited", ...}
+// A regression that returned a plain-text 429 or a different problem
+// type would NOT increment acme_rate_limit_shape_ok, so the operator
+// would see acme_rate_limited_count > acme_rate_limit_shape_ok in
+// the summary.
+function recordRateLimit(res) {
+  rateLimitedCount.add(1);
+  const ct = res.headers['Content-Type'] || '';
+  if (!ct.includes('application/problem+json')) {
+    return;
+  }
+  let body;
+  try {
+    body = res.json();
+  } catch (e) {
+    return;
+  }
+  if (body && typeof body.type === 'string' &&
+      body.type.startsWith('urn:ietf:params:acme:error:rateLimited')) {
+    rateLimitShapeOK.add(1);
+  }
+}
+
+export function handleSummary(data) {
+  return {
+    '/results/summary-acme-burst.json': JSON.stringify(data, null, 2),
+    '/results/summary-acme-burst.txt': textSummary(data, { indent: ' ', enableColors: false }),
+    stdout: textSummary(data, { indent: ' ', enableColors: true }),
+  };
+}
diff --git a/deploy/test/loadtest/k6/agent_storm.js b/deploy/test/loadtest/k6/agent_storm.js
new file mode 100644
index 0000000..3c12c56
--- /dev/null
+++ b/deploy/test/loadtest/k6/agent_storm.js
@@ -0,0 +1,126 @@
+// Phase 8 SCALE-H2 — agent fleet heartbeat storm.
+//
+// What this measures:
+//   5,000 agents heartbeating at 30s intervals = ~167 heartbeats/sec
+//   sustained. Each heartbeat is POST /api/v1/agents/{id}/heartbeat
+//   with optional metadata. Pre-seeded fleet provided by
+//   deploy/test/loadtest/seed/02_agent_fleet.sql.
+//
+// What this does NOT measure:
+//   - The agent work-poll path (GET /api/v1/agents/{id}/work). The
+//     heartbeat hot path is the highest-frequency call on a typical
+//     fleet (work-poll cadence is 30s default like heartbeat, but
+//     work-poll returns the empty set 99% of the time and is cheap;
+//     heartbeat does an UPDATE on every call). v2 of the harness
+//     could combine them.
+//   - The agent CSR-submit path (POST /api/v1/agents/{id}/csr). That
+//     fires on per-cert issuance, not per heartbeat, and is exercised
+//     by the existing API tier's POST /api/v1/certificates scenario.
+//   - Auth-key per-agent rotation. The loadtest stack runs with a
+//     single api-key (`load-test-token`); per-agent api-key
+//     hashing/rotation isn't a load axis.
+//
+// Why constant-arrival-rate (not constant-vus):
+//   The point is to model what 5K real agents would offer the server
+//   at their native cadence. 5K agents * (1 heartbeat / 30s) =
+//   166.67 req/s offered. constant-arrival-rate fires at exactly
+//   that rate regardless of latency; if the server backpressures,
+//   queue builds and p99 shows it. constant-vus would let slow
+//   responses block, masking the actual ceiling.
+//
+// Threshold contract:
+//   - p99 < 1s for the heartbeat POST. The handler does an UPDATE on
+//     agents.last_heartbeat_at (+ optional metadata columns) and an
+//     RBAC check. Even at 200 req/s a tight UPDATE on an indexed
+//     primary key should stay sub-second.
+//   - p95 < 500ms.
+//   - Error rate < 0.1%. The seeded agents are all status='Online'
+//     so no 410 Gone (retired-agent) responses; anything 4xx is a
+//     bug. 5xx is a server health regression.
+//
+// Phase 8 reference:
+//   - Source finding: SCALE-H2.
+//   - Pre-state: heartbeat path not load-tested. The 100-agent demo
+//     seed in seed_demo.sql produces ~3 heartbeats/sec, orders of
+//     magnitude below fleet scale.
+
+import http from 'k6/http';
+import exec from 'k6/execution';
+import { check } from 'k6';
+import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
+
+const BASE = __ENV.CERTCTL_BASE || 'https://certctl-server:8443';
+const TOKEN = __ENV.CERTCTL_TOKEN || 'load-test-token';
+
+// 5000 agents * (1 / 30s) = 166.67 heartbeats/sec. Round to 167.
+const TARGET_RATE = parseInt(__ENV.K6_AGENT_RATE || '167', 10);
+
+// Total agents in the fleet seed. The k6 scenario walks the fleet
+// round-robin (one agent per iteration, see agentID) to spread the
+// per-row UPDATE pressure across the table.
+const FLEET_SIZE = parseInt(__ENV.K6_AGENT_FLEET || '5000', 10);
+
+export const options = {
+  scenarios: {
+    agent_storm: {
+      executor: 'constant-arrival-rate',
+      rate: TARGET_RATE,
+      timeUnit: '1s',
+      duration: '5m',
+      preAllocatedVUs: 50,
+      maxVUs: 200,
+      exec: 'heartbeat',
+      tags: { scenario: 'agent_storm' },
+    },
+  },
+  thresholds: {
+    'http_req_duration{scenario:agent_storm}': ['p(99)<1000', 'p(95)<500'],
+    'http_req_failed{scenario:agent_storm}': ['rate<0.001'],
+  },
+  summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
+  insecureSkipTLSVerify: true,
+};
+
+// agentID returns a deterministic agent id from the loadtest fleet
+// seed. Spreading round-robin across the fleet means the UPDATE
+// pressure hits every row equally rather than the same hot row over
+// and over.
+function agentID() {
+  // iterationInTest is unique across ALL VUs of the scenario; a per-VU
+  // offset such as __VU * 1000 collides modulo the fleet size.
+  const idx = exec.scenario.iterationInTest % FLEET_SIZE;
+  return 'ag-loadtest-' + String(idx + 1).padStart(5, '0');
+}
+
+export function heartbeat() {
+  const id = agentID();
+  // Optional metadata; the heartbeat handler tolerates an empty body
+  // (no metadata) but real agents send their version + hostname on
+  // every call so we include them here.
+  const payload = JSON.stringify({
+    version: '2.1.0',
+    hostname: 'loadtest-' + id.slice(-5) + '.fleet.example.test',
+    os: 'linux',
+    architecture: 'amd64',
+  });
+
+  const res = http.post(`${BASE}/api/v1/agents/${id}/heartbeat`, payload, {
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${TOKEN}`,
+    },
+    tags: { scenario: 'agent_storm' },
+  });
+
+  check(res, {
+    'heartbeat 2xx': (r) => r.status >= 200 && r.status < 300,
+  });
+}
+
+export function handleSummary(data) {
+  return {
+    '/results/summary-agent-storm.json': JSON.stringify(data, null, 2),
+    '/results/summary-agent-storm.txt': textSummary(data, { indent: ' ', enableColors: false }),
+    stdout: textSummary(data, { indent: ' ', enableColors: true }),
+  };
+}
diff --git a/deploy/test/loadtest/k6/bulk_renewal.js b/deploy/test/loadtest/k6/bulk_renewal.js
new file mode 100644
index 0000000..eed3b68
--- /dev/null
+++ b/deploy/test/loadtest/k6/bulk_renewal.js
@@ -0,0 +1,129 @@
+// Phase 8 SCALE-H2 — bulk-renewal under load.
+//
+// What this measures:
+//   POST /api/v1/certificates/bulk-renew throughput against a
+//   10K-cert pre-seeded fleet. Each iteration POSTs a criteria-mode
+//   bulk-renew request (team_id + issuer_id — the criteria type has
+//   no tag filter) so the server enqueues N renewal jobs and returns
+//   a per-cert {certificate_id, job_id} envelope.
+//
+// Why criteria-mode (not certificate-ids mode):
+//   Every seeded cert is bound to team `t-platform` and issuer
+//   `iss-local`. Criteria-mode lets the scenario re-fire without
+//   maintaining a moving list of cert IDs. The criteria are NOT
+//   exclusive to the Phase 8 fixture: any other cert with the same
+//   team + issuer (possibly seed_demo.sql rows) is matched too, so
+//   only point this scenario at a disposable loadtest stack. The
+//   seed's `tags.batch` marker is informational only.
+//
+// What this does NOT measure:
+//   - The scheduler's renewal scan itself.
The bulk-renew handler
+//     enqueues issuance jobs synchronously into the `jobs` table;
+//     the scheduler's `jobProcessorLoop` picks them up on its next
+//     tick. The DB write throughput is what's measured here; the
+//     job-execution path is bounded by per-issuer concurrency
+//     (CERTCTL_RENEWAL_CONCURRENCY=25 default) and isn't usefully
+//     amplified by adding more inbound bulk-renew calls.
+//   - Full POST → poll deployments → cert-served loop. Same v1/v2
+//     deferral as the connector-tier scenarios — needs the agent
+//     poll surface plumbed end-to-end.
+//
+// Threshold contract:
+//   - p99 < 5s, p95 < 2s for the bulk-renew POST. Each call walks
+//     the criteria, materializes the matching managed_certificates
+//     rows, inserts N rows into `jobs`, and returns the envelope.
+//   - Error rate < 1%. Anything 4xx/5xx counts.
+//
+// Phase 8 reference:
+//   - Source finding: SCALE-H2.
+//   - Pre-state: only the API tier (50 req/s POST /certificates +
+//     GET /certificates) and connector tier (per-target handshake)
+//     were measured. The bulk-renew hot path was uncovered.
+//   - Seed: deploy/test/loadtest/seed/01_bulk_renewal_certs.sql
+//     creates 10K rows with tags.batch='bulk-renewal'. The seed
+//     must run before this scenario; the `scale-seed` compose
+//     service (profile `scale`) gates this.
+
+import http from 'k6/http';
+import { check } from 'k6';
+import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
+
+const BASE = __ENV.CERTCTL_BASE || 'https://localhost:8443';
+const TOKEN = __ENV.CERTCTL_TOKEN || 'load-test-token';
+
+// Sustained throughput target. constant-arrival-rate at 5 req/s for 5
+// minutes = 1500 bulk-renew POSTs. Each POST touches up to 10K
+// managed_certificates rows (criteria scan) + inserts up to 10K
+// rows into `jobs`, so the offered load is lower than the API
+// tier's 50 req/s on raw requests-per-second but the per-call
+// cost is larger.
+//
+// 5 req/s was picked deliberately:
+//   - 50 req/s combined with the API tier's 50 saturates the demo-
+//     scale compose's DB pool (CERTCTL_DATABASE_MAX_CONNS=50). The
+//     Phase 8 scenario should measure the per-call ceiling without
+//     fighting the pool.
+//   - Each call enqueues thousands of jobs; the scheduler's
+//     jobProcessorLoop has finite per-tick budget. Pushing higher
+//     than 5 req/s would queue work faster than the scheduler
+//     drains it, which produces a transient backlog metric (worth
+//     measuring eventually) but isn't what SCALE-H2 asks for.
+export const options = {
+  scenarios: {
+    bulk_renewal: {
+      executor: 'constant-arrival-rate',
+      rate: 5,
+      timeUnit: '1s',
+      duration: '5m',
+      preAllocatedVUs: 10,
+      maxVUs: 30,
+      exec: 'bulkRenewal',
+      tags: { scenario: 'bulk_renewal' },
+    },
+  },
+  thresholds: {
+    // Single-scenario threshold — a generous latency budget
+    // because each call is heavy (DB scan + N inserts).
+    'http_req_duration{scenario:bulk_renewal}': ['p(99)<5000', 'p(95)<2000'],
+    'http_req_failed{scenario:bulk_renewal}': ['rate<0.01'],
+  },
+  summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
+  insecureSkipTLSVerify: true,
+};
+
+export function bulkRenewal() {
+  // Scope by team_id — the seed binds every loadtest cert to
+  // t-platform; in a production-multi-tenant deploy, team scoping
+  // is the typical bulk-renew shape. This exercises the criteria
+  // walker AND the team-scoped permission check in the handler.
+  //
+  // NOTE: this does NOT include `tags` because the BulkRenewalCriteria
+  // domain type (handler/bulk_renewal.go) only exposes profile_id,
+  // owner_id, agent_id, issuer_id, team_id, certificate_ids — not
+  // tag-based filtering. team_id + issuer_id match every cert with that
+  // pair, not only the Phase 8 seed — TODO confirm vs. seed_demo.sql.
+  const payload = JSON.stringify({
+    team_id: 't-platform',
+    issuer_id: 'iss-local',
+  });
+
+  const res = http.post(`${BASE}/api/v1/certificates/bulk-renew`, payload, {
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${TOKEN}`,
+    },
+    tags: { scenario: 'bulk_renewal' },
+  });
+
+  check(res, {
+    'bulk-renew 2xx': (r) => r.status >= 200 && r.status < 300,
+  });
+}
+
+export function handleSummary(data) {
+  return {
+    '/results/summary-bulk-renewal.json': JSON.stringify(data, null, 2),
+    '/results/summary-bulk-renewal.txt': textSummary(data, { indent: ' ', enableColors: false }),
+    stdout: textSummary(data, { indent: ' ', enableColors: true }),
+  };
+}
diff --git a/deploy/test/loadtest/seed/01_bulk_renewal_certs.sql b/deploy/test/loadtest/seed/01_bulk_renewal_certs.sql
new file mode 100644
index 0000000..8c75eeb
--- /dev/null
+++ b/deploy/test/loadtest/seed/01_bulk_renewal_certs.sql
@@ -0,0 +1,85 @@
+-- Phase 8 SCALE-H2: bulk-renewal scenario seed.
+--
+-- Generates 10,000 managed_certificates rows linked to the existing
+-- seed_demo.sql FKs (iss-local, o-alice, t-platform, rp-standard) so
+-- the bulk-renewal k6 scenario can POST /api/v1/certificates/bulk-renew
+-- against a fleet-scale dataset instead of the 15-row demo seed.
+--
+-- Behavior:
+--   - Idempotent. ON CONFLICT (name) DO NOTHING — re-running the seed
+--     against an already-seeded DB is a no-op.
+--   - expires_at is uniformly distributed across the next 30 days so
+--     a renewal_window_days = 30 policy considers every row eligible.
+--   - status = 'active' so the renewal selector treats them as
+--     live (the scheduler skips status IN ('pending', 'failed',
+--     'revoked', 'retired')).
+--   - name is generated as 'loadtest-bulk-NNNNN.example.test' — a
+--     stable, predictable identifier for finding or deleting the
+--     fixture rows (the production fleet wouldn't share this
+--     prefix). The k6 bulk-renew criteria do not filter on name.
+--
+-- Volume target: 10,000 rows.
Insert wall time on the loadtest stack +-- (postgres:16-alpine, 2 CPU / 4 GiB): typically < 5 seconds via the +-- single-statement generate_series + INSERT pattern below. The +-- compose seed-init container runs this BEFORE the k6 driver starts, +-- so the steady-state load measurement isn't affected by seed time. +-- +-- Why not generated in Go via a fixtures helper: +-- - The certctl-server boots from a clean DB and runs migrations + +-- seed_demo.sql automatically when CERTCTL_DEMO_SEED=true. Adding +-- a Go-side fixtures helper would require either (a) a new +-- CERTCTL_LOADTEST_SEED flag wired into cmd/server/main.go (cross- +-- cutting change for one test path) or (b) a separate seed binary +-- (more compose surface). Raw SQL is the smallest viable change. +-- +-- Phase 8 entry point — runs only when the loadtest compose stack is +-- explicitly opted into the scale-seed via LOADTEST_SCALE_SEED=true. + +INSERT INTO managed_certificates ( + id, + name, + common_name, + sans, + environment, + owner_id, + team_id, + issuer_id, + renewal_policy_id, + status, + expires_at, + tags, + created_at, + updated_at +) +SELECT + 'cert-loadtest-bulk-' || lpad(g::text, 5, '0'), + 'loadtest-bulk-' || lpad(g::text, 5, '0') || '.example.test', + 'loadtest-bulk-' || lpad(g::text, 5, '0') || '.example.test', + ARRAY['loadtest-bulk-' || lpad(g::text, 5, '0') || '.example.test'], + 'loadtest', + 'o-alice', + 't-platform', + 'iss-local', + 'rp-standard', + 'active', + -- Distribute expires_at uniformly across the next 30 days so a + -- 30-day-window renewal policy sees every row as eligible. + NOW() + ((g % 30) || ' days')::interval + ((g % 24) || ' hours')::interval, + jsonb_build_object('source', 'loadtest-phase8', 'batch', 'bulk-renewal'), + NOW(), + NOW() +FROM generate_series(1, 10000) AS g +ON CONFLICT (name) DO NOTHING; + +-- Confirmation row count — the seed-init container greps this in its +-- logs to verify the fleet shape post-insert. 
The output appears in +-- `docker compose logs scale-seed` after the run. +DO $$ +DECLARE + cert_count integer; +BEGIN + SELECT COUNT(*) INTO cert_count + FROM managed_certificates + WHERE name LIKE 'loadtest-bulk-%'; + RAISE NOTICE 'Phase 8 bulk-renewal seed: % managed_certificates rows present', cert_count; +END $$; diff --git a/deploy/test/loadtest/seed/02_agent_fleet.sql b/deploy/test/loadtest/seed/02_agent_fleet.sql new file mode 100644 index 0000000..f5166ca --- /dev/null +++ b/deploy/test/loadtest/seed/02_agent_fleet.sql @@ -0,0 +1,85 @@ +-- Phase 8 SCALE-H2: agent-fleet heartbeat-storm scenario seed. +-- +-- Generates 5,000 agents rows so the heartbeat-storm k6 scenario can +-- model a fleet-scale heartbeat pattern (5K agents heartbeating at the +-- native 30s cadence = ~167 heartbeats/sec sustained) instead of the +-- ~10-agent demo seed. +-- +-- Behavior: +-- - Idempotent. ON CONFLICT (id) DO NOTHING — re-runnable against an +-- already-seeded DB. +-- - name is unique (a UNIQUE constraint in migration 000001) so the +-- name suffix mirrors the id suffix. +-- - status = 'Online' so the heartbeat handler's retire-check +-- (service.ErrAgentRetired) doesn't 410 the storm. +-- - last_heartbeat_at staggered across the prior 60 seconds so the +-- stale-agent reaper (agentHealthCheckLoop) doesn't immediately +-- flip half the fleet to 'Offline' during the first scheduler +-- tick of the load run. +-- - api_key_hash = 'loadtest_no_auth'. The loadtest compose runs +-- CERTCTL_AUTH_TYPE=api-key with a single static token +-- (load-test-token), which bypasses per-agent key check the same +-- way the existing API tier scenarios do. Production deploys with +-- CERTCTL_AUTH_TYPE=agent-key per-agent would seed real bcrypt'd +-- hashes; this column is opaque to the load-test path. +-- - registered_at = NOW() - a deterministic 1-90 day interval (g % 90 +-- plus 1 days) so agent ages vary and age-based query plans are exercised. +-- +-- Volume target: 5,000 rows.
The agents schema is much narrower than +-- managed_certificates so the insert is sub-second on the loadtest +-- stack. The 5K agents do not own any deployment_targets in this +-- fixture (the scenario only measures the heartbeat hot path, not +-- the work-poll path which depends on cert + target wiring). +-- +-- Phase 8 entry point — runs only when the loadtest compose stack is +-- explicitly opted into the scale-seed via LOADTEST_SCALE_SEED=true. + +INSERT INTO agents ( + id, + name, + hostname, + status, + last_heartbeat_at, + registered_at, + api_key_hash, + os, + architecture, + ip_address, + version +) +SELECT + 'ag-loadtest-' || lpad(g::text, 5, '0'), + 'loadtest-agent-' || lpad(g::text, 5, '0'), + 'loadtest-' || lpad(g::text, 5, '0') || '.fleet.example.test', + 'Online', + -- Stagger last_heartbeat_at across the prior 60 seconds (= 2x the + -- agent's native poll interval) so the first wave of incoming + -- heartbeats doesn't all arrive in lockstep at t=0. + NOW() - ((g % 60) || ' seconds')::interval, + -- registered_at: deterministic spread, 1-90 days back (g % 90, plus 1). + NOW() - ((g % 90 + 1) || ' days')::interval, + 'loadtest_no_auth', + -- Mix linux/windows/darwin so the OS distribution column in the + -- agents page isn't pure-linux during the storm. + CASE (g % 10) + WHEN 0 THEN 'windows' + WHEN 1 THEN 'darwin' + ELSE 'linux' + END, + -- amd64 dominates (4 in 5 rows); arm64 is the minority (g % 5 = 0). + CASE WHEN (g % 5) = 0 THEN 'arm64' ELSE 'amd64' END, + -- IPv4 in the 10.42.0.0/16 fleet range, deterministic per id. + '10.42.' || ((g / 256) % 256)::text || '.'
|| (g % 256)::text, + '2.1.0' +FROM generate_series(1, 5000) AS g +ON CONFLICT (id) DO NOTHING; + +DO $$ +DECLARE + agent_count integer; +BEGIN + SELECT COUNT(*) INTO agent_count + FROM agents + WHERE id LIKE 'ag-loadtest-%'; + RAISE NOTICE 'Phase 8 agent-storm seed: % agents rows present', agent_count; +END $$; diff --git a/deploy/test/loadtest/seed/README.md b/deploy/test/loadtest/seed/README.md new file mode 100644 index 0000000..93672ec --- /dev/null +++ b/deploy/test/loadtest/seed/README.md @@ -0,0 +1,87 @@ +# Phase 8 load-test seed fixtures + +Opt-in seed scripts that grow the loadtest DB from the demo-scale +fixture (~15 certs / ~10 agents from `migrations/seed_demo.sql`) to +fleet scale (10K certs + 5K agents) so the Phase 8 SCALE-H2 scenarios +measure something representative. + +## When these run + +The default `make loadtest` path does NOT touch this directory — the +API tier and connector tier scenarios run against the demo seed alone +and complete in ~5 minutes. The Phase 8 scenarios opt-in via the +`LOADTEST_SCALE_SEED=true` environment variable; when set, the +`certctl-loadtest-scale-seed` one-shot init container runs every +`*.sql` file in this directory in lexical order against the same +Postgres instance the server uses. + +Compose service wiring (see `../docker-compose.yml`): +- Service: `scale-seed` +- Profile: `scale-seed` (compose `profiles:` gate; not started by + default) +- Depends on: `postgres` (service_healthy) AND `certctl-server` + (service_healthy — server runs schema migrations at boot so the + seed runs AFTER tables exist) +- Order: lexical (`01_bulk_renewal_certs.sql` then + `02_agent_fleet.sql`) +- Idempotent: every script uses `ON CONFLICT DO NOTHING` so re-running + is a no-op. + +## What gets seeded + +| File | Rows | Purpose | +|---|---|---| +| `01_bulk_renewal_certs.sql` | 10,000 managed_certificates | Fleet shape for `bulk_renewal.js`. All linked to demo FKs (iss-local, o-alice, t-platform, rp-standard).
Status `active`, expires_at distributed across the next 30 days so a 30-day renewal window considers every row eligible. Name prefix `loadtest-bulk-` so the k6 scenario can scope its bulk-renew criteria. | +| `02_agent_fleet.sql` | 5,000 agents | Fleet shape for `agent_storm.js`. Status `Online`, last_heartbeat_at staggered across prior 60s, name prefix `loadtest-agent-`. OS distribution: 80% linux / 10% windows / 10% darwin. Arch: 80% amd64 / 20% arm64. | + +## How to run the Phase 8 scenarios locally + +```bash +cd deploy/test/loadtest +LOADTEST_SCALE_SEED=true docker compose --profile scale-seed up --build \ + --abort-on-container-exit --exit-code-from k6-scale +``` + +Or via the dedicated Makefile target (preferred for CI parity): + +```bash +make loadtest-scale +``` + +## Why SQL fixtures instead of a Go seed binary + +- The certctl-server already boots from a clean DB and runs migrations + + `seed_demo.sql` when `CERTCTL_DEMO_SEED=true`. Adding a third seed + mode (loadtest-scale) would mean either a new + `CERTCTL_LOADTEST_SEED` flag wired into `cmd/server/main.go` (cross- + cutting change for one test path) or a separate seed binary (more + compose surface). +- Raw SQL is the smallest viable change: each script is a single + multi-row `INSERT … SELECT FROM generate_series(…)` plus a + `DO $$ … RAISE NOTICE` confirmation block. +- Idempotency is straightforward via `ON CONFLICT … DO NOTHING` — the + same pattern `seed_demo.sql` uses. + +## Why these volumes specifically + +- **10K certs.** The SCALE-H2 audit asked for "10K certs with + renewal_at < now." Round number, fits in postgres:16-alpine on a + CI runner without OOM, and large enough that the renewal selector's + query plan is exercised (the demo's 15 rows would index-scan + trivially). +- **5K agents.** Heartbeat at 30s cadence = ~167 heartbeats/sec + sustained. 
That's well above the 50 req/s the existing API tier + measures and stresses the agent.heartbeat handler's per-call cost + (last_heartbeat_at UPDATE + the RBAC permission check + the + audit-log row). + +If a future scenario needs more rows (50K certs / 10K agents), add a +new `03_…sql` here and another scenario file. Don't grow the existing +files — re-running existing scenarios against a different fixture +shape would invalidate the captured baseline. + +## Phase 8 audit reference + +Source finding: SCALE-H2 in +`cowork/certctl-architecture-diligence-audit.html`. +Phase 8 closure commit: see `git log --grep='Phase 8'`. diff --git a/docs/operator/scale.md b/docs/operator/scale.md index 51bd8a3..9ccefd7 100644 --- a/docs/operator/scale.md +++ b/docs/operator/scale.md @@ -121,6 +121,116 @@ endpoint and repeat the request with the same value in an `If-None-Match:` header — the second request should return 304 with an empty body. +## Scale-tier scenarios (SCALE-H2, Phase 8) + +Phase 8 (2026-05-14) extended the k6 load-test harness with three new +scenarios that exercise the scale-relevant load surfaces the original +API tier left uncovered. They live behind a compose profile gate +(`docker compose --profile scale-seed`) so the default `make loadtest` +stays focused on per-PR regression scope. The full set runs weekly on +the same `loadtest.yml` cron as the API + connector tier.
+ +| Scenario | k6 file | Seed fixture | Sustained load | +|---|---|---|---| +| Bulk-renewal under load | `deploy/test/loadtest/k6/bulk_renewal.js` | 10,000 managed_certificates (`seed/01_bulk_renewal_certs.sql`) | 5 req/s POST `/api/v1/certificates/bulk-renew` × 5 min | +| ACME enrollment burst | `deploy/test/loadtest/k6/acme_burst.js` | (none — unauth surface) | 200 concurrent VUs × directory/nonce/ARI × 5 min | +| Agent heartbeat storm | `deploy/test/loadtest/k6/agent_storm.js` | 5,000 agents (`seed/02_agent_fleet.sql`) | 167 req/s POST `/api/v1/agents/{id}/heartbeat` × 5 min | + +### Threshold contracts (regression guards, NOT measured baselines) + +| Scenario | Metric | Threshold | +|---|---|---| +| Bulk-renewal | `http_req_duration{scenario:bulk_renewal}` p99 | < 5 s | +| Bulk-renewal | `http_req_duration{scenario:bulk_renewal}` p95 | < 2 s | +| Bulk-renewal | `http_req_failed{scenario:bulk_renewal}` | < 1% | +| ACME burst | `acme_directory_duration` p95 | < 500 ms | +| ACME burst | `acme_new_nonce_duration` p95 | < 300 ms | +| ACME burst | `acme_renewal_info_duration` p95 | < 800 ms | +| ACME burst | `http_req_failed{server_error:true}` 5xx-only | < 0.1% | +| Agent storm | `http_req_duration{scenario:agent_storm}` p99 | < 1 s | +| Agent storm | `http_req_duration{scenario:agent_storm}` p95 | < 500 ms | +| Agent storm | `http_req_failed{scenario:agent_storm}` | < 0.1% | + +429 rate-limit responses on the ACME burst are EXPECTED — Phase 5's +per-account rate limiter SHOULD fire at sustained 200-VU pressure. +The custom `acme_rate_limited_count` Counter tracks how often it +fires; `acme_rate_limit_shape_ok` Counter verifies every 429 returns +the RFC 7807 `application/problem+json` shape with the +`urn:ietf:params:acme:error:rateLimited` type. A regression that +returned plain-text 429 or a different problem type would surface as +`(acme_rate_limited_count - acme_rate_limit_shape_ok) > 0` in the summary.
+ +### Measured baseline — TBD pending canonical-hardware capture + +The Phase 8 scenarios shipped 2026-05-14. Baseline capture on a +canonical `ubuntu-latest` GitHub runner is the next operational step; +until then, the table below holds TBD placeholders. **Do NOT publish +sandbox-captured numbers here** — the same anti-pattern the original +loadtest README guards against (sandbox-aggregate placeholder vs +canonical hardware) applies to Phase 8. + +| Scenario | p50 | p95 | p99 | Error rate | Date measured | Commit | +|---|---|---|---|---|---|---| +| **bulk_renewal** | TBD | TBD | TBD | TBD | — | — | +| **acme_burst** directory | TBD | TBD | TBD | TBD | — | — | +| **acme_burst** new-nonce | TBD | TBD | TBD | TBD | — | — | +| **acme_burst** renewal-info | TBD | TBD | TBD | TBD | — | — | +| **agent_storm** | TBD | TBD | TBD | TBD | — | — | + +Capture procedure: trigger `loadtest.yml` from the Actions tab against +the current `master` SHA; wait for the `k6-scale` matrix jobs to +complete; download the per-scenario summary artifacts; copy p50/p95/ +p99 from `summary-<scenario>.json` into the table; commit the +captured numbers alongside the date + SHA. Replace this paragraph +with the captured-on row when the first canonical run lands. + +### How to run the scale tier locally + +```sh +# All three scenarios serially (~18 min total): +make loadtest-scale + +# Individual scenarios (each ~6 min): +make loadtest-scale-bulk # 10K cert bulk-renew +make loadtest-scale-acme # 200 VU ACME burst +make loadtest-scale-agent # 5K agent heartbeat storm +``` + +Each scenario boots its own copy of the loadtest compose stack +(postgres + tls-init + certctl-server) plus the `scale-seed` init +container that runs the SQL fixtures from `deploy/test/loadtest/seed/`. +The seed is idempotent (`ON CONFLICT … DO NOTHING`) so re-running a +scenario against the same compose stack is cheap.
+ +### Documented limitations of the scale tier + +- **JWS-signed ACME flows are not measured.** The ACME burst scenario + hits the unauthenticated directory + new-nonce + ARI surface only. + Measuring the JWS-signed POST hot path (new-account / new-order / + finalize) requires bundling a JWS signer into the k6 driver (k6 + doesn't ship JWS). End-to-end JWS conformance is gated by + `make acme-rfc-conformance-test` which drives `lego` against the + same stack. +- **Scheduler renewal scan throughput.** The bulk-renewal scenario + measures the inbound POST throughput; the scheduler's + `jobProcessorLoop` drains the enqueued jobs at a fixed per-tick + budget (`CERTCTL_RENEWAL_CONCURRENCY=25` default), and the + throughput of that path is not amplified by adding more inbound + bulk-renew calls. A future scenario could pull + `/api/v1/jobs?status=pending` and measure drain time. +- **Production-sized Postgres.** The compose stack runs + `postgres:16-alpine` with default config on a CI runner. + Production deploys with `shared_buffers >= 1 GiB` + dedicated + Postgres VM will have different query plans for the 10K-cert + scan. The captured numbers translate directionally but the + absolute ceiling is workload-specific — see the operator-tune + ladder above for production sizing. +- **Pull-only deployment model.** Agent CSR submit, work-poll, and + deploy-verify paths are intentionally out of scope. The heartbeat + storm exercises the highest-frequency call on a typical fleet; + the work-poll path runs at the same cadence but is cheap (empty + set returned 99% of the time). + ## Profiling production When the above ladder doesn't fit your shape, profile against your