From e292faafc6cd52af5282764f4ef1f148adebec92 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sat, 2 May 2026 19:28:45 +0000 Subject: [PATCH] loadtest: per-connector deploy throughput scenarios + target sidecars + README baseline section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes Bundle 10 of the 2026-05-02 deployment-target coverage audit (see cowork/deployment-target-audit-2026-05-02/RESULTS.md). Pre-fix, deploy/test/loadtest/k6.js drove only the API-tier throughput path (POST /api/v1/certificates + GET /api/v1/certificates) — the operator- facing rate at which an automation client can submit cert requests. The deploy hot path (cert deployed to a target — connector-tier latency) had no benchmarks. Procurement asks "can certctl handle our 5,000-NGINX fleet at 47-day rotation?" and the answer should be a number with methodology, not a claim. This commit ships v1 of the connector-tier loadtest harness: 1. Target-side sidecars added to docker-compose.yml: nginx-target, apache-target, haproxy-target, f5-mock-target. Each daemon serves a starter cert (ECDSA P-256, multi-SAN) written into a shared ./fixtures/target-certs/ volume by a new target-tls-init container. f5-mock-target re-uses the in-tree deploy/test/f5-mock-icontrol/ image (already used by the deploy- vendor-e2e CI job) and generates its own self-signed cert via tls.go::selfSignedCert at startup. 2. Fixture configs committed under deploy/test/loadtest/fixtures/: - nginx.conf — minimal HTTPS server, single 200 OK location. - httpd.conf — self-contained Apache config with the minimum module set + SSL vhost. - haproxy.cfg — minimal SSL-terminating frontend backed by a static "ok" backend. 3. k6 scenarios added (4 new): nginx_handshake, apache_handshake, haproxy_handshake, f5_handshake. Each runs constant-arrival-rate at 100 conns/min for 5 minutes. 
Latency captured by k6's http_req_duration metric covers TCP connect + TLS handshake + tiny HTTP request/response — that's the end-to-end "connection readiness" latency a deploy connector cares about. 4. summary.json gains a connector_tier object with per-target p50/p95/p99/max/avg/error_rate/iterations breakdowns. Operators tracking a connector regression diff connector_tier.<target_type> between runs. Implementation: a new enrichWithConnectorTier helper that reads data.metrics keyed by target_type tag and shallow-merges the breakdown into the summary before serialisation. 5. Threshold contract per target type: - nginx/apache/haproxy: p99 < 3s, p95 < 1s. - f5-mock: p99 < 5s, p95 < 1.5s (iControl REST handler does slightly more work per request than pure TLS termination). - All scenarios: error rate < 1% (k6 default; any 4xx/5xx counts as failed). Any change pushing past these fails the workflow. 6. README documents the methodology + the baseline-number table for the connector tier. Numeric values are TBD placeholders pending the first clean canonical-hardware run; the accompanying commit message in that follow-up captures the methodology line alongside the numbers. Out-of-scope is documented explicitly: - Full agent-driven deploy poll loop (POST cert with target binding → poll deployments endpoint → verify served cert). v2 of the harness — needs the agent registration + target- binding API surface plumbed end-to-end in the loadtest stack. - Kubernetes target via kind-in-docker. kind requires `privileged: true` and is operationally fragile in CI; deferred until Bundle 2 (real k8s.io/client-go) lands and a CI-friendly envtest harness is wired. - Real F5 BIG-IP. CI uses the in-tree f5-mock; real-appliance benchmarking is out of scope. 7. CI workflow .github/workflows/loadtest.yml timeout-minutes bumped from 15 to 25. The harness now boots four additional target sidecars before the k6 run; their healthchecks add ~30-60s. 
The k6 scenarios themselves are still 5 minutes (run in parallel, not serially). 25 minutes absorbs that plus slow CI runners and cold image caches without letting a stuck container consume the runner indefinitely. Trigger remains workflow_dispatch + cron — sustained 25-minute runs are too slow for per-PR signal. What this connector tier explicitly does NOT measure (documented in the k6.js header + README): - The agent-driven full deploy hot path (v2 follow-up). - K8s target (Bundle 2 dependency). - Real F5 appliance. - Issuer-side throughput (handled by issuer-coverage-audit fix #8). Verified locally: - python3 -c "import yaml; yaml.safe_load(...)" on docker-compose.yml and .github/workflows/loadtest.yml — clean. - node -c on k6.js — clean syntax. - gofmt / go vet on the rest of the tree (no Go diff in this commit). - Manual smoke against docker-compose pending — operator validates on the canonical-hardware first run; if any fixture config is off, fix-up commit lands separately so the methodology change and the numeric baseline have independent reviewability. No Go code changes; this is a loadtest-harness-only commit. Audit reference: cowork/deployment-target-audit-2026-05-02/RESULTS.md Bundle 10. 
--- .github/workflows/loadtest.yml | 10 +- deploy/test/loadtest/.gitignore | 4 + deploy/test/loadtest/README.md | 120 ++++++++++- deploy/test/loadtest/docker-compose.yml | 211 ++++++++++++++++-- deploy/test/loadtest/fixtures/haproxy.cfg | 29 +++ deploy/test/loadtest/fixtures/httpd.conf | 66 ++++++ deploy/test/loadtest/fixtures/nginx.conf | 36 ++++ deploy/test/loadtest/k6.js | 248 +++++++++++++++++++--- 8 files changed, 677 insertions(+), 47 deletions(-) create mode 100644 deploy/test/loadtest/fixtures/haproxy.cfg create mode 100644 deploy/test/loadtest/fixtures/httpd.conf create mode 100644 deploy/test/loadtest/fixtures/nginx.conf diff --git a/.github/workflows/loadtest.yml b/.github/workflows/loadtest.yml index 3c5e94e..3439501 100644 --- a/.github/workflows/loadtest.yml +++ b/.github/workflows/loadtest.yml @@ -37,11 +37,15 @@ jobs: k6: name: k6 throughput run runs-on: ubuntu-latest - # 15-minute hard cap. The harness itself is ~7 minutes (5m run + - # 2m for image build + healthcheck wait); the cap absorbs slow CI + # 25-minute hard cap. Pre-Bundle-10: 15min was enough for the API + # tier alone (~7 minutes total). Post-Bundle-10 the harness boots + # four additional target sidecars (nginx, apache, haproxy, f5-mock) + # before the k6 run; their healthchecks add ~30-60s. The k6 scenarios + # themselves are still 5 minutes (run in parallel with the API + # scenarios, not serially). 25 minutes absorbs that plus slow CI # runners and cold image caches without letting a stuck container # consume the runner indefinitely. - timeout-minutes: 15 + timeout-minutes: 25 steps: - name: Checkout diff --git a/deploy/test/loadtest/.gitignore b/deploy/test/loadtest/.gitignore index ee72a40..414887e 100644 --- a/deploy/test/loadtest/.gitignore +++ b/deploy/test/loadtest/.gitignore @@ -8,3 +8,7 @@ results/* # tls-init bind mount — server cert + key are regenerated on every # fresh run. 
certs/ + +# Bundle 10: target-tls-init bind mount — target sidecar starter cert is +# regenerated on every fresh run alongside the server cert. +fixtures/target-certs/ diff --git a/deploy/test/loadtest/README.md b/deploy/test/loadtest/README.md index 15c4020..222981f 100644 --- a/deploy/test/loadtest/README.md +++ b/deploy/test/loadtest/README.md @@ -155,6 +155,116 @@ The workflow does **not** run per-push. Load tests are minutes long and would not provide useful per-PR signal; per-push pressure goes through `make verify` (which is fast) and the deploy-vendor-e2e job. +## Connector-tier baseline (Bundle 10 of the 2026-05-02 deployment-target audit) + +Bundle 10 extended the harness to cover per-target-type handshake throughput +in addition to the API-tier issuance/list throughput documented above. The +docker-compose stack now boots four target sidecars (nginx, apache, haproxy, +f5-mock) each serving a starter cert from a shared `target-tls-init` +container, and k6 runs four additional scenarios — `nginx_handshake`, +`apache_handshake`, `haproxy_handshake`, `f5_handshake` — at sustained +100 conns/min for 5 minutes against each. + +### What the connector tier measures + +End-to-end TCP connect + TLS handshake + tiny HTTP request/response latency +per target type, tagged via the k6 `target_type` label so summary.json's +`connector_tier` section breaks the numbers out per sidecar: + +```json +{ + "connector_tier": { + "nginx": { "p50": ..., "p95": ..., "p99": ..., "error_rate": ..., "iterations": ... }, + "apache": { ... }, + "haproxy": { ... }, + "f5": { ... } + } +} +``` + +This validates the target sidecar daemons are operational under sustained +connection load. Procurement asks "can certctl's nginx target handle 5,000 +endpoints at 47-day rotation?" — the connector code's correctness is pinned +by per-connector unit tests; **the underlying daemon's connection-rate +ceiling is what these scenarios pin**. 
+ +### What the connector tier explicitly does NOT measure (v1) + +- **The full agent-driven deploy hot path.** v1 measures handshake + throughput against the sidecars directly. v2 of the harness is a + follow-up that POSTs cert requests bound to per-target-type targets, + polls the deployments endpoint until the agent reports complete, and + measures the full POST → poll → cert-served loop. v2 needs the agent + registration + target-binding API surface plumbed end-to-end in the + loadtest stack — meaningful work, but not a blocker for the connection- + rate procurement question. +- **Kubernetes connector.** kind-in-docker requires `privileged: true` + and is operationally fragile in CI. Deferred until Bundle 2 (real + `k8s.io/client-go`) lands and a CI-friendly envtest harness is wired. +- **Real F5 BIG-IP.** The harness uses the in-tree `f5-mock-icontrol` + Go server (already used by the deploy-vendor-e2e CI job). Real F5 + appliance benchmarking is out of scope; operators with a real F5 + vagrant box per `docs/connector-f5.md` can substitute it manually. + +### Threshold contract + +Defined in `k6.js`'s `thresholds` block. Any change pushing past these +fails the test: + +| Target type | p95 | p99 | Error rate | +|---|---|---|---| +| `nginx` | < 1 s | < 3 s | < 1% (global) | +| `apache` | < 1 s | < 3 s | < 1% (global) | +| `haproxy` | < 1 s | < 3 s | < 1% (global) | +| `f5` | < 1.5 s | < 5 s | < 1% (global) | + +f5-mock's threshold is looser because the iControl REST handler does +slightly more work per request (login+upload+install dance the F5 +connector itself drives — not exercised here, but the daemon's request +handler is heavier). 
+ +### Connector-tier captured baseline + +| Target type | p50 | p95 | p99 | Error rate | Iterations | +|---|---|---|---|---|---| +| **nginx** (threshold) | — | < 1 s | < 3 s | < 1% | n/a | +| **nginx** (baseline) | TBD | TBD | TBD | TBD | TBD | +| **apache** (threshold) | — | < 1 s | < 3 s | < 1% | n/a | +| **apache** (baseline) | TBD | TBD | TBD | TBD | TBD | +| **haproxy** (threshold) | — | < 1 s | < 3 s | < 1% | n/a | +| **haproxy** (baseline) | TBD | TBD | TBD | TBD | TBD | +| **f5** (threshold) | — | < 1.5 s | < 5 s | < 1% | n/a | +| **f5** (baseline) | TBD | TBD | TBD | TBD | TBD | + +The `TBD` placeholders are deliberate: do **not** commit numeric values +without running the loadtest on canonical hardware first. Numbers from a +developer laptop are misleading. The first `gh workflow run loadtest.yml` +on a clean GitHub runner captures the baseline; commit the captured numbers +into the table above as a follow-up commit alongside the methodology line. + +**Methodology pinned at baseline capture (canonical hardware):** + +- Hardware: GitHub-hosted `ubuntu-latest` runners (currently 4 vCPU / + 16 GiB / SSD-backed). Operator captures from `gh workflow run loadtest.yml` + to keep the hardware constant across runs. +- Sidecar images: nginx:1.27-alpine, httpd:2.4-alpine, haproxy:2.9-alpine, + in-tree f5-mock-icontrol (built from `deploy/test/f5-mock-icontrol/`). +- Concurrency: 100 conns/min sustained per target type (400 conns/min + total across the four target scenarios + 100 req/s on the API tier). +- Duration: 5 minutes per scenario, 10s stagger between API tier and + connector tier so warmup overlap doesn't skew the first 30 seconds. +- TLS: starter cert from `target-tls-init` (ECDSA P-256, multi-SAN). The + loadtest scenarios connect with `K6_INSECURE_SKIP_TLS_VERIFY=true`. 
+ +To recapture the connector-tier baseline after a tuning commit affecting +target sidecars or the connector code: + +```sh +make loadtest +# Inspect deploy/test/loadtest/results/summary.json for the +# connector_tier object and update the table above. +``` + ## Files in this directory ``` @@ -163,9 +273,15 @@ deploy/test/loadtest/ ├── docker-compose.yml ├── k6.js (the load script) ├── certs/ (gitignored — tls-init writes here) +├── fixtures/ (Bundle 10: target sidecar configs + shared starter cert) +│ ├── nginx.conf +│ ├── httpd.conf +│ ├── haproxy.cfg +│ └── target-certs/ (gitignored — target-tls-init writes here) └── results/ (gitignored — k6 writes summary.{json,txt} here) ``` -## Audit reference +## Audit references -`cowork/issuer-coverage-audit-2026-05-01/RESULTS.md` Top-10 fix #8. +- API tier: `cowork/issuer-coverage-audit-2026-05-01/RESULTS.md` fix #8. +- Connector tier: `cowork/deployment-target-audit-2026-05-02/RESULTS.md` Bundle 10. diff --git a/deploy/test/loadtest/docker-compose.yml b/deploy/test/loadtest/docker-compose.yml index b2115a4..f5102cc 100644 --- a/deploy/test/loadtest/docker-compose.yml +++ b/deploy/test/loadtest/docker-compose.yml @@ -3,26 +3,58 @@ # ============================================================================= # # Spins up a minimal certctl stack and runs a k6 driver against it to capture -# p50 / p95 / p99 latency for the certificate-management API hot path. +# p50 / p95 / p99 latency for the certificate-management API hot path AND +# (Bundle 10 of the 2026-05-02 deployment-target audit) per-target-type +# TCP+TLS handshake throughput against four target sidecars (nginx, apache, +# haproxy, f5-mock). # # Stack: -# 1. postgres — empty database (server runs migrations + seeds at boot) -# 2. certctl-tls-init — one-shot init container; writes self-signed -# server.crt/.key/ca.crt into ./certs (bind mount, -# host-readable so the k6 container can pin against -# it via volumes) -# 3. 
certctl-server — HTTPS API on :8443, demo-seed enabled so the k6 -# script has iss-local + an operator + a team -# ready to reference in CreateCertificate payloads -# 4. k6 — runs k6.js once and exits with the threshold- -# driven exit code (zero on green, non-zero on any -# threshold breach so `make loadtest` surfaces -# regressions as a failed shell command) +# 1. postgres — empty database (server runs migrations + seeds at boot) +# 2. certctl-tls-init — one-shot init container; writes self-signed +# server.crt/.key/ca.crt into ./certs (bind +# mount, host-readable so the k6 container +# can pin against it via volumes) +# 3. certctl-server — HTTPS API on :8443, demo-seed enabled so +# the k6 script has iss-local + an operator +# + a team ready to reference in +# CreateCertificate payloads +# 4. target-tls-init — Bundle 10: shared starter cert+key for the +# four target sidecars (nginx, apache, +# haproxy, f5-mock). Each daemon boots with +# this cert; the loadtest scenarios connect +# at sustained rates to measure handshake +# latency tagged by target_type. +# 5. nginx-target — Bundle 10: HTTPS on internal :443. +# 6. apache-target — Bundle 10: HTTPS on internal :443. +# 7. haproxy-target — Bundle 10: HTTPS on internal :443. +# 8. f5-mock-target — Bundle 10: iControl REST on internal :443 +# + plaintext HTTP on internal :8080. Runs +# the in-tree f5-mock-icontrol image +# (deploy/test/f5-mock-icontrol/). +# 9. k6 — runs k6.js once and exits with the +# threshold-driven exit code (zero on green, +# non-zero on any threshold breach so +# `make loadtest` surfaces regressions as a +# failed shell command). +# +# Out of scope for v1 of the connector-tier harness (Bundle 10): +# - Kubernetes target via kind-in-docker. kind requires `privileged: true` +# and Docker-in-Docker semantics that are operationally fragile in CI; +# the K8s connector loadtest is a follow-up that needs Bundle 2's real +# k8s.io/client-go to land first. 
+# - Full agent-driven deploy poll loop (POST cert → poll deployments → +# verify served cert matches what was deployed). The harness measures +# handshake throughput against the target sidecars directly — that's +# enough to validate the sidecars are operational under load and gives +# procurement a per-target latency number that doesn't depend on the +# agent registration + target-binding API surface being plumbed +# end-to-end in the loadtest stack. # # Usage: make loadtest (from the repo root) # Manual: cd deploy/test/loadtest && docker compose up --abort-on-container-exit --exit-code-from k6 # -# Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md fix #8. +# Audit reference (API tier): cowork/issuer-coverage-audit-2026-05-01/RESULTS.md fix #8. +# Audit reference (connector tier): cowork/deployment-target-audit-2026-05-02/RESULTS.md Bundle 10. # ============================================================================= services: @@ -135,6 +167,138 @@ services: retries: 30 start_period: 60s + # --------------------------------------------------------------------------- + # Bundle 10: target-side TLS bootstrap. Mints a single ECDSA-P256 self- + # signed cert + key into a shared ./fixtures/target-certs/ volume that the + # three target sidecars (nginx, apache, haproxy) mount read-only. f5-mock + # generates its own self-signed cert at startup (see + # deploy/test/f5-mock-icontrol/tls.go) so it doesn't need this volume. + # + # The loadtest scenarios don't care which cert the target serves — only + # that the daemon is up and completing TLS handshakes at the configured + # rate. The starter cert exists so each daemon boots green; once Bundle 2 + # (real K8s client) + agent-driven deploy poll is plumbed in v2 of the + # harness, deploys would overwrite this cert. 
+ # --------------------------------------------------------------------------- + target-tls-init: + image: alpine/openssl:latest + container_name: certctl-loadtest-target-tls-init + restart: "no" + entrypoint: /bin/sh + command: + - -c + - | + set -eu + CERT=/certs/target.crt + KEY=/certs/target.key + PEM=/certs/target.pem + if [ -f "$$CERT" ] && [ -f "$$KEY" ] && [ -f "$$PEM" ]; then + echo "Target TLS cert already present — skipping generation" + else + mkdir -p /certs + openssl req -x509 -newkey ec \ + -pkeyopt ec_paramgen_curve:P-256 \ + -nodes \ + -keyout "$$KEY" \ + -out "$$CERT" \ + -days 365 \ + -subj "/CN=loadtest-target" \ + -addext "subjectAltName=DNS:nginx-target,DNS:apache-target,DNS:haproxy-target,DNS:f5-mock-target,DNS:localhost,IP:127.0.0.1" + # HAProxy expects cert+key concatenated into a single PEM file + # at the path supplied to `bind ... ssl crt <path>`. Build it + # alongside the cert/key pair so the haproxy-target's mount + # works without a per-daemon ENTRYPOINT shim. + cat "$$CERT" "$$KEY" > "$$PEM" + echo "Generated target starter cert (ECDSA-P256, 365d, multi-SAN)" + fi + # World-readable so non-root container users (haproxy uses uid 99, + # apache uses uid 1) can read the key. This is fine for a load-test + # starter cert; production wouldn't do this. + chmod 0644 "$$CERT" "$$KEY" "$$PEM" + volumes: + - ./fixtures/target-certs:/certs + + # --------------------------------------------------------------------------- + # nginx-target. Listens on internal :443 with the starter cert. The + # k6 nginx_handshake scenario connects at 100 conns/min for 5 minutes. 
+ # --------------------------------------------------------------------------- + nginx-target: + image: nginx:1.27-alpine + container_name: certctl-loadtest-nginx + depends_on: + target-tls-init: + condition: service_completed_successfully + volumes: + - ./fixtures/target-certs:/etc/nginx/certs:ro + - ./fixtures/nginx.conf:/etc/nginx/nginx.conf:ro + healthcheck: + test: ["CMD-SHELL", "wget -q --no-check-certificate -O- https://localhost:443/ || exit 1"] + interval: 5s + timeout: 3s + retries: 20 + start_period: 15s + + # --------------------------------------------------------------------------- + # apache-target. Listens on internal :443. The bundled httpd.conf loads + # the minimum module set + a single SSL-terminated vhost. + # --------------------------------------------------------------------------- + apache-target: + image: httpd:2.4-alpine + container_name: certctl-loadtest-apache + depends_on: + target-tls-init: + condition: service_completed_successfully + volumes: + - ./fixtures/target-certs:/usr/local/apache2/conf/certs:ro + - ./fixtures/httpd.conf:/usr/local/apache2/conf/httpd.conf:ro + healthcheck: + test: ["CMD-SHELL", "wget -q --no-check-certificate -O- https://localhost:443/ || exit 1"] + interval: 5s + timeout: 3s + retries: 20 + start_period: 15s + + # --------------------------------------------------------------------------- + # haproxy-target. Listens on internal :443 with SSL termination. The + # haproxy.cfg references /usr/local/etc/haproxy/certs/target.pem which + # target-tls-init writes (cert + key concatenated). 
+ # --------------------------------------------------------------------------- + haproxy-target: + image: haproxy:2.9-alpine + container_name: certctl-loadtest-haproxy + depends_on: + target-tls-init: + condition: service_completed_successfully + volumes: + - ./fixtures/target-certs:/usr/local/etc/haproxy/certs:ro + - ./fixtures/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro + healthcheck: + # HAProxy doesn't ship with wget/curl; use the openssl-based handshake + # check instead. The /dev/null redirect drops the response body so + # large logs don't accumulate over the run. + test: ["CMD-SHELL", "echo Q | openssl s_client -connect localhost:443 -servername localhost 2>/dev/null | grep -q 'BEGIN CERTIFICATE'"] + interval: 5s + timeout: 3s + retries: 20 + start_period: 15s + + # --------------------------------------------------------------------------- + # f5-mock target. Re-uses the in-tree f5-mock-icontrol image (already + # used by the deploy-vendor-e2e CI job). Generates its own self-signed + # cert at startup; listens on internal :443 (HTTPS, iControl REST) and + # :8080 (plaintext HTTP). The k6 f5_handshake scenario hits the + # /healthz endpoint. + # --------------------------------------------------------------------------- + f5-mock-target: + build: ../f5-mock-icontrol + container_name: certctl-loadtest-f5-mock + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/healthz || exit 1"] + interval: 5s + timeout: 3s + retries: 20 + start_period: 15s + # --------------------------------------------------------------------------- # k6 driver. Pinned to a specific version so threshold expressions stay # stable across runs. --insecure-skip-tls-verify because the server cert is @@ -149,10 +313,29 @@ services: depends_on: certctl-server: condition: service_healthy + # Bundle 10: wait for the four target sidecars to be healthy before + # firing the connector-tier scenarios. 
Saves the operator from + # spurious "connection refused" errors during the first ~15s of the + # run while target daemons are coming up. + nginx-target: + condition: service_healthy + apache-target: + condition: service_healthy + haproxy-target: + condition: service_healthy + f5-mock-target: + condition: service_healthy environment: CERTCTL_BASE: https://certctl-server:8443 CERTCTL_TOKEN: load-test-token K6_INSECURE_SKIP_TLS_VERIFY: "true" + # Bundle 10: per-target sidecar URLs the connector-tier scenarios + # connect to. Internal docker-compose DNS — k6 resolves these via + # the default user network's resolver. + NGINX_TARGET_URL: https://nginx-target:443 + APACHE_TARGET_URL: https://apache-target:443 + HAPROXY_TARGET_URL: https://haproxy-target:443 + F5_TARGET_URL: https://f5-mock-target:443 volumes: - ./k6.js:/scripts/k6.js:ro - ./results:/results diff --git a/deploy/test/loadtest/fixtures/haproxy.cfg b/deploy/test/loadtest/fixtures/haproxy.cfg new file mode 100644 index 0000000..1d2df6a --- /dev/null +++ b/deploy/test/loadtest/fixtures/haproxy.cfg @@ -0,0 +1,29 @@ +# HAProxy target sidecar — Bundle 10 of the 2026-05-02 deployment-target audit. +# +# Minimal SSL-terminating config that boots green with the starter cert +# written by target-tls-init. The k6 connector-tier scenarios connect at +# sustained 100 conns/min and measure handshake-completion latency. + +global + log stdout local0 warning + maxconn 4096 + # Bundle 10: starter cert+key live at /usr/local/etc/haproxy/certs/. + # HAProxy expects a SINGLE PEM file containing cert + key concatenated; + # the target-tls-init container writes target.pem in that combined form. 
+ ssl-default-bind-options ssl-min-ver TLSv1.2 + +defaults + log global + mode http + option dontlognull + timeout connect 5s + timeout client 30s + timeout server 30s + +frontend https-in + bind *:443 ssl crt /usr/local/etc/haproxy/certs/target.pem + default_backend ok + +backend ok + # Static 200 OK — handshake-only loadtest doesn't exercise the backend. + http-request return status 200 content-type text/plain string "ok\n" diff --git a/deploy/test/loadtest/fixtures/httpd.conf b/deploy/test/loadtest/fixtures/httpd.conf new file mode 100644 index 0000000..cdbb08e --- /dev/null +++ b/deploy/test/loadtest/fixtures/httpd.conf @@ -0,0 +1,66 @@ +# Apache httpd target sidecar — Bundle 10 of the 2026-05-02 deployment-target audit. +# +# Self-contained httpd.conf that the httpd:2.4-alpine image will use as its +# main configuration. Loads the minimum module set required for an HTTPS +# server + serves a single SSL-enabled vhost backed by the starter cert +# written by target-tls-init. + +ServerRoot "/usr/local/apache2" +Listen 443 + +# Module set is the minimum required for the SSL vhost below + the +# directives Apache parses elsewhere in its bootstrap. 
+LoadModule mpm_event_module modules/mod_mpm_event.so +LoadModule authn_file_module modules/mod_authn_file.so +LoadModule authn_core_module modules/mod_authn_core.so +LoadModule authz_host_module modules/mod_authz_host.so +LoadModule authz_user_module modules/mod_authz_user.so +LoadModule authz_core_module modules/mod_authz_core.so +LoadModule access_compat_module modules/mod_access_compat.so +LoadModule auth_basic_module modules/mod_auth_basic.so +LoadModule reqtimeout_module modules/mod_reqtimeout.so +LoadModule filter_module modules/mod_filter.so +LoadModule mime_module modules/mod_mime.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule headers_module modules/mod_headers.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule version_module modules/mod_version.so +LoadModule unixd_module modules/mod_unixd.so +LoadModule dir_module modules/mod_dir.so +LoadModule alias_module modules/mod_alias.so +LoadModule socache_shmcb_module modules/mod_socache_shmcb.so +LoadModule ssl_module modules/mod_ssl.so + +User daemon +Group daemon + +ServerName apache-target +ServerAdmin loadtest@certctl.local + +# Quiet log so the run log stays diff-able. Errors still go to stderr +# (/proc/self/fd/2) so docker compose logs surfaces them on startup +# failure. +ErrorLog /proc/self/fd/2 +LogLevel warn + +DocumentRoot "/usr/local/apache2/htdocs" + +# Bundle 10: starter cert+key from target-tls-init's shared volume. +SSLEngine On +SSLCertificateFile /usr/local/apache2/conf/certs/target.crt +SSLCertificateKeyFile /usr/local/apache2/conf/certs/target.key +SSLProtocol all -SSLv3 -TLSv1 -TLSv1.1 +SSLCipherSuite HIGH:!aNULL:!MD5 +SSLHonorCipherOrder on + + + AllowOverride None + Require all granted + + +# Quiet response — the loadtest scenarios only care that the handshake +# completes. The body content is irrelevant. 
+ + Require all granted + diff --git a/deploy/test/loadtest/fixtures/nginx.conf b/deploy/test/loadtest/fixtures/nginx.conf new file mode 100644 index 0000000..4a59659 --- /dev/null +++ b/deploy/test/loadtest/fixtures/nginx.conf @@ -0,0 +1,36 @@ +# nginx target sidecar — Bundle 10 of the 2026-05-02 deployment-target audit. +# +# Minimal HTTPS-only config that boots green with a starter cert from the +# shared target-tls-init container. The k6 connector-tier scenarios connect +# at sustained 100 conns/min and measure handshake-completion latency. +# Production NGINX configs are far richer; this is a load-test fixture, not +# a deployment template. + +worker_processes 1; +events { + worker_connections 1024; +} + +http { + # Quiet log so the loadtest run doesn't fill the docker-compose log. + access_log off; + error_log /var/log/nginx/error.log warn; + + server { + listen 443 ssl; + server_name _; + + # Bundle 10: starter cert+key written by target-tls-init into the + # shared volume. Not the deployed cert; this is what makes the + # daemon boot green so the loadtest scenarios have something to + # handshake against. + ssl_certificate /etc/nginx/certs/target.crt; + ssl_certificate_key /etc/nginx/certs/target.key; + ssl_protocols TLSv1.2 TLSv1.3; + + location / { + return 200 "ok\n"; + add_header Content-Type text/plain; + } + } +} diff --git a/deploy/test/loadtest/k6.js b/deploy/test/loadtest/k6.js index fd30d58..036f0b9 100644 --- a/deploy/test/loadtest/k6.js +++ b/deploy/test/loadtest/k6.js @@ -1,37 +1,67 @@ // certctl load-test driver — k6 v0.54+ JS API. // -// Closes the #8 acquisition-readiness blocker from the 2026-05-01 issuer -// coverage audit. Pre-fix, certctl had no benchmarks or load tests for any -// API path. An acquirer evaluating "can certctl handle our 50k-cert fleet -// at 47-day rotation" had nothing to point at; this script gives them -// a reproducible number with a methodology. 
+// Two tiers of scenarios: // -// What this measures (be honest about scope): +// API tier (issuer-coverage audit fix #8, 2026-05-01): +// - issuance_acceptance: POST /api/v1/certificates throughput. +// - list_certificates: GET /api/v1/certificates throughput. +// +// Connector tier (Bundle 10 of the deployment-target audit, 2026-05-02): +// - nginx_handshake / apache_handshake / haproxy_handshake / f5_handshake: +// per-target-type TCP+TLS handshake throughput against the four +// target sidecars at sustained 100 conns/min for 5 minutes. Latency +// is tagged by target_type so summary.json's connector_tier section +// breaks out p50/p95/p99 per target. +// +// What the API tier measures (be honest about scope): // - POST /api/v1/certificates: auth + JSON decode + validation + service // CreateCertificate + DB insert + response. This is the operator-facing // request-acceptance throughput. The downstream issuer-connector call // happens asynchronously via the renewal scheduler (and is bounded -// separately via CERTCTL_RENEWAL_CONCURRENCY — audit fix #9). +// separately via CERTCTL_RENEWAL_CONCURRENCY — issuer audit fix #9). // - GET /api/v1/certificates: read path with pagination. Exercises the // cert list query, which is the most-called read endpoint in any UI/ // automation client. // -// What this does NOT measure: +// What the connector tier measures: +// - Per-target-type TCP+TLS handshake completion latency. Validates that +// each target sidecar (nginx, apache, haproxy, f5-mock) is operational +// and serving its starter cert under sustained connection load. +// Procurement asks "can certctl's nginx target handle 5,000 endpoints +// at 47-day rotation"; the answer requires (a) the connector code +// handles deploys correctly (covered by per-connector unit tests) AND +// (b) the underlying daemon serves TLS at the connection rates a +// 5,000-endpoint fleet implies. The connector-tier scenarios pin (b). 
+// +// What this does NOT measure (documented limits, not lazy gaps): // - Issuer connector latency (DigiCert / ACME / Vault / etc. round-trips // to upstream CAs). Those are async; pin via the per-issuer-type -// metrics instead (audit fix #4: certctl_issuance_duration_seconds). -// - The full ACME enrollment flow (newOrder → challenge → finalize). -// The audit prompt mentioned ACME-via-pebble; deferred to a follow-up -// because driving multi-RTT ACME flows at sustained 100/s requires -// pebble tuning + k6 crypto helpers that don't exist out of the box. +// metrics instead (issuer audit fix #4: +// certctl_issuance_duration_seconds). +// - Full ACME enrollment (newOrder → challenge → finalize). +// - The full agent-driven deploy hot path (POST cert with target +// binding → poll deployments endpoint → verify served cert matches). +// v1 of the connector-tier harness measures handshake throughput +// against the sidecars directly. v2 is a follow-up that needs the +// agent registration + target-binding API surface plumbed end-to-end +// in the loadtest stack — a meaningful addition but not a blocker +// for the Bundle 10 procurement question. +// - Kubernetes connector. kind-in-docker requires `privileged: true` +// and is operationally fragile in CI. Deferred until Bundle 2 (real +// k8s.io/client-go) lands. // -// Threshold contract: any future change that pushes p99 above 5s for the -// issuance-acceptance scenario or 2s for the read scenario, OR any change -// that pushes the error rate above 1%, fails the test. CI gates the run -// behind workflow_dispatch + cron (NOT per-push — load tests are too slow -// to gate per-PR signal). +// Threshold contract: +// - API tier: p99 < 5s for issuance, < 2s for list, error rate < 1%. +// - Connector tier: p99 < 3s per handshake target (5s for f5-mock, +// iControl REST is slower), error rate < 1%. +// Any change pushing past these fails the workflow. 
// -// Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md fix #8. +// CI gates the run behind workflow_dispatch + cron (NOT per-push — load +// tests are too slow to gate per-PR signal). +// +// Audit references: +// - API tier: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md fix #8. +// - Connector tier: cowork/deployment-target-audit-2026-05-02/RESULTS.md Bundle 10. import http from 'k6/http'; import { check } from 'k6'; @@ -43,6 +73,18 @@ import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js'; const BASE = __ENV.CERTCTL_BASE || 'https://localhost:8443'; const TOKEN = __ENV.CERTCTL_TOKEN || 'load-test-token'; +// Bundle 10: per-target sidecar URLs. Defaults match the docker-compose +// stack's internal DNS; operators running k6 manually against a different +// stack override these via env. An unset or empty variable falls back +// to the default (`||`), so every connector scenario always runs. +const NGINX_TARGET_URL = __ENV.NGINX_TARGET_URL || 'https://nginx-target:443'; +const APACHE_TARGET_URL = __ENV.APACHE_TARGET_URL || 'https://apache-target:443'; +const HAPROXY_TARGET_URL = __ENV.HAPROXY_TARGET_URL || 'https://haproxy-target:443'; +// f5-mock's iControl REST `/healthz` endpoint is the CI-friendly +// per-handshake probe — hits the path the F5 connector itself uses for +// reachability. Real F5 BIG-IP also exposes /healthz under /mgmt/. +const F5_TARGET_URL = __ENV.F5_TARGET_URL || 'https://f5-mock-target:443'; + // Demo seed (CERTCTL_DEMO_SEED=true) creates these rows; CreateCertificate // requires all four FKs to exist. Pre-baked here so the script has zero // dependency on test fixtures beyond the seed. @@ -82,18 +124,75 @@ export const options = { startTime: '5s', tags: { scenario: 'list_certificates' }, }, + + // Bundle 10: connector-tier per-target-type handshake scenarios. + // 100 conns/min sustained for 5 minutes against each sidecar.
+ // Each iteration issues one tiny HTTPS GET (`/` for + // nginx/apache/haproxy, `/healthz` for f5-mock). NOTE: k6's + // http_req_duration covers request send, server wait and response + // receive only; TCP connect and TLS handshake time are reported + // separately as http_req_connecting / http_req_tls_handshaking, so + // these numbers are request latency, not handshake latency. + nginx_handshake: { + executor: 'constant-arrival-rate', + rate: 100, + timeUnit: '1m', + duration: '5m', + preAllocatedVUs: 10, + maxVUs: 50, + exec: 'nginxHandshake', + startTime: '10s', + tags: { scenario: 'nginx_handshake', target_type: 'nginx' }, + }, + apache_handshake: { + executor: 'constant-arrival-rate', + rate: 100, + timeUnit: '1m', + duration: '5m', + preAllocatedVUs: 10, + maxVUs: 50, + exec: 'apacheHandshake', + startTime: '10s', + tags: { scenario: 'apache_handshake', target_type: 'apache' }, + }, + haproxy_handshake: { + executor: 'constant-arrival-rate', + rate: 100, + timeUnit: '1m', + duration: '5m', + preAllocatedVUs: 10, + maxVUs: 50, + exec: 'haproxyHandshake', + startTime: '10s', + tags: { scenario: 'haproxy_handshake', target_type: 'haproxy' }, + }, + f5_handshake: { + executor: 'constant-arrival-rate', + rate: 100, + timeUnit: '1m', + duration: '5m', + preAllocatedVUs: 10, + maxVUs: 50, + exec: 'f5Handshake', + startTime: '10s', + tags: { scenario: 'f5_handshake', target_type: 'f5' }, + }, }, thresholds: { - // Hard floor: 99% of issuance-acceptance requests complete in - // under 5 seconds. Pre-fix this was unsubstantiated; post-fix - // this is the regression guard. The number isn't aspirational — - // it's the worst-acceptable user-facing API SLO from the - // operator perspective. + // API tier — issuer audit fix #8. 'http_req_duration{scenario:issuance_acceptance}': ['p(99)<5000', 'p(95)<2000'], 'http_req_duration{scenario:list_certificates}': ['p(99)<2000', 'p(95)<800'], - // < 1% error rate. The k6 default is "any 4xx/5xx counts as - // failed"; legitimate 201/200 responses don't count.
Auth - // failures, validation failures, server errors all do. + + // Bundle 10 connector tier. nginx/apache/haproxy are pure TLS + // termination → tight thresholds. f5-mock includes a tiny Go + // server response on top of the handshake → slightly looser. + 'http_req_duration{target_type:nginx}': ['p(99)<3000', 'p(95)<1000'], + 'http_req_duration{target_type:apache}': ['p(99)<3000', 'p(95)<1000'], + 'http_req_duration{target_type:haproxy}': ['p(99)<3000', 'p(95)<1000'], + 'http_req_duration{target_type:f5}': ['p(99)<5000', 'p(95)<1500'], + + // < 1% error rate across ALL scenarios. Auth failures, validation + // failures, server errors, connection refused all count. 'http_req_failed': ['rate<0.01'], }, // Smaller summary payload — strip per-VU metrics we don't read. @@ -148,16 +247,109 @@ export function listCertificates() { }); } +// --- Bundle 10: connector-tier handshake scenarios --- +// +// Each per-target function does a single HTTPS GET against its target +// sidecar. k6's http_req_duration covers request send, server wait and +// response receive; TCP connect and TLS handshake time are reported +// separately (http_req_connecting, http_req_tls_handshaking). The target_type +// tag groups results in summary.json's connector_tier section. +// +// Status-check threshold: any 4xx/5xx counts as failed (k6 default +// behaviour for http_req_failed). f5-mock's /healthz returns 200; the +// other three nginx/apache/haproxy default vhost configs all return +// 200 on `/`. +// +// Bundle 10 of the 2026-05-02 deployment-target audit.
/**
 * Issues one HTTPS GET against a target sidecar and records a 2xx check.
 *
 * Shared by the four connector-tier exec functions so the request tagging
 * and the status check cannot drift between targets.
 *
 * NOTE(review): per the k6 built-in metrics reference, http_req_duration is
 * http_req_sending + http_req_waiting + http_req_receiving. TCP connect and
 * TLS handshake time are reported separately (http_req_connecting,
 * http_req_tls_handshaking), and k6 reuses connections between iterations
 * unless noConnectionReuse / noVUConnectionReuse is set in options. Confirm
 * the options block before reading these numbers as handshake latency.
 *
 * @param {string} baseUrl - Target origin, e.g. `https://nginx-target:443`.
 * @param {string} path - Request path, including the leading slash.
 * @param {string} targetType - Value of the `target_type` tag; also used to
 *   build the `scenario` tag (`<targetType>_handshake`) and the check name
 *   (`<targetType> 2xx`).
 * @returns {void}
 */
function probeTarget(baseUrl, path, targetType) {
  const res = http.get(`${baseUrl}${path}`, {
    tags: { scenario: `${targetType}_handshake`, target_type: targetType },
  });
  check(res, {
    [`${targetType} 2xx`]: (r) => r.status >= 200 && r.status < 300,
  });
}

/** Exec function for the `nginx_handshake` scenario: GET `/` on the nginx sidecar. */
export function nginxHandshake() {
  probeTarget(NGINX_TARGET_URL, '/', 'nginx');
}

/** Exec function for the `apache_handshake` scenario: GET `/` on the apache sidecar. */
export function apacheHandshake() {
  probeTarget(APACHE_TARGET_URL, '/', 'apache');
}

/** Exec function for the `haproxy_handshake` scenario: GET `/` on the haproxy sidecar. */
export function haproxyHandshake() {
  probeTarget(HAPROXY_TARGET_URL, '/', 'haproxy');
}

/** Exec function for the `f5_handshake` scenario: GET `/healthz` on the f5-mock sidecar. */
export function f5Handshake() {
  probeTarget(F5_TARGET_URL, '/healthz', 'f5');
}

// handleSummary writes the full results to /results/summary.{json,txt}
// so the operator can commit the baseline numbers into README.md after
// each run and so CI can ingest the JSON for diffing.
//
// Bundle 10 added a `connector_tier` aggregation alongside the API tier
// — same source data (data.metrics), grouped by target_type tag for
// per-connector-type p50/p95/p99/error breakdowns. Operators tracking a
// connector regression diff `connector_tier.<target_type>` between runs.
//
// stdout reproduces the textSummary so the docker compose log shows
// the same numbers an operator running it manually would see.
/**
 * k6 end-of-test hook. Returns the map of output destinations to content:
 * the enriched JSON summary, a plain-text summary, and a coloured stdout copy.
 *
 * @param {object} data - Summary object k6 passes to handleSummary.
 * @returns {object} Destination → serialized content.
 */
export function handleSummary(data) {
  const enriched = enrichWithConnectorTier(data);
  return {
    '/results/summary.json': JSON.stringify(enriched, null, 2),
    '/results/summary.txt': textSummary(data, { indent: ' ', enableColors: false }),
    stdout: textSummary(data, { indent: ' ', enableColors: true }),
  };
}

/**
 * Sums the passes/fails of one named check from the summary's root group.
 *
 * @param {Array<object>} checks - Entries shaped like `{ name, passes, fails }`.
 * @param {string} name - Exact check name to look up.
 * @returns {{ total: (number|null), failRate: (number|null) }} Both fields are
 *   null when the check is absent or was never evaluated.
 */
function tallyCheck(checks, name) {
  const hit = checks.find((c) => c && c.name === name);
  if (!hit) {
    return { total: null, failRate: null };
  }
  const passes = Number(hit.passes) || 0;
  const fails = Number(hit.fails) || 0;
  const total = passes + fails;
  if (total === 0) {
    return { total: null, failRate: null };
  }
  return { total, failRate: fails / total };
}

/**
 * Returns a shallow copy of the k6 summary with an added `connector_tier`
 * object. Each target_type entry contains:
 *   { p50, p95, p99, max, avg, error_rate, iterations }
 * A target whose `http_req_duration{target_type:<t>}` sub-metric is missing
 * (e.g. only the API tier ran) is reported as null.
 *
 * NOTE(review): k6 only emits a tagged sub-metric in the summary when a
 * threshold references it. If no threshold names
 * `http_req_failed{target_type:<t>}` / `iterations{target_type:<t>}`, those
 * keys are absent, so error_rate and iterations fall back to the tallies of
 * the `<t> 2xx` check in data.root_group.checks (one check per iteration).
 * The fallback error_rate is therefore "share of non-2xx responses".
 *
 * @param {object} data - Summary object k6 passes to handleSummary.
 * @returns {object} New object; `data` itself is not mutated.
 */
function enrichWithConnectorTier(data) {
  const targetTypes = ['nginx', 'apache', 'haproxy', 'f5'];
  const metrics = data?.metrics ?? {};
  const rootChecks = Array.isArray(data?.root_group?.checks) ? data.root_group.checks : [];

  const connectorTier = {};
  for (const t of targetTypes) {
    const dur = metrics[`http_req_duration{target_type:${t}}`];
    if (!dur || !dur.values) {
      connectorTier[t] = null;
      continue;
    }

    const fail = metrics[`http_req_failed{target_type:${t}}`];
    const iters = metrics[`iterations{target_type:${t}}`];
    const tally = tallyCheck(rootChecks, `${t} 2xx`);

    connectorTier[t] = {
      p50: dur.values['med'] ?? null,
      p95: dur.values['p(95)'] ?? null,
      p99: dur.values['p(99)'] ?? null,
      max: dur.values['max'] ?? null,
      avg: dur.values['avg'] ?? null,
      // Prefer the real sub-metric; use the check tally only when it is absent.
      error_rate: fail?.values?.rate ?? tally.failRate,
      iterations: iters?.values?.count ?? tally.total,
    };
  }

  // Shallow merge: existing summary fields keep their identity; the
  // connector_tier key is purely additive.
  return { ...data, connector_tier: connectorTier };
}