diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5294a6e..1858a01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -137,52 +137,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} run: bash scripts/coverage-pr-comment.sh - # Bundle P / Strengthening #6 — QA-doc seed-count drift guard. Forces - # every PR that adds a seed row to migrations/seed_demo.sql to keep - # docs/contributor/qa-test-suite.md::Seed Data Reference in sync. - # - # Phase 5 of the 2026-05-04 docs overhaul (commit c64777f) deleted - # docs/testing-guide.md (its content dispersed across the new - # audience-organized doc tree); the previous QA-doc Part-count drift - # guard tracked Part counts between testing-guide.md and the old - # qa-test-guide.md headline. With testing-guide.md gone, that guard's - # premise is dead and it has been removed. The seed-count drift class - # is still live: qa-test-suite.md::Seed Data Reference enumerates - # certs/issuers and seed_demo.sql is the source of truth. - - name: QA-doc seed-count drift guard - run: | - set -e - DOC=docs/contributor/qa-test-suite.md - # Seed-cert count: agnostic to documented header format. The current - # documented count lives in `### Certificates (32 total in ...` — - # extract the first integer in that header. - DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1) - # Authoritative count: unique mc-* IDs in seed_demo.sql. - SEED_CERTS=$(grep -oE 'mc-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') - if [ -z "$DOC_CERTS" ]; then - echo "::warning::Could not extract documented cert count from $DOC." - echo " Skipping cert-count drift check (header format may have changed)." - elif [ "$DOC_CERTS" != "$SEED_CERTS" ]; then - echo "::error::DRIFT — $DOC says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." - echo " Update $DOC::Seed Data Reference to match." - exit 1 - fi - # Issuers: seed-table count vs doc claim. 
- DOC_ISS=$(grep -oE '### Issuers \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1) - # Authoritative: unique iss-* IDs (close enough proxy; the issuers - # table count IS the unique-ID count for this prefix). - SEED_ISS=$(grep -oE 'iss-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') - if [ -z "$DOC_ISS" ]; then - echo "::warning::Could not extract documented issuer count." - elif [ "$DOC_ISS" != "$SEED_ISS" ] && [ "$((SEED_ISS - DOC_ISS))" -gt 5 ]; then - # Allow up to 5pp slack — iss-* IDs appear in audit_events and - # other reference tables that aren't issuer-table rows. Drift - # only flags when the spread grows large. - echo "::error::DRIFT — $DOC says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." - exit 1 - fi - echo "QA-doc seed-count drift guard: clean." - # Bundle Q / I-001 closure — test-naming convention guard (informational). # The convention is `Test__`. This step # prints any non-conformant tests but does NOT fail the build until the @@ -199,9 +153,8 @@ jobs: # internal scenarios expressed via `t.Run` subtests. Requiring the # underscore-Scenario-Result triple repo-wide would mean renaming # 167 legitimate tests for no observable behavior change. The - # Test__ form remains documented as - # the recommended pattern for parameterized scenarios in - # docs/contributor/qa-test-suite.md, but is not gated. + # Test__ form remains the + # recommended pattern for parameterized scenarios, but is not gated. - name: Regression guards (extracted to scripts/ci-guards/) # All named regression guards live at scripts/ci-guards/.sh per # ci-pipeline-cleanup bundle Phase 1. 
Each guard is callable locally: diff --git a/Makefile b/Makefile index 2393325..cf64fca 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help build run test lint verify verify-docs verify-deploy loadtest acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build qa-stats +.PHONY: help build run test lint verify verify-deploy loadtest acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build qa-stats # Default target - show help help: @@ -16,7 +16,6 @@ help: @echo " make lint Run linter (golangci-lint)" @echo " make fmt Format code with gofmt" @echo " make verify Pre-commit gate: fmt + vet + lint + test (CI-parity)" - @echo " make verify-docs Pre-tag gate: QA-doc drift checks (operator-facing docs)" @echo " make verify-deploy Pre-push gate: digest validity + OpenAPI parity + docker build smoke" @echo " make loadtest k6 throughput run against postgres + certctl (NOT in verify; manual + cron only)" @echo "" @@ -119,23 +118,6 @@ verify: @echo "" @echo "verify: PASS — safe to commit" -# verify-docs: pre-tag gate. Runs the QA-doc seed-count drift guard -# that ci-pipeline-cleanup Phase 11 / frozen decision 0.13 moved out -# of CI (was per-push blocking; now operator-runs pre-tag). Protects -# docs/contributor/qa-test-suite.md::Seed Data Reference from -# drifting vs migrations/seed_demo.sql. Operator-facing docs only — -# not product-affecting. -# -# The QA-doc Part-count drift guard retired in the 2026-05-04 docs -# overhaul Phase 5 when docs/testing-guide.md was pruned (its content -# dispersed across the audience-organized doc tree); the Part-count -# class no longer exists outside the qa_test.go file itself. 
-verify-docs: - @echo "==> QA-doc seed-count drift" - @bash scripts/qa-doc-seed-count.sh - @echo "" - @echo "verify-docs: PASS — safe to tag" - # verify-deploy: optional pre-push gate. Runs the digest-validity check, # the OpenAPI ↔ handler parity check, and a Docker build smoke for the # production images (server + agent only — fast subset for local; CI @@ -313,13 +295,10 @@ frontend-build: cd web && npm ci && npx vite build @echo "Frontend build complete" -# QA Suite Stats — Bundle P / Strengthening #8. -# Single source-of-truth for every count claim in -# docs/contributor/qa-test-suite.md. The Strengthening #6 CI drift guards -# (now scoped to the seed-count class only — the Part-count class retired -# in the 2026-05-04 docs overhaul Phase 5 when testing-guide.md was -# pruned) consume the same numbers, eliminating the doc-drift class -# structurally. +# qa-stats: snapshot of the test-suite size at the current commit. +# Backend Go tests + subtests + fuzz targets + skipped sites, plus the +# seed-data counts in migrations/seed_demo.sql. Useful before a release +# to spot-check that no whole layer dropped off. qa-stats: @echo "=== certctl QA Suite Stats ===" @echo "Date: $$(date +%Y-%m-%d)" diff --git a/README.md b/README.md index 7546460..647ee35 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,6 @@ The full audience-organized index lives at [`docs/README.md`](docs/README.md). 
T | Production operator | [Architecture](docs/reference/architecture.md) → [Security posture](docs/operator/security.md) → [Disaster recovery runbook](docs/operator/runbooks/disaster-recovery.md) | | PKI engineer | [ACME server](docs/reference/protocols/acme-server.md) → [SCEP server](docs/reference/protocols/scep-server.md) → [EST server](docs/reference/protocols/est.md) → [CA hierarchy](docs/reference/intermediate-ca-hierarchy.md) | | Migrating from another tool | [from certbot](docs/migration/from-certbot.md) / [from acme.sh](docs/migration/from-acmesh.md) / [cert-manager coexistence](docs/migration/cert-manager-coexistence.md) | -| Contributor | [Architecture](docs/reference/architecture.md) → [Testing strategy](docs/contributor/testing-strategy.md) → [CI pipeline](docs/contributor/ci-pipeline.md) | For the connector reference (12 issuers, 15 targets, 6 notifiers) see [`docs/reference/connectors/index.md`](docs/reference/connectors/index.md). @@ -175,8 +174,6 @@ make docker-up # Start Docker Compose stack CI runs `go vet`, `go test -race`, `golangci-lint`, `govulncheck`, and per-package coverage thresholds (service 70%, handler 75%, crypto 88%, auth packages 85-95%) on every push. The thresholds-as-data file is `.github/coverage-thresholds.yml`; lowering a floor requires corresponding test work, not a config flip. Frontend CI runs TypeScript type checking, Vitest tests, and Vite production build. -For the full contributor guide see [`docs/contributor/`](docs/contributor/) — testing strategy, test environment, CI pipeline, QA prerequisites. - ## License Licensed under the [Business Source License 1.1](LICENSE). The source code is publicly available and free to use, modify, and self-host. The one restriction: you may not use certctl's certificate management functionality as part of a commercial certificate-management offering to third parties. See the LICENSE file for the full Additional Use Grant. 
diff --git a/deploy/ENVIRONMENTS.md b/deploy/ENVIRONMENTS.md index b703be6..f05ff4c 100644 --- a/deploy/ENVIRONMENTS.md +++ b/deploy/ENVIRONMENTS.md @@ -440,6 +440,7 @@ Every `CERTCTL_*` environment variable is read by the server's `internal/config/ | `CERTCTL_ACME_CHALLENGE_TYPE` | `http-01`, `dns-01`, or `dns-persist-01` | | `CERTCTL_ACME_INSECURE` | Skip TLS verification for ACME CA (test only) | | `CERTCTL_ACME_EAB_KID` / `CERTCTL_ACME_EAB_HMAC` | External Account Binding for ZeroSSL, Google Trust Services | +| `CERTCTL_ZEROSSL_EAB_URL` | Override the ZeroSSL EAB-credentials endpoint (defaults to the public ZeroSSL URL; only set for ZeroSSL staging or a private mirror) | | `CERTCTL_ACME_ARI_ENABLED` | Enable RFC 9773 Renewal Information | | `CERTCTL_ACME_PROFILE` | ACME profile (`tlsserver`, `shortlived`) | | `CERTCTL_STEPCA_URL` | step-ca server URL | diff --git a/docs/contributor/ci-guards.md b/docs/contributor/ci-guards.md deleted file mode 100644 index 99aabd2..0000000 --- a/docs/contributor/ci-guards.md +++ /dev/null @@ -1,85 +0,0 @@ -# CI guards - -> Last reviewed: 2026-05-12 - -CI guards are small scripts (shell + Python) and Go tests that pin invariants the v2 audit history showed are easy to lose. Each one runs on every push, fails the build on regression with a useful error message, and produces no output on the happy path. The canonical source is `scripts/ci-guards/` for shell guards and `internal/ciparity/` for Go-based parity tests. - -This page lives at `docs/contributor/ci-guards.md` and is the entry point for contributors who want to understand why a CI step is red, how to add a new guard, or where the allowlist for a given guard lives. The exhaustive list of shell guards is at `scripts/ci-guards/README.md`; this doc explains the categories + the discipline. - -## Why guards exist - -Two failure modes the v2 audit cycle surfaced repeatedly: - -The codebase grew faster than the docs and config could keep up. 
Env vars got added without consumers; OpenAPI ops were registered without router routes; docs went stale; a migration broke on cold-DB without any test catching it. Each one of those classes has a one-time-fix _per-instance_ pattern (re-read the doc, wire the env var) and a structural _per-class_ pattern (write a guard that fails the next time it happens). CI guards are the second. - -The team grew. Reviewers had to remember what each commit author had forgotten. CI guards externalize the institutional knowledge into checks — the build refuses to ship the lying field, the stale doc, the broken migration. New contributors don't need to know the audit history. - -## Categories - -The guards fall into four buckets, organized by what they pin: - -### Code-shape guards - -Catch defects in source files BEFORE they ship. Examples: `G-3-env-docs-drift.sh` (no env var defined-but-undocumented or documented-but-undefined), `complete-path-config-coverage.sh` (every env var has a non-config consumer), `T-1-frontend-page-coverage.sh` (every new GUI page has a sibling test file). - -### Contract-parity guards - -Catch drift across the four product surfaces — OpenAPI spec, HTTP router, MCP tool catalogue, CLI verb dispatcher. The router ↔ OpenAPI pin lives at `internal/api/router/openapi_parity_test.go::TestRouter_OpenAPIParity`. The MCP + CLI sweep lives at `internal/ciparity/surface_parity_test.go` (post-v2.1.0 anti-rot item 2). One hard gate: the MCP tool count cannot regress below `mcpBaselineFloor`. The CLI parity sweep is informational until the CLI surface stabilizes. - -### Build / dependency guards - -`H-001-bare-from.sh` (Dockerfile pin to `@sha256:`), `digest-validity.sh` (every digest actually resolves on the registry), `M-012-no-root-user.sh` (no Dockerfile ends as root), `bundle-8-*.sh` (frontend XSS / reverse-tabnabbing surface). These come out of specific audits and pin the closure. 
- - ### Operational guards - - `doc-rot-detector.sh` (every doc reviewed within 120 days) pins the operational reality, not the source shape. - - The cold-DB compose smoke (wipe postgres volume, bring stack up cold, issue/renew/revoke, audit-row check) lives directly in `.github/workflows/ci.yml::cold-db-compose-smoke` — not as a script. It is intentionally not operator-runnable: the gate's value is that CI owns the cold-DB state, the operator never has to remember to run it. Master branch-protection enforces the job as a required check; that is the manual action, and it happens once. - - ## When the build is red - - Find the failing step in the GitHub Actions UI. Every guard's output starts with the guard's own identifier and ends with one of: - - `::error::` followed by 2-4 remediation paths. The fastest path: read the remediation list, pick the option that fits, fix. - - `exit 1` without an `::error::` annotation — likely a `set -e` trap on an internal command. Re-run with `bash -x scripts/ci-guards/<id>.sh` locally to see where it died. - - If a guard is fundamentally wrong (e.g., refactor moved the code it scans), update the guard in the same PR that triggered the failure. Don't add a one-off allowlist to silence a real bug. - - ## Adding a new guard - - The discipline in five steps. The first three are non-negotiable; the last two are courtesy. - - Drop a new `<id>.sh` in `scripts/ci-guards/` with a head-comment block that names the bug class, lists the audit finding (if any) it closes, and explains the failure mode. Mirror the shape of an existing guard — `G-3-env-docs-drift.sh` and `digest-validity.sh` are the canonical bash+Python and pure-bash examples. - - Use `set -e` early; use `::error::` annotations on regression; exit 0 with one happy-path confirmation line. Take no arguments, require no env vars. The CI loop iterates every `*.sh` without args. - - Write the allowlist file alongside (`<id>-exceptions.yaml`) with the shape `- path: ... / - name: ... 
+ justification + expires`. Make `expires` a required field — every exception has a hard expiration date, typically 90 days out. - -Verify on a deliberately broken state: introduce the regression, confirm the guard fires with a useful message, revert, confirm green. Capture the negative-test output in your PR description. - -Add a row to `scripts/ci-guards/README.md`. The CI loop auto-picks up the new file — no `ci.yml` edit required, unless the guard needs Docker (in which case it gets its own dedicated job; see `cold-db-compose-smoke` for the pattern). - -## Discipline: the allowlist trap - -Allowlists are dangerous. They start as a small concession ("this one env var is documented for an external script, not consumed by Go code") and become a junk drawer of unverified exemptions that mask real defects. The discipline that keeps that from happening: - -Every entry MUST carry a `justification:` field with a one-line reason. "Tech debt" is not a reason; "documented contract surface consumed by the ACME DNS-01 helper script — see `deploy/test/acme/dns01-export.sh`" is. - -Every entry MUST carry an `expires:` field with a hard date, typically 90 days out. The guards reject entries past their expiration. When an entry expires, the only paths forward are (a) close the underlying gap so the entry is no longer needed, (b) re-justify with a fresh expiration. Both force a real review. - -If you're adding more than one entry to an allowlist in a single PR, that's a smell — usually the underlying class needs a small refactor, not three allowlist rows. - -## Where the bundles live - -The `Audit-Closes:` commit trailer convention (post-v2.1.0 anti-rot item 4) is the cross-reference between audit findings and the commits that closed them. Re-derive the closure history of any audit with: - - git log --grep='Audit-Closes: ' - -The audit folder structure under `cowork/` (workspace-local; not in this repo) carries the per-audit RESULTS.md + findings.yaml. 
CLAUDE.md's "Audit closures" subsection is the current-state index of which audits are open vs closed. - -## Related - -The exhaustive guard list — `scripts/ci-guards/README.md`. -The CI pipeline architecture — `docs/contributor/ci-pipeline.md`. -The QA test suite — `docs/contributor/qa-test-suite.md`. diff --git a/docs/contributor/ci-pipeline.md b/docs/contributor/ci-pipeline.md deleted file mode 100644 index bc4c183..0000000 --- a/docs/contributor/ci-pipeline.md +++ /dev/null @@ -1,232 +0,0 @@ -# CI Pipeline — Operator Guide - -> Last reviewed: 2026-05-12 - -> Authoritative guide to certctl's CI pipeline shape. -> Per the ci-pipeline-cleanup spec, Phase 12. - -## Trigger model - -Three triggers, each with its own scope. Don't mix. - -| Trigger | Workflow | Scope | Wall-clock target | -|---|---|---|---| -| Push to master, PR to master | `.github/workflows/ci.yml` + `.github/workflows/codeql.yml` | Blocking — every check earns its keep | <10 min | -| Daily 06:00 UTC + `workflow_dispatch` | `.github/workflows/security-deep-scan.yml` | Slow scans (gosec, osv, trivy, ZAP, schemathesis, nuclei, testssl, semgrep, mutation, `-race -count=10`); best-effort, never blocks | 60 min budget | -| Tag push (`v*`) | `.github/workflows/release.yml` | Cross-platform binaries, ghcr.io push, SLSA provenance, GitHub release | n/a | - -This guide covers the **on-push pipeline** only. - -## On-push pipeline (7 status checks) - -```mermaid -flowchart TD - Push["push to master"] - CI["CI workflow (5 jobs)"] - CodeQL["CodeQL workflow (2 jobs)"] - GoBuild["go-build-and-test
~6-7 min"] - Frontend["frontend-build
~1 min"] - HelmLint["helm-lint
~10 sec"] - Vendor["deploy-vendor-e2e
~5 min, depends on go-build-and-test"] - Image["image-and-supply-chain
~3 min, parallel"] - AnalyzeGo["Analyze (go)
~5 min, parallel"] - AnalyzeJS["Analyze (javascript-typescript)
~5 min, parallel"] - Push --> CI - Push --> CodeQL - CI --> GoBuild - CI --> Frontend - CI --> HelmLint - CI --> Vendor - CI --> Image - CodeQL --> AnalyzeGo - CodeQL --> AnalyzeJS - GoBuild -.depends on.-> Vendor -``` - -End-to-end wall-clock: dominated by `go-build-and-test` + `deploy-vendor-e2e` chain (~12 min) running in parallel with CodeQL (~5 min). Target ~10 min. - -## Per-job deep-dive - -### `go-build-and-test` (Ubuntu, ~6-7 min) - -Runs the Go build/test suite + 18 of 20 regression guards. - -Steps: -1. `actions/checkout@v4` -2. `actions/setup-go@v5` (Go 1.25.10) -3. `go build ./cmd/...` (server, agent, mcp-server, cli) -4. **gofmt drift** — `gofmt -l .` must be empty (Makefile::verify parity) -5. **go mod tidy drift** — `go mod tidy && git diff --exit-code go.mod go.sum` -6. `go vet ./...` -7. Install + run **golangci-lint** v2.11.4 (`--timeout 5m`) -8. Install + run **govulncheck** (hard gate) -9. Install + run **staticcheck** (hard gate; `continue-on-error: false`) -10. **Race Detection** — `go test -race -count=1 ./internal/...` (9-package list, 5min timeout) -11. **Go Test with Coverage** — full coverage profile to `coverage.out` -12. **Check Coverage Thresholds** — `bash scripts/check-coverage-thresholds.sh` (reads `.github/coverage-thresholds.yml`) -13. **Upload Coverage Report** — artifact (`go-coverage`, 30-day retention) -14. **Coverage PR comment** — posts/updates per-PR coverage table (PR builds only) -15. **Regression guards** — loop runs all `scripts/ci-guards/*.sh` (18 of 20 guards) - -Local equivalent: `make verify` covers steps 4, 6, 7, 11 (with `-short`). - -### `frontend-build` (Ubuntu, ~1 min) - -Vitest tests + tsc check + vite build + 2 of 20 regression guards (already covered by the ci-guards loop in `go-build-and-test`). - -Steps: -1. `actions/checkout@v4` -2. `actions/setup-node@v4` (Node 22) -3. `npm ci` -4. `npx tsc --noEmit` -5. `npx vitest run` -6. `npx vite build` -7. 
**Regression guards** — same `scripts/ci-guards/*.sh` loop as `go-build-and-test` (catches frontend-side guards: S-1, P-1, T-1, L-015, L-019, M-009, G-3) - -### `helm-lint` (Ubuntu, ~10 sec) - -Helm chart validation in 3 modes + inverse fail-loud test: -1. `helm lint` with existingSecret -2. `helm template` (existingSecret mode) -3. `helm template` (cert-manager mode) -4. `helm template` (no TLS source — MUST fail per fail-loud guard) - -### `deploy-vendor-e2e` (Ubuntu, ~5 min, depends on `go-build-and-test`) - -Single-job collapse of the prior 12-job matrix (per ci-pipeline-cleanup Phase 5 / frozen decision 0.4 — revises Bundle II decision 0.9). - -Steps: -1. `actions/checkout@v5` -2. `actions/setup-go@v5` (Go 1.25.10, cache: true) -3. **Build f5-mock-icontrol sidecar** — only sidecar without published image -4. **Bring up all vendor sidecars** — `docker compose --profile deploy-e2e up -d` (11 sidecars) -5. **Run all vendor-edge e2e** — `go test -tags integration -race -count=1 -run 'VendorEdge_'`; output captured to `test-output.log` -6. **Skip-count enforcement** — `bash scripts/ci-guards/vendor-e2e-skip-check.sh test-output.log` (catches sidecar boot failures via skip-count vs allowlist) -7. **Tear down sidecars** — `docker compose down -v` (always runs) - -The `deploy-vendor-e2e-windows` matrix was deleted entirely (per ci-pipeline-cleanup Phase 6 / frozen decision 0.5 — revises Bundle II decision 0.4). IIS + WinCertStore validation moved to [`docs/connector-iis.md::Operator validation playbook`](connector-iis.md#operator-validation-playbook-windows-host). - -### `image-and-supply-chain` (Ubuntu, ~3 min, parallel) - -Three checks bundled (per ci-pipeline-cleanup Phases 7-9 / frozen decision 0.8): -1. **Digest validity** — `bash scripts/ci-guards/digest-validity.sh`. Resolves every `@sha256:` ref in `deploy/**/*.{yml,Dockerfile*}` against its registry. Closes the H-001 lying-field gap. -2. 
**Docker build smoke** — builds all 4 Dockerfiles (`Dockerfile`, `Dockerfile.agent`, `deploy/test/f5-mock-icontrol/Dockerfile`, `deploy/test/libest/Dockerfile`). -3. **OpenAPI ↔ handler operationId parity** — `bash scripts/ci-guards/openapi-handler-parity.sh`. Every router route must have a matching `operationId` in `api/openapi.yaml` or be documented in `api/openapi-handler-exceptions.yaml`. - -### CodeQL (Ubuntu × 2 languages, ~5 min) - -`.github/workflows/codeql.yml` — interprocedural taint tracking. Two matrix jobs: `go` and `javascript-typescript`. Triggers on push, PR, and weekly Sunday cron. - -## The 20 regression guards - -Located at `scripts/ci-guards/.sh`. Each script is callable locally: - -```bash -bash scripts/ci-guards/G-3-env-docs-drift.sh -``` - -Or run all of them: - -```bash -for g in scripts/ci-guards/*.sh; do - echo "=== $(basename "$g") ===" - bash "$g" || echo " FAILED" -done -``` - -| ID | Catches | -|---|---| -| `G-1-jwt-auth-literal` | JWT silent auth downgrade reappearing | -| `L-001-insecure-skip-verify` | Bare `InsecureSkipVerify: true` without `//nolint:gosec` | -| `H-001-bare-from` | Bare Dockerfile `FROM` without `@sha256:` digest pin | -| `M-012-no-root-user` | Dockerfile missing terminal `USER ` | -| `H-009-readme-jwt` | README re-introducing JWT-as-supported claim | -| `G-2-api-key-hash-json` | `api_key_hash` in JSON-emitting surface | -| `U-2-plaintext-healthcheck` | Plaintext `http://` in HEALTHCHECK | -| `U-3-migration-mount` | Migration file mounted into postgres initdb | -| `D-1-D-2-statusbadge-phantom` | Dead StatusBadge keys + 8 TS phantom fields across 4 interfaces | -| `L-1-bulk-action-loop` | Client-side `for ... 
await` bulk action loops | -| `B-1-orphan-crud` | 8 update/create/delete fns lose page consumers | -| `S-2-strings-contains-err` | `strings.Contains(err.Error(), ...)` brittle dispatch | -| `G-3-env-docs-drift` | `CERTCTL_*` env var defined OR documented but not both | -| `test-naming-convention` | `func TestXxx` lowercase first letter (Go silently skips) | -| `S-1-hardcoded-source-counts` | Hardcoded "N issuer connectors" prose | -| `P-1-documented-orphan-fns` | 16 read-fn names removed from client.ts exports | -| `T-1-frontend-page-coverage` | New page in `web/src/pages/` without sibling `.test.tsx` | -| `bundle-8-L-015-target-blank-rel-noopener` | `target="_blank"` without `rel="noopener noreferrer"` | -| `bundle-8-L-019-dangerously-set-inner-html` | `dangerouslySetInnerHTML` outside `safeHtml.ts` | -| `bundle-8-M-009-bare-usemutation` | Bare `useMutation()` outside the `useTrackedMutation` wrapper | - -Plus five additional scripts for non-guard operator workflows: -- `scripts/ci-guards/vendor-e2e-skip-check.sh` — vendor-e2e skip-count enforcement (used by `deploy-vendor-e2e` job) -- `scripts/ci-guards/digest-validity.sh` — used by `image-and-supply-chain` job -- `scripts/ci-guards/openapi-handler-parity.sh` — used by `image-and-supply-chain` job -- `scripts/coverage-pr-comment.sh` — used by `go-build-and-test` job -- `scripts/check-coverage-thresholds.sh` — used by `go-build-and-test` job - -## Coverage thresholds - -Manifest at `.github/coverage-thresholds.yml`. Each entry has `floor:` (integer percentage) + `why:` (load-bearing context). Lowering a floor REQUIRES corresponding code-side test work — never lower the gate to make CI green. - -To add a new gated package: add an entry to the YAML; no script changes needed. 
- -## Make targets — three-tier convention - -| Target | When | What | -|---|---|---| -| `make verify` | **Required pre-commit** | gofmt + vet + golangci-lint + go test -short | -| `make verify-deploy` | Optional pre-push | digest-validity + OpenAPI parity + Docker build smoke (server + agent only — fast subset) | -| `make verify-docs` | **Required pre-tag** | QA-doc seed-count drift check | - -## Adding a new check - -| Check type | Where it goes | Auto-picked-up by CI? | -|---|---|---| -| Regression guard (grep / shape pattern) | New `scripts/ci-guards/<id>.sh` script | Yes — loop step iterates `*.sh` | -| Coverage threshold (per-package) | New entry in `.github/coverage-thresholds.yml` | Yes — bash loop reads YAML | -| OpenAPI route exception | New entry in `api/openapi-handler-exceptions.yaml` | Yes — parity script reads YAML | -| Vendor-e2e expected skip | New line in `scripts/ci-guards/vendor-e2e-skip-allowlist.txt` | Yes — skip-check script reads file | -| New CI job | Edit `.github/workflows/ci.yml` directly | n/a (job definition is the source) | - -## Troubleshooting - -| CI step fails | Likely cause | Fix | -|---|---|---| -| `gofmt drift` | source needs `gofmt -w` | `make fmt` locally + commit | -| `go mod tidy drift` | imported a package without committing go.mod | `go mod tidy` + commit | -| `Run staticcheck` | new SA1019 deprecated-API site | migrate the API OR add `//lint:ignore SA1019 <reason>` | -| `Check Coverage Thresholds` | per-package coverage dropped below floor | add tests; do NOT lower the floor | -| `Regression guards` (any `<id>.sh`) | the audit-finding the guard pinned reappeared | read the guard's head-comment block for the closure rationale + fix the regression | -| `Skip-count enforcement` | a vendor sidecar failed to start | check docker logs; fix sidecar; OR if a new Windows-only test was added, add to `scripts/ci-guards/vendor-e2e-skip-allowlist.txt` | -| `Digest validity` | a `@sha256` digest doesn't resolve | re-resolve from 
registry, replace in compose / Dockerfile | -| `OpenAPI ↔ handler parity` | new router route without operationId | add to `api/openapi.yaml` (preferred) OR `api/openapi-handler-exceptions.yaml` | -| `Docker build smoke` | Dockerfile syntax error or COPY path drift | fix the Dockerfile | -| `CodeQL Analyze` | interprocedural dataflow finding | review the SARIF in Security → Code scanning tab | - -## Status check accounting - -**Current (post-cleanup):** 7 status checks per push. -- 1 × `Go Build & Test` -- 1 × `Frontend Build` -- 1 × `Helm Chart Validation` -- 1 × `deploy-vendor-e2e` -- 1 × `image-and-supply-chain` -- 2 × `CodeQL Analyze ()` (go + javascript-typescript) - -**Pre-cleanup (HEAD `1de61e91`):** 19 status checks. The 12-vendor matrix + 2-vendor Windows matrix collapsed to 1 + 0 respectively; the 3 Go/Frontend/Helm jobs unchanged; 2 CodeQL unchanged; 1 new `image-and-supply-chain` added. - -## Required GitHub branch protection list - -When updating the `master` branch protection rule (Settings → Branches), the "Require status checks to pass" list should be exactly: - -``` -Go Build & Test -Frontend Build -Helm Chart Validation -deploy-vendor-e2e -image-and-supply-chain -Analyze (go) -Analyze (javascript-typescript) -``` - -Old-name checks (`deploy-vendor-e2e ()` × 12, `deploy-vendor-e2e-windows ()` × 2) won't appear on new PRs after the workflow change. Operator removes them from the required list. diff --git a/docs/contributor/gui-qa-checklist.md b/docs/contributor/gui-qa-checklist.md deleted file mode 100644 index 4621d86..0000000 --- a/docs/contributor/gui-qa-checklist.md +++ /dev/null @@ -1,68 +0,0 @@ -# GUI QA Checklist - -> Last reviewed: 2026-05-05 - -Manual GUI verification pass for release sign-off. Vitest covers component-level behavior; this checklist covers end-to-end flows that only land correctly when the React SPA, the REST API, and the database are all wired together. 
- -## Prereqs - -The full stack must be running and healthy per [`qa-prerequisites.md`](qa-prerequisites.md). Open `https://localhost:8443` in a fresh browser session (Incognito / Private mode is fine — avoids cached state from previous QA passes). - -## Pages to verify - -For each page, the verification is "open it, confirm it renders without console errors, exercise the documented action, confirm the action lands as expected." - -| Page | Action to verify | Expected result | -|---|---|---| -| `/dashboard` | Page loads, all 4 stat cards populate | Total / Active / Expiring / Expired counts match `GET /api/v1/stats/summary` | -| `/certificates` | Inventory list paginates | "Next page" button works; URL updates with cursor; row count consistent | -| `/certificates/` | Detail page opens for any cert | Cert chain renders, deployment status shows, audit timeline visible | -| `/issuers` | Catalog renders all configured issuers | Each issuer card shows last-used / status; clicking opens detail | -| `/issuers/` | Issuer config form | Edit + Save round-trips through `PATCH /api/v1/issuers/` | -| `/issuers/hierarchy` | CA tree view | Multi-level hierarchy renders; admin-gated CRUD buttons present for admins only | -| `/agents` | Fleet view | Online/offline status accurate; OS/arch grouping correct | -| `/agents/` | Agent detail | Last heartbeat, registered date, deployment job history | -| `/agents/groups` | Agent groups CRUD | Create + edit + delete a test group; verify dynamic membership matching | -| `/jobs` | Job queue | Filter by status / type works; click into a job opens detail | -| `/jobs/` | Job detail | Status, retries, logs, owner attribution | -| `/policies` | Renewal policies CRUD | Edit AlertChannels matrix, save, verify backend reflects change | -| `/profiles` | Certificate profiles | EKU constraints + max TTL editable; profile binding works | -| `/notifications` | Notifier config | Test connection button against each configured notifier | -| `/discovery` | 
Discovery triage | Claim / Dismiss buttons round-trip to backend | -| `/network-scans` | Scan target CRUD | Create scan target, trigger immediate scan, results appear | -| `/audit` | Audit trail | Filter by actor / action / time range; CSV export works | -| `/short-lived` | Short-lived credential dashboard | Live TTL countdown updates; auto-refresh every 10s | -| `/observability` | Observability dashboard | Charts render: expiration heatmap, renewal trends, issuance rate | -| `/health` | Health monitor | TLS endpoint health: healthy / degraded / down states accurate | -| `/digest` | Digest preview | Email preview renders; "Send digest" button dispatches | -| `/owners` | Owners CRUD | Create owner with team, edit, delete (after reassigning certs) | -| `/teams` | Teams CRUD | Create + delete; verify cascade removes orphan owners | -| `/scep` | SCEP admin tabs | Profiles / Intune Monitoring / Recent Activity all populate | -| `/est` | EST admin tabs | Profiles / Recent Activity / Trust Bundle all populate | -| `/login` | Login flow | API key entry persists for the session; bad key rejected | - -## Console hygiene - -Open browser DevTools and confirm: - -- No uncaught exceptions on any page -- No 404 / 500 responses in the Network tab from API calls -- No CORS errors -- No CSP violations - -## Mobile / narrow-viewport - -The dashboard is desktop-first but should not break catastrophically on narrow viewports. Resize the browser to 380px width; confirm: - -- Sidebar collapses to a hamburger menu -- Tables either scroll horizontally or stack on mobile -- Forms remain usable - -## Accessibility spot-check - -- Tab through any single page using only the keyboard. Every interactive element must be reachable, and the focus indicator must be visible. -- Lighthouse accessibility audit on `/dashboard`: target ≥ 90. - -## Sign-off - -Document any deviations in the release sign-off matrix at [`release-sign-off.md`](release-sign-off.md). 
diff --git a/docs/contributor/qa-prerequisites.md b/docs/contributor/qa-prerequisites.md deleted file mode 100644 index bab41cb..0000000 --- a/docs/contributor/qa-prerequisites.md +++ /dev/null @@ -1,99 +0,0 @@ -# QA Prerequisites - -> Last reviewed: 2026-05-05 - -Operational prereqs for running release QA against certctl. Before any of the contributor-facing testing surfaces (test-environment.md, gui-qa-checklist.md, release-sign-off.md) are useful, the local stack needs to be in a known-good state. - -## Why manual QA on top of automated tests? - -Automated tests mock dependencies and run in isolation. Manual QA validates the full integrated stack: real PostgreSQL, real HTTP, real agent binary, real file I/O, real scheduler timing. It catches issues that unit tests can't: migration ordering, Docker networking, env var parsing, browser rendering, and timing-dependent scheduler behavior. - -## Environment setup - -**Step 1: Start the full stack.** - -```bash -cd deploy && docker compose -f docker-compose.yml -f docker-compose.demo.yml up --build -d -``` - -This builds three containers (postgres, certctl-server, certctl-agent) and runs them on a bridge network. The `--build` flag ensures you're testing the current code, not a stale image. The `demo` overlay is an override file (no `image:` or `build:` of its own) that layers `CERTCTL_DEMO_SEED=true` onto the base — both files must be passed in that order or compose errors with `service "certctl-server" has neither an image nor a build context specified`. The seed populates the database with realistic fixtures. 
- -**Step 2: Wait for healthy state.** - -```bash -for i in $(seq 1 30); do - STATUS=$(docker compose ps --format json 2>/dev/null | jq -r 'select(.Health != null) | "\(.Name): \(.Health)"' 2>/dev/null) - echo "$STATUS" - echo "$STATUS" | grep -q "unhealthy\|starting" || break - sleep 2 -done -``` - -Why: Docker Compose starts containers in dependency order (postgres → server → agent), but "started" doesn't mean "ready." Health checks confirm postgres accepts connections, the server responds on `/health`, and the agent process is running. - -**Step 3: Set shell variables used throughout the QA flow.** - -```bash -export SERVER=https://localhost:8443 -export API_KEY="change-me-in-production" -export AUTH="Authorization: Bearer $API_KEY" -export CT="Content-Type: application/json" -export CACERT="--cacert ./deploy/test/certs/ca.crt" -``` - -Every curl command in QA docs uses these variables. Setting them once avoids typos and keeps the docs copy-pasteable. - -> **Note:** The default Docker Compose sets `CERTCTL_AUTH_TYPE: none` for the demo overlay, meaning auth is disabled. Tests that exercise auth require flipping this to `api-key`; instructions are in the relevant test docs. - -**Step 4: Build CLI and MCP server binaries on the host.** - -```bash -go build -o certctl-cli ./cmd/cli/... -go build -o certctl-mcp ./cmd/mcp-server/... -``` - -The CLI and MCP server are separate binaries that talk to the server over HTTP. Building them verifies the code compiles and produces the executables you'll test later. - -## Demo data baseline - -The seed data (`migrations/seed.sql` + `migrations/seed_demo.sql`) pre-populates the database with realistic fixtures. Confirm it loaded: - -```bash -curl -s $CACERT -H "$AUTH" $SERVER/api/v1/stats/summary | jq . -``` - -**Expected shape:** - -```json -{ - "total_certificates": 15, - "active_certificates": ..., - "expiring_certificates": ..., - "expired_certificates": ..., - "pending_renewals": ... 
-} -``` - -**Reference IDs in the demo data** (used across QA docs): - -| Resource | IDs | Count | -|---|---|---| -| Teams | `t-platform`, `t-security`, `t-payments`, `t-frontend`, `t-data` | 5 | -| Owners | `o-alice`, `o-bob`, `o-carol`, `o-dave`, `o-eve` | 5 | -| Policies | `rp-standard`, `rp-urgent`, `rp-manual` | 3 | -| Issuers | `iss-local`, `iss-acme-le`, `iss-stepca`, `iss-digicert` | 4 | -| Agents | `ag-web-prod`, `ag-web-staging`, `ag-lb-prod`, `ag-iis-prod`, `ag-data-prod` | 5 | -| Targets | `tgt-nginx-prod`, `tgt-nginx-staging`, `tgt-f5-prod`, `tgt-iis-prod`, `tgt-nginx-data` | 5 | -| Profiles | `prof-standard-tls`, `prof-internal-mtls`, `prof-short-lived`, `prof-high-security` | 4 | -| Certificates | `mc-api-prod`, `mc-web-prod`, `mc-pay-prod`, etc. | 15 | -| Agent Groups | `ag-linux-prod`, `ag-linux-amd64`, `ag-windows`, `ag-datacenter-a`, `ag-manual` | 5 | -| Network Scan Targets | `nst-dc1-web`, `nst-dc2-apps`, `nst-dmz` | 3 | - -## Once these are green - -Move to the appropriate downstream surface: - -- [`test-environment.md`](test-environment.md) — full local environment tutorial with real CAs (Pebble, step-ca, etc.) -- [`gui-qa-checklist.md`](gui-qa-checklist.md) — manual GUI test pass -- [`release-sign-off.md`](release-sign-off.md) — release-day checklist -- [`testing-strategy.md`](testing-strategy.md) — what we test in CI vs daily deep-scan vs manual QA diff --git a/docs/contributor/qa-test-suite.md b/docs/contributor/qa-test-suite.md deleted file mode 100644 index 3ecc79c..0000000 --- a/docs/contributor/qa-test-suite.md +++ /dev/null @@ -1,445 +0,0 @@ -# QA Test Suite Guide (`qa_test.go`) - -> Last reviewed: 2026-05-05 - -> **Audience:** Anyone running release QA for certctl — whether you're a first-time contributor or the maintainer cutting a release tag. 
-> -> **Self-contained.** Through 2026-05-04 this doc was a companion to a separate `docs/testing-guide.md` (the *what* to test) — that companion was pruned during the Phase 5 docs overhaul (its content dispersed across the audience-organized doc tree). The Part-by-Part Coverage Map below is now the canonical inventory of QA Parts. - ---- - -## Test Suite Health (regenerate via `make qa-stats`) - -> Snapshot at HEAD. Re-run `make qa-stats` to refresh; the QA-doc seed-count drift guard (`.github/workflows/ci.yml::QA-doc seed-count drift guard`) catches out-of-date cert / issuer counts on every PR. The Part-count drift guard retired in the 2026-05-04 docs overhaul Phase 5 (testing-guide.md was pruned; Part counts are now tracked inside `qa_test.go` itself, not against an external doc). **Last regenerated: 2026-04-27 (Bundle P).** - -| Metric | Value | Target | Status | -|---|---|---|---| -| Backend test files | 221 | n/a | ℹ | -| Backend `Test*` functions | 2,454 | n/a | ℹ | -| Backend `t.Run` subtests | 778 | n/a | ℹ | -| Frontend test files | 38 | n/a | ℹ | -| Fuzz targets | 11 | ≥10 (one per hand-rolled parser) | ✓ | -| `t.Skip` sites | 60 | each carries valid rationale (Bundle O audit) | ✓ | -| `qa_test.go` Part_* subtests | 53 | covers 49 of 56 historical QA Parts directly + Parts 15–17 indirectly via Parts 42–46 | ✓ | -| Existential cluster line cov (post-Bundle-J + L.B + Bundle 0.7) | acme 55.6%, stepca 90.4%, local-issuer ≥86%, crypto ≥85% | ≥95% | △ ACME below; tracked in `coverage-matrix.md` | -| Mutation kill rate (Existential) | unmeasured (operator-runnable per Strengthening #5) | ≥90% | ⚠ | -| Race detector clean (`-count=10`) | partial (`-count=3` clean per Phase 0) | 0 races | ⚠ | - -## What Is This File? - -`deploy/test/qa_test.go` is a single Go test file (~1700 lines) that automates the historical QA Part inventory (preserved in the Part-by-Part Coverage Map below) against a running certctl Docker Compose demo stack. 
It replaces the legacy `qa-smoke-test.sh` bash script. - -It covers **49 of 56 Parts** of the testing guide as automation; the remaining 7 are -either manual-only by design or pending QA-suite coverage: - -- **49 `Part_*` automation wrappers**, **~159 leaf subtests** — API calls, database queries, source file checks, performance benchmarks -- **11 fully skipped Parts** — with documented reasons (external CAs, Windows, browser-only, etc.) — see "What This Test Does NOT Cover" below -- **4 Parts NOT YET AUTOMATED** — Parts 23 (S/MIME & EKU), 24 (OCSP/CRL), 55 (Agent Soft-Retirement), 56 (Notification Retry & Dead-Letter) — must be tested manually until QA-suite automation lands; the Part-by-Part Coverage Map below describes the surface area each Part covers -- **Manual-only flows** in addition: GUI flows, scheduler timing, Docker log inspection — must be done by a human (Coverage Map below describes each) - -## Architecture - -```mermaid -flowchart LR - QA["qa_test.go (//go:build qa)

TestQA(t *testing.T)
├─ Part01_Infra
├─ Part02_Auth
├─ Part03_CertCRUD
├─ ...
└─ Part52_HelmChart"] - subgraph Stack["certctl demo stack
docker-compose.yml + docker-compose.demo.yml"] - Server["certctl-server :8443"] - Postgres["postgres :5432"] - Agents["certctl-agent (×N)
↑ seed_demo.sql provisions 12 agent rows
(1 active, 2 retired, 9 reserved/sentinel)
for the soft-retire / FSM coverage Parts 55–56 exercise"] - end - QA --> Stack -``` - -> **Multi-agent demo stack (Bundle Q / L-004 closure).** The demo -> stack runs a single live `certctl-agent` container by default but -> the database is seeded with 12 agent rows (`migrations/seed_demo.sql`, -> grep `mc-* | ag-*` IDs). The "(×N)" notation reflects the seed-data -> reality: Parts 04 (Agents Listing), 05 (Agent Heartbeats), 55 -> (Agent Soft-Retirement), and FSM coverage tables in -> `coverage-audit-2026-04-27/tables/fsm-coverage.md` exercise the full -> multi-agent population, not the one live container. Operators -> running the QA suite in a parallel-agent topology should set -> `AGENT_COUNT=N` in compose-override and re-derive the seed counts -> via `make qa-stats`. - -Key design choices: - -- **Build tag:** `//go:build qa` — never runs during `go test ./...` or CI. Only runs when explicitly requested. -- **Package:** `integration_test` — same package as `integration_test.go` (which uses `//go:build integration` for the test stack). They coexist but never run together. -- **Zero internal imports:** Uses only stdlib + `lib/pq` (from `go.mod`). All API interactions are plain HTTP. All JSON is decoded into lightweight local structs (`qaCert`, `qaJob`, etc.) — not the internal domain types. -- **Self-cleaning:** Tests that create data use `t.Cleanup()` to delete it afterward. The seed data is not modified. - -## Prerequisites - -1. **Docker Compose demo stack running:** - ```bash - cd deploy - docker compose -f docker-compose.yml -f docker-compose.demo.yml up --build -d - ``` - Wait ~15 seconds for health checks to pass. - -2. **Go 1.22+** installed (the project uses Go 1.25 in `go.mod`, but 1.22+ works for running tests). - -3. **PostgreSQL port exposed** — the demo stack exposes port 5432 for database verification tests (table counts, schema checks). - -4. 
**Repository checkout** — source file verification tests (`fileExists`, `fileContains`) read files relative to `qaRepoDir` (default: `../..` from `deploy/test/`). - -## Running the Tests - -### Full suite -```bash -cd deploy/test -go test -tags qa -v -timeout 10m ./... -``` - -### Single Part -```bash -go test -tags qa -v -run TestQA/Part03 ./... -``` - -### Single subtest -```bash -go test -tags qa -v -run TestQA/Part03_CertCRUD/Create_Minimal ./... -``` - -### With custom environment -```bash -CERTCTL_QA_SERVER_URL=https://staging.internal:8443 \ -CERTCTL_QA_API_KEY=my-staging-key \ -CERTCTL_QA_DB_URL=postgres://certctl:secret@db.internal:5432/certctl?sslmode=require \ -CERTCTL_QA_REPO_DIR=/path/to/certctl \ -go test -tags qa -v -timeout 10m ./... -``` - -### Environment Variables - -| Variable | Default | Description | -|---|---|---| -| `CERTCTL_QA_SERVER_URL` | `https://localhost:8443` | certctl server URL (HTTPS-only as of v2.2) | -| `CERTCTL_QA_API_KEY` | `change-me-in-production` | API key for Bearer auth | -| `CERTCTL_QA_DB_URL` | `postgres://certctl:certctl@localhost:5432/certctl?sslmode=disable` | PostgreSQL connection string | -| `CERTCTL_QA_REPO_DIR` | `../..` | Path to certctl repo root (for source file checks) | -| `CERTCTL_QA_CA_BUNDLE` | `./certs/ca.crt` | PEM CA bundle pinned for TLS verification. The demo stack's `certctl-tls-init` container writes here. | -| `CERTCTL_QA_INSECURE` | `false` | Set to `"true"` to skip TLS verification (e.g. before the init container finishes). Never use outside the demo harness. | - -## Part-by-Part Coverage Map - -This table shows what each Part tests and what's left for manual verification. 
- -| Part | Testing Guide Section | Automated Subtests | What's Automated | What's Manual | -|------|----------------------|-------------------|-----------------|--------------| -| 1 | Infrastructure & Deployment | 8 | Table count, health/ready endpoints, seed data counts (certs, agents, issuers, targets, policies) | Docker container health, log inspection, volume mounts | -| 2 | Authentication & Security | 4 | No-auth 401, bad-key 401, health-no-auth 200, no private keys in API | CORS preflight, rate limiting (429 + Retry-After), TLS config | -| 3 | Certificate Lifecycle | 10 | Create (minimal + full), get, 404, list pagination, status/issuer filters, sparse fields, update, archive | Deployment trigger, version history, certificate detail UI | -| 4 | Renewal Workflow | 3 | Trigger renewal, 404 on nonexistent, agent work endpoint | AwaitingCSR flow, agent key generation, full issuance cycle | -| 5 | Revocation | 5 | Revoke (default reason), already-revoked, nonexistent, invalid reason, CRL JSON | DER CRL, OCSP responder, revocation notifications | -| 6 | Policies & Profiles | 6 | Policy CRUD (create/delete), invalid type 400, profile CRUD, list | Policy violation detection, profile enforcement on CSR | -| 7 | Ownership & Teams | 4 | Team CRUD, owner CRUD, agent groups list | Owner notification routing, dynamic group matching | -| 8 | Job System | 2 | List jobs, 404 on nonexistent | Job state transitions, approval workflow, cancellation | -| 9 | Issuer Connectors | 4 | List, get detail, create (GenericCA), missing name 400 | Test connection, issuer-specific issuance flow | -| 10 | Sub-CA Mode | SKIP | — | Requires CA cert+key on disk | -| 11 | ACME ARI | SKIP | — | Requires ARI-capable CA | -| 12 | Vault PKI | SKIP | — | Requires live Vault server | -| 13 | DigiCert | SKIP | — | Requires DigiCert sandbox | -| 14 | Target Connectors | 3 | List, create NGINX target, delete 204 | Deploy to real target, validate deployment | -| 15–17 | Apache/HAProxy, Traefik/Caddy, IIS 
| — | (Covered by source checks in Parts 42–46) | Requires real services or Windows | -| 18 | Agent Operations | 3 | Heartbeat (register), metadata check, auto-create on heartbeat | Agent binary behavior, key storage, discovery scan | -| 19 | Agent Work Routing | 1 | Empty work for agent with no targets | Scoped job assignment, multi-target fan-out | -| 20 | Post-Deployment Verification | 1 | 404 on nonexistent job verification | TLS probing, fingerprint comparison | -| 21 | EST Server | 2 | CACerts (200 + content-type), CSRAttrs (200/204) | simpleenroll with CSR, simplereenroll, PKCS#7 parsing | -| 22 | Certificate Export | 3 | PEM export, PKCS#12 export, 404 on nonexistent | Download mode, file content validation | -| 23 | S/MIME & EKU Support | 0 (NOT AUTOMATED) | — | S/MIME profile creation; EKU enforcement on issuance; SMIMECapabilities extension presence in issued cert; rejection of profile-violating EKU on CSR. Test manually — see the Coverage Map row | -| 24 | OCSP Responder & DER CRL | 0 (NOT AUTOMATED) | — | OCSP request/response (RFC 6960), DER CRL generation, status (Good/Revoked/Unknown), Must-Staple coordination. 
Test manually — see the Coverage Map row | -| 25 | Certificate Discovery | 5 | List discovered, summary, list scan targets, create target, invalid CIDR 400 | Agent filesystem scan, claim/dismiss workflow | -| 26 | Enhanced Query API | 4 | Sort descending, cursor pagination, time-range filter, invalid sort field | Field projection correctness, cursor token cycling | -| 27 | Request Body Size Limits | 1 | 2MB body rejected (413/400) | Exact limit boundary (1MB) | -| 28 | CLI | SKIP | — | Requires compiled `certctl-cli` binary | -| 29 | MCP Server | SKIP | — | Requires compiled `mcp-server` binary + stdio | -| 30 | Observability | 7 | Dashboard summary, certs by status, expiration timeline, job trends, issuance rate, JSON metrics (uptime + gauges), Prometheus (content-type + 4 metric names) | Chart rendering (GUI), Grafana import | -| 31 | Notifications | 2 | List, 404 on nonexistent | Notification content, mark-read, email/Slack delivery | -| 32 | Audit Trail | 3 | List events (≥10), PUT immutability, DELETE immutability | Actor attribution, body hash, time range filters | -| 33 | Background Scheduler | SKIP | — | Timing-dependent; verify via Docker logs | -| 34 | Structured Logging | SKIP | — | Requires Docker log inspection | -| 35 | GUI Testing | SKIP | — | Requires browser | -| 36–37 | Issuer Catalog, Frontend Audit | SKIP | — | Requires browser | -| 38 | Error Handling | 5 | Malformed JSON, missing required field, method not allowed, UTF-8 CN, empty body | Stack trace suppression, error response format | -| 39 | Performance | 5 | List certs < 200ms, stats < 500ms, metrics < 200ms, Prometheus < 300ms, audit < 500ms | Load testing, concurrent request handling | -| 40 | Documentation | 8 | README, quickstart, architecture, connectors exist; migration guides exist; 8 issuer types in docs; 11 target types in docs | Content accuracy, link validity | -| 41 | Regression | 3 | DELETE 204, per_page max fallback, network scan target seed count | `errors.Is(errors.New())` 
anti-pattern source scan | -| 42 | Envoy Target | 5 | Domain type, connector file, test file, OpenAPI, agent dispatch | Envoy deployment test, SDS config | -| 43 | Postfix/Dovecot | 3 | Domain types (Postfix + Dovecot), connector file, OpenAPI | Mail server deployment test | -| 44 | SSH Target | 4 | Domain type, connector file, agent dispatch (`sshconn`), OpenAPI | SSH deployment test (requires target host) | -| 45 | Windows Certificate Store | 3 | Domain type, connector file, shared certutil package | Windows deployment (requires Windows) | -| 46 | Java Keystore | 3 | Domain type, connector file, OpenAPI | JKS deployment (requires keytool) | -| 47 | Certificate Digest Email | 3 | Preview endpoint (200/503), service file, adapter file | SMTP delivery, HTML template rendering | -| 48 | Dynamic Issuer Config | 4 | Crypto package exists, create ACME issuer via API, config redaction check, migration exists | Test connection flow, registry rebuild | -| 49 | Dynamic Target Config | 2 | Create NGINX target via API, migration exists | Test connection via agent heartbeat | -| 50 | Onboarding Wizard | 2 | Wizard component exists, docker-compose split (clean vs demo) | Wizard UI flow, step completion | -| 51 | ACME Profile Selection | 3 | Profile module exists, frontend config, RFC 9702→9773 renumber check | Profile-aware issuance against real CA | -| 52 | Helm Chart | 5 | Chart.yaml, values.yaml, 4 templates exist, securityContext, health probes | `helm template` rendering, `helm install` | -| 53 | Kubernetes Secrets Target Connector (M47) | 18 | Config validation (namespace DNS-1123, secret name DNS subdomain, label keys, required fields), deployment (create/update Secret, chain concatenation, error propagation), validation (serial comparison, not-found, empty cert) | GUI target wizard KubernetesSecrets fields (namespace, secret_name, labels, kubeconfig_path), Helm RBAC toggle, TargetDetailPage type label | -| 54 | AWS ACM Private CA Issuer Connector (M47) | 23 | Config 
validation (region, CA ARN regex, signing algorithm whitelist, validity_days, defaults), issuance (full flow, empty CSR, errors), renewal (reuses issuance), revocation (reason mapping, default, errors), GetOrderStatus completed, GetCACertPEM (success/chain/error), GetRenewalInfo nil | GUI issuer wizard AWSACMPCA fields (region, ca_arn, signing_algorithm, validity_days, template_arn), seed data visibility, create issuer flow | -| 55 | Agent Soft-Retirement (I-004) | 0 (NOT AUTOMATED) | — | Soft-retire vs hard-retire; force flag; reason capture; foreign-key cascade behavior on retired-agent cert ownership; reactivation. Test manually — see the Coverage Map row | -| 56 | Notification Retry & Dead-Letter Queue (I-005) | 0 (NOT AUTOMATED) | — | Retry loop with exponential backoff, dead-letter transition after N retries, requeue endpoint (`POST /api/v1/notifications/{id}/requeue`), idempotency on retry. Test manually — see the Coverage Map row | - -**Totals (verified 2026-04-27):** 49 `Part_*` automation wrappers, ~159 leaf subtests, 11 fully -skipped Parts, 4 Parts not yet automated (23, 24, 55, 56), and an unspecified count of manual-only -flows (GUI, scheduler timing, Docker log inspection). Run `grep -cE 't\.Run\("Part[0-9]+_' deploy/test/qa_test.go` to count the `Part_*` automation wrappers -and re-verify the total above. - -## Coverage by Risk Class - -A buyer's QA lead reading this doc wants "where are the existential bugs caught?" — Bundle P / Strengthening #1 surfaces that view directly. The table below classifies each Part by risk class so reviewers can answer the existential-coverage question in one glance. 
- -| Risk class | Description | Parts in scope | Automation status | -|---|---|---|---| -| **Existential** (Critical paths — bugs would compromise CA, leak keys, mis-issue, bypass revocation) | Crypto, PKCS#7, local-issuer, OCSP/CRL, agent keygen, CSR validation | 5 (Revocation), 21 (EST), 23 (S/MIME EKU), 24 (OCSP/CRL), 47 (Digest with cert content), 53 (K8s Secrets), 54 (AWS PCA) | 5/7 automated; Parts 23 + 24 pending (Bundle I Skip stubs in `qa_test.go`; manual playbook in the Coverage Map below) | -| **High** (FSM corruption, credential leak, authn/z weakening) | Renewal, jobs, agents, issuers, deployment, scheduler | 4, 7, 8, 9, 18, 19, 20, 22, 25, 28, 29, 32, 33, 48, 49, 55, 56 | 14/17 automated; CLI / MCP / scheduler-loop are inherently SKIP (require compiled binaries / Docker logs); Parts 55 + 56 pending | -| **Medium** (Operational pain or silent data drift) | Targets, notifiers, observability, error handling, performance, regression | 14, 15-17, 30, 31, 38, 39, 40, 41, 42, 43, 44, 45, 46 | 14/14 automated (15-17 indirect via Parts 42–46) | -| **Low** (Hygiene) | Documentation, docs verification | 40 (Documentation), 50 (Onboarding) | 2/2 automated | -| **Frontend** (XSS, render correctness, mutation contracts) | GUI testing | 35, 36-37 | 0/3 automated in this suite (Vitest covers separately under `web/`); this doc punts to manual + Vitest | -| **Audit-relevant** | Audit trail, body-size limits, request limits, Helm chart deploy posture | 27, 32, 51, 52 | 4/4 automated | - -This is the table acquisition reviewers screenshot for their report. When a new Part_* subtest lands in `qa_test.go`, classify it here. - -## Test Categories - -The automated tests fall into four categories: - -### 1. API Integration Tests (majority) -Make real HTTP requests to the running server and verify status codes, response structure, and JSON field values. 
Examples: -- `POST /api/v1/certificates` with valid payload → 201 -- `GET /api/v1/certificates?status=Active` → all returned certs have `status: "Active"` -- `DELETE /api/v1/certificates/mc-qa-full` → 204 - -### 2. Database Verification Tests -Connect directly to PostgreSQL and verify schema state: -- Table count ≥ 19 (from migrations 000001–000010) -- Useful for catching migration regressions - -### 3. Source File Verification Tests -Read files from the repo checkout and verify structure: -- Domain types exist in `internal/domain/connector.go` (e.g., `TargetTypeEnvoy`) -- Connector implementations exist (e.g., `internal/connector/target/envoy/envoy.go`) -- Documentation contains expected content (all issuer/target types listed) -- No stale RFC 9702 references (replaced by RFC 9773) - -### 4. Performance Spot Checks -Timed API requests with threshold assertions: -- `GET /api/v1/certificates?per_page=15` < 200ms -- `GET /api/v1/stats/summary` < 500ms -- `GET /api/v1/metrics/prometheus` < 300ms - -## What This Test Does NOT Cover - -These gaps must be filled by manual testing — see each Coverage Map row for surface-area description: - -### Not Yet Automated (Parts 23, 24, 55, 56) - -These historical QA Parts are listed in the Coverage Map above but have no `Part_*` automation -in `qa_test.go` yet. They are operator-runnable from the manual playbook; QA-suite -automation should land before the next acquisition-grade release. 
- -- **Part 23: S/MIME & EKU Support** — profile-driven EKU enforcement; SMIMECapabilities extension -- **Part 24: OCSP Responder & DER CRL** — OCSP request/response correctness, CRL generation, Must-Staple coordination -- **Part 55: Agent Soft-Retirement (I-004)** — soft vs hard retire, FK cascade, reactivation -- **Part 56: Notification Retry & Dead-Letter Queue (I-005)** — retry semantics, dead-letter transition, requeue - -### External CA Integrations (Parts 10–13) -- **Sub-CA mode** — requires CA cert+key files on disk -- **ACME ARI** — requires a CA that supports RFC 9773 Renewal Information -- **Vault PKI** — requires a running HashiCorp Vault instance -- **DigiCert / Sectigo / Google CAS** — requires sandbox API credentials - -### Browser/GUI Testing (Parts 35–37, 50) -- Dashboard chart rendering (Recharts) -- Onboarding wizard step-by-step flow -- Issuer catalog card layout and create wizard -- Bulk operations UI (multi-select, progress bars) -- Discovery triage workflow - -### Real Deployment Testing (Parts 15–17) -- NGINX/Apache/HAProxy file write + reload -- Traefik/Caddy file provider or API reload -- IIS PowerShell/WinRM (requires Windows) -- F5 BIG-IP iControl REST (requires appliance or mock) -- SSH agentless deployment (requires target host) - -### Agent Binary Behavior (Parts 18, 28–29) -- Agent-side ECDSA key generation and CSR submission -- Agent filesystem discovery scan -- CLI tool (`certctl-cli`) — all 10 subcommands -- MCP server (`mcp-server`) — stdio transport - -### Timing-Dependent Tests (Parts 33–34) -- Background scheduler loop execution (renewal, jobs, health, notifications, digest, network scan) -- Structured logging format verification (requires Docker log parsing) - -## How This Relates to `integration_test.go` - -Both files live in `deploy/test/` in the same Go package (`integration_test`): - -| | `qa_test.go` | `integration_test.go` | -|---|---|---| -| **Build tag** | `//go:build qa` | `//go:build integration` | -| **Target 
stack** | Demo (`docker-compose.yml` + `docker-compose.demo.yml`) | Test (`docker-compose.test.yml`) | -| **Port** | 8443 | Different (test stack config) | -| **Seed data** | `seed_demo.sql` (32 certs, 12 agents, 13 issuers, 8 targets, realistic history) | Minimal (created by tests) | -| **CA backends** | Local CA only (demo mode) | Pebble ACME, step-ca, NGINX | -| **Purpose** | Release QA — broad coverage, spot checks | Functional — end-to-end issuance, renewal, revocation against real CAs | -| **Run frequency** | Before each release tag | CI on every PR | - -They are complementary. Integration tests prove the machinery works. QA tests prove the product works at release quality. - -## Seed Data Reference - -The QA tests depend on `migrations/seed_demo.sql`. Key IDs used: - -### Certificates (32 total in `managed_certificates`) - -The full canonical list is generated by: -``` -sed -n '/^INSERT INTO managed_certificates/,/^;/p' migrations/seed_demo.sql \ - | grep -oE "^\s*\('mc-[a-z0-9_-]+" | sed -E "s/^\s*\('//" | sort -u -``` - -Hand-listing is unsustainable as the seed grows; tests reference IDs by lookup, not by enumeration. -Sample IDs: `mc-api-prod`, `mc-web-prod`, `mc-pay-prod`, `mc-compromised`, `mc-smime-bob`, `mc-edge-eu`, `mc-k8s-ingress`, `mc-wildcard-prod`. See `migrations/seed_demo.sql:147` onward. 
- -### Agents (12 total in `agents` table) - -8 named workload agents + 1 server-side sentinel + 3 cloud-discovery sentinels: - -- **Workload agents:** `ag-web-prod`, `ag-web-staging`, `ag-lb-prod`, `ag-iis-prod`, `ag-data-prod`, `ag-edge-01`, `ag-k8s-prod`, `ag-mac-dev` -- **Server-side sentinel:** `server-scanner` -- **Cloud-discovery sentinels:** `cloud-aws-sm`, `cloud-azure-kv`, `cloud-gcp-sm` - -Full list via: -``` -sed -n '/^INSERT INTO agents/,/^;/p' migrations/seed_demo.sql \ - | grep -oE "^\s*\('[a-z][a-z0-9_-]+" | sed -E "s/^\s*\('//" -``` - -(The `agent_groups` table also contains entries with `ag-*` IDs — `ag-linux-prod`, `ag-windows`, `ag-datacenter-a`, `ag-arm64`, `ag-manual` — but those are *group* IDs, not agents. Don't confuse the two.) - -### Issuers (13 total) - -`iss-local`, `iss-acme-le`, `iss-stepca`, `iss-acme-zs`, `iss-openssl`, `iss-vault`, `iss-digicert`, `iss-sectigo`, `iss-googlecas`, `iss-awsacmpca`, `iss-entrust`, `iss-globalsign`, `iss-ejbca`. - -Full list via: -``` -sed -n '/^INSERT INTO issuers/,/^;/p' migrations/seed_demo.sql \ - | grep -oE "^\s*\('iss-[a-z0-9_-]+" | sed -E "s/^\s*\('//" -``` - -### Targets (8 total in `deployment_targets`) -`tgt-nginx-prod`, `tgt-nginx-staging`, `tgt-haproxy-prod`, `tgt-apache-prod`, `tgt-iis-prod`, `tgt-traefik-prod`, `tgt-caddy-prod`, `tgt-nginx-data` - -### Network Scan Targets (4 total in `network_scan_targets`) -`nst-dc1-web`, `nst-dc2-apps`, `nst-dmz`, `nst-edge` - -**Maintenance note:** when adding new seed rows, also update this section, OR remove the -per-table counts and rely on the `sed | grep` commands so the doc stops drifting on every -seed-data change. A CI guard that fails when the doc count diverges from the seed file is -proposed in `coverage-audit-2026-04-27/tables/qa-doc-strengthening.md` (Strengthening #6). - -## Troubleshooting - -### "Server unreachable" on startup -The test pings `GET /health` before running anything. 
If this fails: -```bash -# Check if the stack is running -docker compose -f docker-compose.yml -f docker-compose.demo.yml ps - -# Check server logs -docker compose -f docker-compose.yml -f docker-compose.demo.yml logs certctl-server - -# Check if the port is exposed (self-signed cert — pin CA bundle) -curl --cacert ./deploy/test/certs/ca.crt -s https://localhost:8443/health -``` - -### "connect to QA DB" failure -The database tests connect directly to PostgreSQL. Ensure port 5432 is exposed: -```bash -docker compose -f docker-compose.yml -f docker-compose.demo.yml port postgres 5432 -``` - -### Performance tests flaking -The performance thresholds (200ms, 300ms, 500ms) assume a local Docker stack. On slow CI runners or remote Docker hosts, increase the thresholds or skip Part 39 (Go's `-run` pattern is RE2 and has no negative lookahead, so exclude the Part with `-skip`, available since Go 1.20): -```bash -go test -tags qa -v -run TestQA -skip 'TestQA/Part39' ./... -``` - -### Source file checks failing -The `fileExists` and `fileContains` helpers read from `CERTCTL_QA_REPO_DIR` (default `../..`). If running from a non-standard location: -```bash -CERTCTL_QA_REPO_DIR=/absolute/path/to/certctl go test -tags qa -v ./... -``` - -## Release Day Sign-Off Matrix - -Before tagging a release, the QA-on-call engineer signs off on each row. This matrix replaces the previous ad-hoc release checklist and ties test execution directly to release approval. Acquisition-grade releases have this kind of matrix; the doc previously didn't. 
- -| Sign-off | Evidence | Owner | Result | Date | -|---|---|---|---|---| -| `make verify` clean on master | CI run URL | Eng-on-call | ☐ | | -| `go test -tags qa ./deploy/test/...` ≥ 95% pass rate (skips counted as pass) | Test output | QA-on-call | ☐ | | -| `go test -race -count=10 ./internal/...` 0 races | `tool-output/race-x10.txt` | QA-on-call | ☐ | | -| Coverage ≥ thresholds in `ci.yml` (service / handler / crypto / local-issuer / acme / stepca / mcp) | `tool-output/cover-summary.txt` | QA-on-call | ☐ | | -| Helm chart `helm lint && helm template` clean | `tool-output/helm.txt` | DevOps-on-call | ☐ | | -| All `t.Skip` sites have current rationales (see Bundle O audit; CI guard catches new orphans) | `make qa-stats` t.Skip count | QA-on-call | ☐ | | -| Frontend: Vitest run clean; per-page coverage ≥ 70% | `web/tool-output/vitest.txt` | Frontend-on-call | ☐ | | -| Manual Parts 23, 24, 55, 56 executed (or explicit defer with rationale) | This sheet | QA-on-call | ☐ | | -| Demo stack `docker compose up -d --build` smoke (`/health` 200, `/ready` 200) | curl receipt | QA-on-call | ☐ | | -| `govulncheck ./...` clean (or deferred-call advisories tracked in `gap-backlog`) | `tool-output/govulncheck.json` | Security-on-call | ☐ | | -| QA-doc drift guards green (Part-count + cert-count) | CI run URL | QA-on-call | ☐ | | -| FSM transition coverage tables (`coverage-audit-2026-04-27/tables/fsm-coverage.md`) — Existential FSMs ≥80% legal + 100% illegal | This sheet | QA-on-call | ☐ | | - -**Sign-off owner:** ______________________   **Date:** ______   **Tag:** v__.__.__ - -## Mutation Testing Targets & Kill Rate - -Mutation testing exposes which assertions are actually load-bearing — tests can pass against broken code if mutations survive, which is a coverage trap. The audit's Phase 0 attempted to run `go-mutesting` on the Existential cluster but was blocked by a Go 1.25 / arm64 incompatibility in `osutil@v1.6.1` (uses `syscall.Dup2` which is undefined on linux/arm64). 
The operator-runnable workaround uses a fork that targets `unix.Dup3` instead. - -| Package | Risk class | Target kill rate | Last measured | Tool | -|---|---|---|---|---| -| `internal/crypto` | Existential | ≥90% | unmeasured (sandbox-blocked, operator-runnable) | go-mutesting | -| `internal/pkcs7` | Existential | ≥90% | unmeasured | go-mutesting | -| `internal/connector/issuer/local` | Existential | ≥90% | unmeasured | go-mutesting | -| `internal/connector/issuer/acme` | Existential | ≥80% (catch-up; failure-mode coverage 55.6% per Bundle J) | unmeasured | go-mutesting | -| `internal/connector/issuer/stepca` | Existential | ≥85% (post-Bundle-L.B coverage at 90.4%) | unmeasured | go-mutesting | -| `internal/api/middleware` | High | ≥80% | unmeasured | go-mutesting | -| `internal/validation` | Existential (CWE-78 / CWE-113 boundary) | ≥90% | unmeasured | go-mutesting | -| `web/src/utils/safeHtml.ts` | Frontend (XSS gate) | ≥90% | unmeasured | Stryker | - -### Operator command (per package) - -```bash -# Use the avito-tech fork that supports linux/arm64 + Go 1.25. -go install github.com/avito-tech/go-mutesting/cmd/go-mutesting@latest - -mkdir -p tool-output -$(go env GOPATH)/bin/go-mutesting --debug ./internal/crypto/... \ - > tool-output/mutation-crypto.txt 2>&1 -grep -oE 'mutation score is [0-9.]+' tool-output/mutation-crypto.txt | tail -1 -``` - -**Acceptance:** ≥80% (Existential) / ≥70% (High). Anything below is a Medium finding; triage entries go in `coverage-audit-2026-04-27/gap-backlog.md`. This subsection moves mutation testing from "future work" to "documented release gate." - -## Adding New Tests - -When a new feature ships: - -1. **Add a Part section** in `qa_test.go` following the numbering convention in the Coverage Map below -2. **API tests**: use `c.get()`, `c.post()`, `c.bodyStr()`, `c.getJSON()`, `c.timedGet()` -3. **Source checks**: use `fileExists(t, "relative/path")` and `fileContains(t, "path", "substring")` -4. 
**DB checks**: use `openQADB(t)` and `db.queryInt(t, "SELECT ...")` -5. **Cleanup**: always use `t.Cleanup()` for data created during tests -6. **Skip if external**: use `t.Skip("Requires X — manual test")` with a clear reason - -## Version History - -- **v1.3** (April 2026, post-Bundle-P) — QA Doc Strengthening shipped. New top-of-doc Test Suite Health dashboard (regenerated via `make qa-stats`). New Coverage by Risk Class table after the Coverage Map. New Release Day Sign-Off Matrix and Mutation Testing Targets sections. CI seed-count + Part-count drift guards land in `.github/workflows/ci.yml` so future doc drift fails CI. Bundle P closes M-007 / M-010 / M-011 / M-012 (structural strengthening) + M-008 (Mutation Testing Targets). -- **v1.2** (April 2026, post-coverage-audit) — Documented Parts 55–56 (I-004 Agent Soft-Retirement, I-005 Notification Retry & Dead-Letter) and surfaced Parts 23–24 (S/MIME & EKU; OCSP/CRL) as not-yet-automated. 56 Parts total in `testing-guide.md`; 49 live `Part_*` automation wrappers in `qa_test.go` + 4 new `Skip` stubs for Parts 23/24/55/56 = 53 wrappers (Parts 15–17 remain covered by source-checks in Parts 42–46). Reconciled seed-data section to actual `seed_demo.sql` counts (12 agents, 13 issuers; certs were already accurate at 32). Bundle I of the 2026-04-27 coverage-audit closure plan. -- **v1.1** (April 2026) — Added Parts 53–54 (M47: Kubernetes Secrets target + AWS ACM PCA issuer). 54 Parts total, ~164 automated subtests. -- **v1.0** (April 2026) — Initial release covering all 52 Parts of testing-guide.md v2.1. Replaces `qa-smoke-test.sh`. diff --git a/docs/contributor/release-sign-off.md b/docs/contributor/release-sign-off.md deleted file mode 100644 index 3a96590..0000000 --- a/docs/contributor/release-sign-off.md +++ /dev/null @@ -1,93 +0,0 @@ -# Release Sign-Off - -> Last reviewed: 2026-05-05 - -Release-day checklist for tagging a new certctl release. 
Walks through the gates that must be green before pushing the tag, in the order they should be verified. - -## Pre-release: code state - -| Gate | How to check | Pass | -|---|---|---| -| `master` is at the commit you intend to tag | `git log -1 --format='%H %s'` | ☐ | -| Working tree clean | `git status -sb` | ☐ | -| Local matches GitHub | `curl -sS https://api.github.com/repos/certctl-io/certctl/commits/master \| grep -oE '"sha": "[a-f0-9]+"' \| head -1` matches local | ☐ | -| `WORKSPACE-CHANGELOG.md` updated with the release's milestones | manual review | ☐ | -| `certctl/CHANGELOG.md` updated (release-facing) | manual review | ☐ | -| Migration ladder ends cleanly | `ls migrations/*.up.sql \| sort \| tail -3` shows the right last migration | ☐ | - -## Pre-release: automated gates (CI) - -| Gate | How to check | Pass | -|---|---|---| -| CI pipeline green on the tag-target commit | GitHub Actions web UI | ☐ | -| `make verify` clean locally | run from repo root | ☐ | -| `go test -race -count=1 ./...` clean | full race check | ☐ | -| `golangci-lint run ./...` clean | local lint | ☐ | -| `govulncheck ./...` clean | vulnerability scan | ☐ | -| Coverage thresholds met (service ≥55%, handler ≥60%, domain ≥40%, middleware ≥30%) | `go test -coverprofile=cover.out ./... 
&& go tool cover -func=cover.out` | ☐ | -| Frontend type-check + Vitest + Vite build clean | `cd web && npm run typecheck && npm run test && npm run build` | ☐ | - -## Pre-release: manual QA passes - -| Surface | Checklist | Pass | -|---|---|---| -| Local stack boots clean from scratch | `qa-prerequisites.md` Steps 1-4 green | ☐ | -| GUI QA checklist | `gui-qa-checklist.md` end to end | ☐ | -| End-to-end test environment | `test-environment.md` Steps 1-14 green | ☐ | -| Performance baselines | `performance-baselines.md` four spot checks within bounds | ☐ | -| Helm chart deploys clean | `helm-deployment.md` install + verify | ☐ | -| ACME server interop (cert-manager) | `make acme-cert-manager-test` green | ☐ | -| ACME server RFC conformance (lego) | `make acme-rfc-conformance-test` green | ☐ | - -## Release artefact verification - -After the release workflow runs (triggered by tag push), verify the published artefacts: - -| Artefact | How to verify | Pass | -|---|---|---| -| Cosign keyless OIDC signature on `checksums.txt` | per `docs/reference/release-verification.md` step 2 | ☐ | -| SLSA Level 3 provenance on each binary | step 3 | ☐ | -| Container image signature + SBOM + provenance | step 4 | ☐ | -| Release notes published on GitHub Releases page | manual review | ☐ | -| ghcr.io images at `ghcr.io/certctl-io/certctl-{server,agent}:` pullable | `docker pull` round-trips | ☐ | - -## Branch protection + tag push - -| Gate | How to check | Pass | -|---|---|---| -| `master` branch protection rule allows the tag push | Repository Settings → Branches | ☐ | -| Tag pushed | `git tag -s v -m 'Release v'; git push origin v` | ☐ | -| Release workflow kicked off in GitHub Actions | watch the Actions tab | ☐ | - -## Post-release - -| Gate | How to check | Pass | -|---|---|---| -| Release workflow completed without errors | GitHub Actions | ☐ | -| Sample binary downloaded and Cosign-verified by an operator who is not the release author | another team member | ☐ | -| 
`WORKSPACE-CHANGELOG.md` notes the tag commit SHA | manual edit | ☐ | -| workspace-tracking "Active Focus" → "Current tag" updated | manual edit | ☐ | -| `certctl.io/index.html` star count + `data-gh-version` rendering picks up the new tag | open the landing page in 6+ hours (cache TTL) | ☐ | -| Reddit / Hacker News / LinkedIn announcement drafted (if a major release) | per the operator's promotion playbook | ☐ | - -## If a gate fails - -Revert the tag push immediately: - -```bash -git push --delete origin vX.Y.Z -git tag -d vX.Y.Z -``` - -Investigate, fix, re-tag. - -## Related docs - -- [`docs/contributor/qa-prerequisites.md`](qa-prerequisites.md) — local stack prereqs -- [`docs/contributor/test-environment.md`](test-environment.md) — full local environment tutorial -- [`docs/contributor/gui-qa-checklist.md`](gui-qa-checklist.md) — GUI manual QA pass -- [`docs/contributor/testing-strategy.md`](testing-strategy.md) — what we test in CI vs deep-scan vs manual QA -- [`docs/contributor/ci-pipeline.md`](ci-pipeline.md) — CI shape and regression guards -- [`docs/operator/performance-baselines.md`](../operator/performance-baselines.md) — performance regression spot checks -- [`docs/operator/helm-deployment.md`](../operator/helm-deployment.md) — Helm install + verify -- [`docs/reference/release-verification.md`](../reference/release-verification.md) — Cosign / SLSA / SBOM verification procedure diff --git a/docs/contributor/test-environment.md b/docs/contributor/test-environment.md deleted file mode 100644 index ed19964..0000000 --- a/docs/contributor/test-environment.md +++ /dev/null @@ -1,1103 +0,0 @@ -# certctl Testing Environment - -> Last reviewed: 2026-05-05 - -A step-by-step guide to running certctl locally with real certificate authorities. Every command is spelled out. Every expected output is shown. If something goes wrong, the troubleshooting section tells you exactly what to check. - ---- - -## What Is This?
- -certctl manages TLS certificates — the things that put the padlock icon in your browser. This test environment lets you run the entire platform on your laptop so you can see it work end-to-end: create a certificate, have it signed by a CA, deploy it to a web server, and watch the dashboard track everything. - -You'll start 7 Docker containers that talk to each other: - -| Container | What it does | IP Address | You access it at | -|---|---|---|---| -| **PostgreSQL** | Stores all certctl data (certs, jobs, agents, audit trail) | 10.30.50.2 | Not directly — internal only | -| **pebble-challtestsrv** | DNS/HTTP challenge test server for Pebble | 10.30.50.3 | Not directly — Pebble talks to it | -| **Pebble** | A fake Let's Encrypt (tests the ACME protocol without touching the real internet) | 10.30.50.4 | Not directly — the server talks to it | -| **step-ca** | A private Certificate Authority (think: your company's internal CA) | 10.30.50.5 | Not directly — the server talks to it | -| **certctl-server** | The brain. API + web dashboard + scheduler + ACME challenge server | 10.30.50.6 | **https://localhost:8443** (self-signed — see CA-bundle note below) | -| **NGINX** | A web server. The agent deploys certificates here. | 10.30.50.7 | **https://localhost:8444** | -| **certctl-agent** | The hands. Generates keys, deploys certs to NGINX | 10.30.50.8 | Not directly — it talks to the server | - -**Why 7 containers?** Because certctl sits between CAs (who sign certificates) and servers (who use certificates). To test the full flow, you need at least one CA and one server. We include two different CAs (Pebble for ACME, step-ca for private CA) plus a third built-in one (Local CA) so you can test all three issuance methods. - -**Why static IPs?** Pebble uses challtestsrv as its DNS server (it needs to know the IP). challtestsrv resolves all domains to the certctl-server (10.30.50.6) so Pebble can validate HTTP-01 challenges. 
Static IPs avoid DNS race conditions during startup. - ---- - -## Before You Start - -### Install Docker Desktop - -If you don't have Docker yet: - -1. Go to [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/) -2. Download Docker Desktop for your OS (Mac, Windows, or Linux) -3. Install it and open it -4. Wait for the Docker icon in your menu bar/taskbar to say "Docker Desktop is running" - -Verify it works by opening a terminal and running: - -```bash -docker --version -``` - -You should see something like: - -``` -Docker version 27.x.x, build xxxxxxx -``` - -If you get "command not found", Docker isn't installed or isn't in your PATH. Restart your terminal and try again. - -Also verify Docker Compose is available: - -```bash -docker compose version -``` - -You should see: - -``` -Docker Compose version v2.x.x -``` - -If this says "command not found", you have an old Docker version. Update Docker Desktop. - -### Make Sure You Have the certctl Repo - -You need the certctl source code on your machine. If you haven't cloned it yet: - -```bash -git clone https://github.com/certctl-io/certctl.git -cd certctl -``` - -If you already have it, make sure you're on the latest version: - -```bash -cd certctl -git pull -``` - ---- - -## Step 1: Start Everything - -Open a terminal. Navigate to the `deploy` directory inside the certctl repo: - -```bash -cd certctl/deploy -``` - -Verify you're in the right place: - -```bash -ls docker-compose.test.yml -``` - -You should see: - -``` -docker-compose.test.yml -``` - -If you see "No such file or directory", you're in the wrong directory. Run `pwd` to see where you are, then navigate to the correct path. 
- -Now start the test environment: - -```bash -docker compose -f docker-compose.test.yml up --build -``` - -**What this does**: Builds the certctl server and agent from source code (compiles Go + React), downloads Docker images for PostgreSQL, NGINX, Pebble, and step-ca, then starts all 7 containers. - -**First run takes 2-5 minutes** because it has to: -- Download ~2 GB of Docker images -- Compile the Go server binary -- Compile the React frontend -- Wait for each service to become healthy - -**What you'll see**: A wall of colored log lines from all 7 containers. This is normal. You're looking for lines like: - -``` -certctl-test-server | {"level":"INFO","msg":"server started","address":"0.0.0.0:8443"} -certctl-test-agent | {"level":"INFO","msg":"agent starting","server_url":"https://certctl-server:8443"} -certctl-test-stepca | Serving HTTPS on :9000 ... -certctl-test-pebble | Listening on: 0.0.0.0:14000 -``` - -**Leave this terminal running.** The logs will keep scrolling — that's fine. You need a second terminal for the next steps. - -### Open a Second Terminal - -Open a new terminal window or tab. Navigate to the deploy directory again: - -```bash -cd certctl/deploy -``` - -Check that all containers are up: - -```bash -docker compose -f docker-compose.test.yml ps -``` - -You should see 7 services. The important thing is that none say `Exit` or `Restarting`: - -``` -NAME STATUS -certctl-test-agent Up -certctl-test-challtestsrv Up -certctl-test-nginx Up (healthy) -certctl-test-pebble Up -certctl-test-postgres Up (healthy) -certctl-test-server Up (healthy) -certctl-test-stepca Up (healthy) -``` - -**If certctl-test-server says "Restarting"**: It probably started before step-ca or Pebble were ready. Wait 30 seconds and check again. If it keeps restarting, see [Troubleshooting](#troubleshooting). 
- -### Get the CA bundle for curl - -The test harness runs HTTPS-only (the `certctl-tls-init` init container self-signs an ECDSA-P256 server cert with a SHA-256 signature into a bind-mounted directory before the server starts — see `docker-compose.test.yml` §`certctl-tls-init` for details). The CA cert that signed it is materialized on the host at `./test/certs/ca.crt` (relative to the `deploy/` directory). Every `curl` in the rest of this doc expects it in `$CA`: - -```bash -export CA=$PWD/test/certs/ca.crt -ls -la "$CA" # sanity check: file should exist and be non-empty -curl --cacert "$CA" -f https://localhost:8443/health -``` - -Expect `{"status":"ok"}`. If `curl` errors with `SSL certificate problem: unable to get local issuer certificate`, the init container hasn't finished yet — wait a few seconds and retry. If the file doesn't exist at all, the bind mount didn't populate; `docker compose -f docker-compose.test.yml logs certctl-tls-init` should show the self-sign ran. - -For a full explanation of the cert provisioning patterns (self-signed bootstrap, operator-supplied, cert-manager), see [`tls.md`](../operator/tls.md). For the one-step cutover from the old plaintext test harness to HTTPS, see [`upgrade-to-tls.md`](../archive/upgrades/to-tls-v2.2.md). - ---- - -## Step 2: Open the Dashboard - -Open your web browser and go to: - -**https://localhost:8443** - -Your browser will warn you that the cert is self-signed ("Your connection is not private" / "NET::ERR_CERT_AUTHORITY_INVALID"). That's expected for the test harness — the CA that signed the cert lives at `deploy/test/certs/ca.crt` and isn't in your system trust store. Click through the warning (Chrome: "Advanced" → "Proceed"; Firefox: "Accept the Risk"; Safari: "Show Details" → "visit this website"). - -You'll see a login screen asking for an API key. Enter: - -``` -test-key-2026 -``` - -Click "Login" (or press Enter). - -**What you should see**: The certctl dashboard. 
It will be mostly empty because we haven't created any certificates yet. That's expected — you're looking at a clean environment. - -You should see a sidebar on the left with navigation items like Dashboard, Certificates, Jobs, Agents, Issuers, Targets, etc. - -**If the page doesn't load**: The server might still be starting. Wait 30 seconds and refresh. Check that `certctl-test-server` shows "healthy" in `docker compose ps`. - -**If you get "Unauthorized"**: Make sure you typed the API key exactly: `test-key-2026` - ---- - -## Step 3: Verify the Pre-Seeded Data - -The test environment comes with **pre-seeded data** in the database. This gives you everything you need to start testing immediately — an agent, an owner, a team, three issuers (one per CA), a certificate profile, and an NGINX deployment target. No manual setup required. - -The seed data comes from two files: -- `migrations/seed.sql` — default renewal policy and policy rules (loaded in all environments) -- `migrations/seed_test.sql` — test-specific data: team, owner, agent, issuers, profile, and NGINX target - -Go back to your second terminal. Let's verify the data loaded correctly. - -### Check the agent - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/agents | python3 -m json.tool -``` - -**What this command does**: -- `curl` makes an HTTPS request (like a browser but from the terminal) -- `--cacert "$CA"` pins the test harness's self-signed root as the only trust anchor for this call — matches what you exported in Step 1 -- `-s` means "silent" (don't show progress bars) -- `-H "Authorization: Bearer test-key-2026"` sends the API key (same one you used to log in) -- `python3 -m json.tool` formats the JSON response so it's readable - -**What you should see**: A JSON response showing agents, including `agent-test-01`: - -```json -{ - "agents": [ - { - "id": "agent-test-01", - "name": "test-agent-01", - "status": "online", - ... - } - ], - ... 
-} -``` - -The important parts: `"id": "agent-test-01"` and `"status": "online"`. If the status says `"online"`, the agent container has already sent its first heartbeat to the server. - -**If the status is still "offline"**: The agent container hasn't finished starting. Wait 30 seconds and try again. The agent sends a heartbeat every 60 seconds. - -**If you get "Connection refused"**: The server isn't running. Run `docker compose -f docker-compose.test.yml ps` and check the server status. - -### Check the issuers - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/issuers | python3 -m json.tool -``` - -You should see three issuers: -- `iss-local` — Local CA (Self-Signed) -- `iss-acme-staging` — ACME (Pebble Test CA) -- `iss-stepca` — step-ca (Private CA) - -### Check the target - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/targets | python3 -m json.tool -``` - -You should see `target-test-nginx` — the NGINX deployment target, assigned to `agent-test-01`. - -The target config uses no-op commands for `reload_command` and `validate_command` (both set to `"true"`, the Unix command that always succeeds). This is because the agent runs in a separate container from NGINX — it can't directly run `nginx -s reload`. Instead, the agent writes cert files to a shared Docker volume, and we reload NGINX manually (or via the test script). - -### See it all in the dashboard - -Open the dashboard at https://localhost:8443 and click through the sidebar: -- **Agents** — you should see `test-agent-01` -- **Issuers** — you should see all three CAs -- **Targets** — you should see `Test NGINX` - -Everything is wired up. The agent knows about the server, the server knows about the agent, and the NGINX target is linked to the agent. Time to issue certificates. - ---- - -## Step 4: Issue Your First Certificate (Local CA) - -Now the good part. 
You're going to create a certificate record and trigger issuance. Here's what will happen behind the scenes: - -1. You tell the server "I want a certificate for local.certctl.test" -2. The server creates an issuance **job** (status: AwaitingCSR) and waits -3. The agent **polls** the server for work (every 30 seconds) -4. The agent sees the job, **generates an ECDSA P-256 key pair** locally -5. The agent creates a **CSR** (Certificate Signing Request) containing the public key — NOT the private key -6. The agent submits the CSR to the server -7. The server forwards the CSR to the **Local CA** issuer, which signs it -8. The server stores the signed certificate and creates a **deployment job** (status: Pending) -9. The agent picks up the deployment job, fetches the signed cert, reads the local private key -10. The agent writes cert + key + chain to the shared NGINX volume (`/nginx-certs/`) -11. You reload NGINX, and it starts serving the new certificate - -The private key **never leaves the agent**. The server only ever sees the CSR (public key + metadata). - -**Important**: The deployment job is routed to the specific agent via `agent_id`. The server's job processor skips deployment jobs that have an `agent_id` set — those are exclusively for the agent to pick up via polling. This prevents a race condition where the server would set the job to "Running" before the agent could see it. 
- -### Step 4a: Create the certificate record - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/json" \ - -d '{ - "id": "mc-local-test", - "name": "local-test-cert", - "common_name": "local.certctl.test", - "sans": ["local.certctl.test"], - "issuer_id": "iss-local", - "owner_id": "owner-test-admin", - "team_id": "team-test-ops", - "renewal_policy_id": "rp-default", - "certificate_profile_id": "prof-test-tls", - "environment": "development" - }' | python3 -m json.tool -``` - -**What each field means**: -- `id`: Unique certificate identifier (you choose this; the `mc-` prefix is convention for "managed certificate") -- `name`: Human-readable display name (must be unique across all certs) -- `common_name`: The domain name for the certificate. Doesn't need to be a real domain for testing. -- `sans`: Subject Alternative Names — additional domain names the cert is valid for. Always include the common_name here too. -- `issuer_id`: Which CA should sign this cert. `iss-local` is the built-in self-signed CA (pre-seeded in Step 3). -- `owner_id`: Who owns this certificate. `owner-test-admin` was pre-seeded. This controls notification routing. -- `team_id`: Which team is responsible. `team-test-ops` was pre-seeded. Used for organizational grouping. -- `renewal_policy_id`: The renewal rules to follow. `rp-default` was created by seed.sql — 30-day renewal window, auto-renew enabled, alert at 30/14/7/0 days before expiry. -- `certificate_profile_id`: Crypto constraints. `prof-test-tls` allows ECDSA P-256 and RSA-2048 keys, 90-day max TTL, serverAuth EKU. -- `environment`: A label for organization (development, staging, production) - -**What you should see**: The certificate record echoed back as JSON with `"status": "pending"`. - -**If you get a 400 error** with a message about a missing field: double-check that every field in the JSON above is present. 
The API requires `name`, `common_name`, `owner_id`, `team_id`, `issuer_id`, and `renewal_policy_id` — all of them. - -This just creates the record. The certificate isn't issued yet. - -### Step 4b: Link it to the NGINX target - -The certificate record exists, but certctl doesn't know WHERE to deploy it yet. We need to create a mapping in the `certificate_target_mappings` table that says "deploy this cert to this target." This is done via SQL (the API doesn't expose a mapping endpoint): - -```bash -docker exec certctl-test-postgres psql -U certctl -d certctl -c \ - "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-local-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" -``` - -**What this does**: Inserts a row into the join table that links your certificate to the NGINX target. When certctl creates deployment jobs, it queries this table to figure out where to deploy. - -**If you get "connection refused"**: The postgres container isn't running. Check `docker compose ps`. - -### Step 4c: Trigger issuance - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates/mc-local-test/renew \ - -H "Authorization: Bearer test-key-2026" | python3 -m json.tool -``` - -**What this does**: Tells certctl "issue (or renew) this certificate now." The server creates a job, and the background system takes over. - -**What you should see**: A JSON response confirming the job was created. - -### Step 4d: Watch it happen - -Switch to your first terminal (the one running `docker compose up`) and watch the logs. You should see a sequence like this (simplified): - -``` -certctl-test-server | "msg":"created renewal job" ... -certctl-test-agent | "msg":"polling for work" ... -certctl-test-agent | "msg":"generating ECDSA P-256 key pair" ... -certctl-test-agent | "msg":"submitting CSR" ... -certctl-test-server | "msg":"CSR received, forwarding to issuer" ... -certctl-test-server | "msg":"certificate signed by Local CA" ... 
-certctl-test-agent | "msg":"deploying certificate to target" ... -certctl-test-agent | "msg":"deployment complete" ... -``` - -This takes about 30-60 seconds because the agent polls for work every 30 seconds. - -### Step 4e: Reload NGINX and verify - -The agent writes cert files to the shared volume, but NGINX doesn't automatically detect the change (the agent's reload command is a no-op in this test setup). Reload NGINX manually: - -```bash -docker exec certctl-test-nginx nginx -s reload -``` - -Wait a few seconds, then check what certificate NGINX is now serving: - -```bash -echo | openssl s_client -connect localhost:8444 -servername local.certctl.test 2>/dev/null \ - | openssl x509 -noout -subject -issuer -dates -``` - -**What this command does**: Connects to NGINX on port 8444 (HTTPS), downloads the certificate it presents, and prints the subject (who the cert is for), issuer (who signed it), and validity dates. - -**What you should see**: - -``` -subject=CN=local.certctl.test -issuer=CN=certctl Local CA -notBefore=... -notAfter=... -``` - -The `subject` should match the domain name you chose. The `issuer` should say "certctl Local CA". The dates should show it was just issued (today) and expires in about 90 days. - -**If you see the old self-signed placeholder cert** (issuer says something like `CN=placeholder.certctl.test`): The deployment hasn't happened yet. Wait another 30 seconds for the agent to poll, then reload NGINX and try again. Check the agent logs for errors. - -### Step 4f: Check the dashboard - -Open the dashboard at https://localhost:8443 and: - -1. Click **Certificates** in the sidebar — you should see `mc-local-test` with status "Active" -2. Click on it to see the detail page — you should see version history, the signed certificate details, and the deployment timeline -3. Click **Jobs** — you should see the issuance and deployment jobs with their statuses - ---- - -## Step 5: Issue a Certificate via ACME (Pebble) - -This is the real deal. 
ACME is the protocol that Let's Encrypt uses to issue certificates automatically. Pebble is a test ACME server that runs locally — it does everything real Let's Encrypt does, just without the internet. - -**How it works behind the scenes**: When you trigger issuance, certctl talks to Pebble and says "I want a cert for acme.certctl.test." Pebble says "prove you control that domain — serve this random token at `http://acme.certctl.test/.well-known/acme-challenge/`." certctl starts a temporary HTTP server on port 80 inside the certctl-server container (10.30.50.6) to serve the token. Meanwhile, Pebble resolves `acme.certctl.test` via challtestsrv, which is configured to return 10.30.50.6 for ALL domains. So Pebble connects to the certctl-server on port 80, finds the challenge token, and validates. It's all self-contained within the Docker network. - -**Key detail**: The `CERTCTL_ACME_INSECURE=true` env var is set on the server because Pebble uses a self-signed TLS certificate on its ACME directory endpoint (port 14000). Without this flag, Go's HTTP client would reject the connection. This is only for test environments — never use this in production. - -### Step 5a: Create the certificate record - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/json" \ - -d '{ - "id": "mc-acme-test", - "name": "acme-test-cert", - "common_name": "acme.certctl.test", - "sans": ["acme.certctl.test"], - "issuer_id": "iss-acme-staging", - "owner_id": "owner-test-admin", - "team_id": "team-test-ops", - "renewal_policy_id": "rp-default", - "certificate_profile_id": "prof-test-tls", - "environment": "staging" - }' | python3 -m json.tool -``` - -Notice `issuer_id` is `iss-acme-staging` this time — that routes to Pebble instead of the Local CA. 
- -### Step 5b: Link to target and trigger issuance - -```bash -# Link to NGINX target (same SQL pattern as Step 4b) -docker exec certctl-test-postgres psql -U certctl -d certctl -c \ - "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-acme-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" - -# Trigger issuance -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates/mc-acme-test/renew \ - -H "Authorization: Bearer test-key-2026" | python3 -m json.tool -``` - -### Step 5c: Watch the ACME exchange - -In your first terminal (the log stream), watch for ACME-related messages: - -``` -certctl-test-server | "msg":"ACME order created" ... -certctl-test-server | "msg":"solving HTTP-01 challenge" ... -certctl-test-server | "msg":"challenge server started","address":":80" ... -certctl-test-server | "msg":"challenge validated" ... -certctl-test-server | "msg":"certificate issued via ACME" ... -``` - -This takes a bit longer than Local CA (maybe 30-60 seconds for the challenge validation plus the agent poll cycle). - -### Step 5d: Reload NGINX and verify - -```bash -docker exec certctl-test-nginx nginx -s reload -sleep 3 - -echo | openssl s_client -connect localhost:8444 -servername acme.certctl.test 2>/dev/null \ - | openssl x509 -noout -subject -issuer -dates -``` - -The issuer should now say something like "Pebble Intermediate CA" instead of "certctl Local CA". - -**If issuance fails**: Check the server logs with `docker logs certctl-test-server --tail 50`. Look for ACME-related errors. Common issues: "x509: certificate signed by unknown authority" (Pebble trust issue — the `setup-trust.sh` script should handle this, but CERTCTL_ACME_INSECURE=true is the belt-and-suspenders fix). - ---- - -## Step 6: step-ca (Private CA) - -step-ca is a private CA by Smallstep. Companies use it for internal certificates (things that don't need to be publicly trusted). 
Unlike ACME, step-ca doesn't do challenge validation — it uses a provisioner key for authentication. - -The step-ca connector now supports proper JWE decryption of the provisioner key (PBES2-HS256+A128KW) and JWT-based authentication against step-ca's `/sign` API. The production code is fully functional. - -**Test environment status**: The automated test script fully tests step-ca issuance (Phase 6). The `setup-trust.sh` script extracts the provisioner key from step-ca's `ca.json` configuration and copies it to the server container. The step-ca connector decrypts the JWE-encrypted provisioner key, generates JWT auth tokens, and issues certificates via the native `/sign` API. - -You can verify step-ca is healthy: - -```bash -docker exec certctl-test-server curl -sk https://step-ca:9000/health -``` - -You should see `{"status":"ok"}`. - -**Alternative**: step-ca also supports ACME. You can configure it as an ACME issuer pointing to `https://step-ca:9000/acme/acme/directory` instead of using the native `/sign` API. - ---- - -## Step 7: Test Revocation - -Revocation means "this certificate is no longer trusted, even though it hasn't expired yet." You'd do this if a private key was compromised, a server was decommissioned, or a cert was superseded by a new one. - -### Step 7a: Revoke the Local CA cert - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates/mc-local-test/revoke \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/json" \ - -d '{"reason": "superseded"}' | python3 -m json.tool -``` - -**What `"reason": "superseded"` means**: You're telling the system WHY you're revoking. These reasons come from RFC 5280 (the X.509 certificate and CRL profile), which defines the CRL reason codes. Other valid reasons: `keyCompromise`, `affiliationChanged`, `cessationOfOperation`, `certificateHold`, `privilegeWithdrawn`.
- -### Step 7b: Check the CRL (Certificate Revocation List) - -The CRL is a DER-encoded X.509 v2 CRL (RFC 5280 §5) served under the RFC 8615 well-known namespace. It is deliberately unauthenticated — relying parties that need to verify revocation don't have certctl API keys. - -```bash -# No Authorization header — the endpoint is public by design. -curl --cacert "$CA" -s https://localhost:8443/.well-known/pki/crl/iss-local -o /tmp/crl.der -openssl crl -inform der -in /tmp/crl.der -noout -text | head -40 -``` - -**What you should see**: `openssl` prints the CRL issuer DN, `This Update` / `Next Update` timestamps, and at least one entry whose `Serial Number` matches the cert you just revoked, with `CRL Reason Code: Superseded` (or whichever reason you passed in step 7a). The response's `Content-Type` header is `application/pkix-crl`. - -### Step 7c: Check in the dashboard - -Go to **Certificates** in the sidebar. The `mc-local-test` cert should now show "Revoked" status with a red indicator. Click on it — the detail page should show a revocation banner with the reason and timestamp. - ---- - -## Step 8: Test Discovery - -The agent is configured to scan `/nginx-certs` every 6 hours for existing certificates. It already ran a scan when it started up. Let's see what it found. - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/discovered-certificates | python3 -m json.tool -``` - -**What you should see**: Any certificates that exist in the NGINX cert directory, including the ones you deployed in Steps 4-5. The discovery system extracts metadata (CN, SANs, issuer, expiry, fingerprint) from the PEM files. - -Check the summary: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/discovery-summary | python3 -m json.tool -``` - -This shows counts: how many are Unmanaged, Managed, and Dismissed. 
- -In the dashboard: click **Discovery** in the sidebar to see the triage view. - ---- - -## Step 9: Test Renewal - -Force a renewal on the ACME certificate to see the full cycle happen again: - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates/mc-acme-test/renew \ - -H "Authorization: Bearer test-key-2026" | python3 -m json.tool -``` - -After 30-90 seconds (agent poll + ACME challenge validation), reload NGINX and check: - -```bash -docker exec certctl-test-nginx nginx -s reload -sleep 3 - -echo | openssl s_client -connect localhost:8444 -servername acme.certctl.test 2>/dev/null \ - | openssl x509 -noout -subject -issuer -dates -serial -``` - -Go to **Certificates** in the dashboard, click on `mc-acme-test`, and look at the **Version History** section. You should see two versions now — the original and the renewal. The newer one should have a "Current" badge. - ---- - -## Step 10: Test EST Enrollment (RFC 7030) - -EST (Enrollment over Secure Transport) is a standard protocol for certificate enrollment used by devices, WiFi networks (802.1X), MDM systems, and IoT. The certctl server includes a built-in EST server that delegates to whichever issuer you configure. - -The test environment enables EST with `CERTCTL_EST_ENABLED=true` and `CERTCTL_EST_ISSUER_ID=iss-local`, meaning EST enrollments are signed by the Local CA. - -### Step 10a: Check available CA certificates - -```bash -curl --cacert "$CA" -s https://localhost:8443/.well-known/est/cacerts \ - -H "Authorization: Bearer test-key-2026" -``` - -**What this does**: Requests the CA certificate chain in PKCS#7 format (base64-encoded DER). This is the EST equivalent of "show me your trust anchor." - -**What you should see**: A base64-encoded blob. This is a degenerate PKCS#7 SignedData structure containing the Local CA's certificate. 
- -### Step 10b: Check CSR attributes - -```bash -curl --cacert "$CA" -s https://localhost:8443/.well-known/est/csrattrs \ - -H "Authorization: Bearer test-key-2026" -``` - -This returns the CSR attributes the server expects. It may return an empty response if no specific attributes are required — that's normal for the Local CA. - -### Step 10c: Enroll a certificate via EST - -Generate a CSR and submit it: - -```bash -# Generate a key pair and CSR -openssl req -new -newkey ec -pkeyopt ec_paramgen_curve:P-256 \ - -keyout /tmp/est-test.key -out /tmp/est-test.csr -nodes \ - -subj "/CN=est-device.certctl.test" 2>/dev/null - -# Convert CSR to base64-encoded DER (EST wire format) -EST_CSR=$(openssl req -in /tmp/est-test.csr -outform DER | base64 -w 0) - -# Submit to EST simpleenroll endpoint -curl --cacert "$CA" -s -X POST https://localhost:8443/.well-known/est/simpleenroll \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/pkcs10" \ - -d "$EST_CSR" -``` - -**What you should see**: A base64-encoded PKCS#7 response containing the signed certificate. The Local CA signed your CSR without any challenge validation (it trusts the API key). - -### Step 10d: Verify the issued certificate - -Decode and inspect the response (if you saved it to a variable): - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/audit-events | python3 -m json.tool | head -30 -``` - -Check the audit trail — you should see an `est_enrollment` event with the CN `est-device.certctl.test`. - -### Step 10e: Re-enroll (simplereenroll) - -EST also supports re-enrollment (certificate renewal). The same CSR format works: - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/.well-known/est/simplereenroll \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/pkcs10" \ - -d "$EST_CSR" -``` - -This should return another signed certificate. 
- ---- - -## Step 11: Test S/MIME Certificate Issuance - -S/MIME certificates are used for email signing and encryption — a different use case from TLS server certificates. The test environment includes a pre-seeded S/MIME profile (`prof-test-smime`) with the `emailProtection` Extended Key Usage (EKU). - -**How it differs from TLS**: TLS certs use `serverAuth` EKU and `KeyUsage: DigitalSignature | KeyEncipherment`. S/MIME certs use `emailProtection` EKU and `KeyUsage: DigitalSignature | ContentCommitment` (formerly NonRepudiation). The Local CA issuer adapts its KeyUsage flags based on the EKU — this is the "adaptive KeyUsage" feature. - -### Step 11a: Create an S/MIME certificate record - -```bash -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates \ - -H "Authorization: Bearer test-key-2026" \ - -H "Content-Type: application/json" \ - -d '{ - "id": "mc-smime-test", - "name": "smime-test-cert", - "common_name": "testuser@certctl.test", - "sans": ["testuser@certctl.test"], - "issuer_id": "iss-local", - "owner_id": "owner-test-admin", - "team_id": "team-test-ops", - "renewal_policy_id": "rp-default", - "certificate_profile_id": "prof-test-smime", - "environment": "development" - }' | python3 -m json.tool -``` - -Notice: -- `common_name` is an email address, not a domain -- `sans` contains the email address (the agent's CSR builder routes email SANs to the `EmailAddresses` field instead of `DNSNames`) -- `certificate_profile_id` is `prof-test-smime` (not `prof-test-tls`) - -### Step 11b: Link to target and trigger issuance - -```bash -docker exec certctl-test-postgres psql -U certctl -d certctl -c \ - "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-smime-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" - -curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/certificates/mc-smime-test/renew \ - -H "Authorization: Bearer test-key-2026" | python3 -m json.tool -``` - -### Step 11c: Verify the S/MIME 
certificate - -After the agent processes the job (30-60 seconds), check the certificate details: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/certificates/mc-smime-test | python3 -m json.tool -``` - -The certificate should show `"status": "active"`. To verify the EKU on the actual cert, you can export it: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/certificates/mc-smime-test/export/pem | python3 -m json.tool -``` - -If you decode the certificate PEM, you should see: -- **Extended Key Usage**: `E-mail Protection` (OID 1.3.6.1.5.5.7.3.4) -- **Key Usage**: `Digital Signature, Non Repudiation` (not KeyEncipherment) -- **Subject Alternative Name**: `email:testuser@certctl.test` - ---- - -## Step 12: Explore the Dashboard - -Now that you have real data from TLS, ACME, EST, and S/MIME tests, poke around the dashboard: - -- **Dashboard** (home page): Charts showing certificate status distribution, expiration timeline, job trends, and issuance rate. These populate based on the certs and jobs you just created. -- **Certificates**: List of all certificates. Click one to see full details, version history, deployment timeline, and the revoke/export buttons. -- **Jobs**: Every action (issuance, renewal, deployment) creates a job. You can see the full history with status transitions. Click a job ID to see its detail page with verification status. -- **Agents**: Shows `test-agent-01` with its heartbeat status, OS info, architecture, and IP address. -- **Issuers**: Shows the three active issuers (Local CA, ACME/Pebble, step-ca). Click one to see its configuration and the certificates it has issued. -- **Targets**: Shows the NGINX target with its configuration and deployment history. -- **Discovery**: Triage view for discovered certificates. You can claim them (link to a managed cert) or dismiss them. -- **Audit**: Every API call is recorded. 
You can filter by time range, actor, and action type. Try exporting as CSV or JSON. -- **Observability**: Health status, metrics gauges, and Prometheus scrape configuration. - ---- - -## Step 13: Run the Automated Test Script - -The repo includes a comprehensive test script that automates everything in Steps 4-11 plus additional API spot checks: - -```bash -cd certctl/deploy -bash test/run-test.sh -``` - -**What it does** (13 phases): -1. **Phase 0**: Checks prerequisites (Docker, curl, openssl, python3) -2. **Phase 1**: Starts the Docker Compose environment (or reuses if running) -3. **Phase 2**: Waits for all services to become healthy -4. **Phase 3**: Verifies pre-seeded data (agents, issuers, targets, profiles — including `prof-test-smime`) -5. **Phase 4**: Issues a certificate via Local CA, deploys to NGINX, verifies TLS -6. **Phase 5**: Issues a certificate via ACME/Pebble (full HTTP-01 challenge flow) -7. **Phase 6**: step-ca issuance via native `/sign` API with JWK provisioner auth -8. **Phase 7**: Revokes the Local CA cert, checks CRL -9. **Phase 8**: Checks discovery results -10. **Phase 9**: Tests renewal on the ACME cert -11. **Phase 10**: EST enrollment — tests `cacerts`, `csrattrs`, `simpleenroll` (generates CSR, submits base64 DER), and `simplereenroll` -12. **Phase 11**: S/MIME issuance — creates cert with `prof-test-smime` profile and `emailProtection` EKU, verifies the issued cert has the correct EKU, KeyUsage (Digital Signature, not KeyEncipherment), and email SAN -13. **Phase 12**: API spot checks (health, metrics, stats, audit, Prometheus) - -The script prints PASS/FAIL/SKIP for each check. At the end, you get a summary with total counts. - -**Note on NGINX reloads**: The test script runs `docker exec certctl-test-nginx nginx -s reload` after each deployment phase because the agent's reload command is a no-op (agent and NGINX are separate containers with a shared volume). 
- ---- - -## Step 14: Test via the CLI (Optional) - -If you have Go installed, you can build and test the CLI tool: - -```bash -# From the certctl repo root -go build -o certctl-cli ./cmd/cli - -# List certificates -./certctl-cli --server https://localhost:8443 --ca-bundle "$CA" --api-key test-key-2026 list-certs - -# Get a specific certificate -./certctl-cli --server https://localhost:8443 --ca-bundle "$CA" --api-key test-key-2026 get-cert mc-acme-test - -# Check health -./certctl-cli --server https://localhost:8443 --ca-bundle "$CA" --api-key test-key-2026 health - -# Get metrics (JSON format) -./certctl-cli --server https://localhost:8443 --ca-bundle "$CA" --api-key test-key-2026 --format json metrics -``` - ---- - -## Architecture Notes (For Experts) - -### Container Network Topology - -All containers share a bridge network (`certctl-test`, subnet 10.30.50.0/24) with static IPs. This is required because: - -- **Pebble** uses challtestsrv as its DNS server (configured via `-dnsserver 10.30.50.3:8053`) -- **challtestsrv** resolves ALL domains to 10.30.50.6 (certctl-server) for HTTP-01 challenge validation -- **Pebble** validates challenges by connecting to the resolved IP on port 80 (configured in `pebble-config.json` with `"httpPort": 80`) - -### Key Generation Flow (Agent-Side) - -```mermaid -sequenceDiagram - autonumber - participant Srv as certctl-server - participant Iss as Issuer connector - participant Agt as certctl-agent - participant FS as /var/lib/certctl/keys/
<br/>(local agent FS) - participant Vol as /nginx-certs/<br/>
(shared volume) - - Srv->>Srv: create Job (AwaitingCSR) - Agt->>Srv: poll for jobs - Srv-->>Agt: Job(AwaitingCSR) - Agt->>FS: generate ECDSA P-256 keypair - Agt->>Agt: build CSR (pubkey + CN + SANs) - Agt->>Srv: POST CSR - Srv->>Iss: sign CSR - Iss-->>Srv: signed cert - Srv->>Srv: store cert; create Deployment Job (Pending) - Agt->>Srv: poll for jobs - Srv-->>Agt: Job(Deployment) - Agt->>Srv: GET signed cert - Agt->>FS: read private key - Agt->>Vol: write cert + key + chain - Agt->>Srv: mark Job(Completed) -``` - -### Shared Volume Architecture - -The `nginx_certs` Docker volume is mounted at different paths in different containers: -- **NGINX** mounts it at `/etc/nginx/certs/` (where nginx.conf reads cert.pem and key.pem) -- **Agent** mounts it at `/nginx-certs/` (where the target config tells it to write) - -Same volume, different mount paths. The agent writes to `/nginx-certs/cert.pem` and NGINX reads from `/etc/nginx/certs/cert.pem` — they're the same file. - -### Why NGINX Needs Manual Reload - -The agent and NGINX run in separate containers. The target config's `reload_command` runs inside the agent container, not NGINX. So `reload_command` is set to `"true"` (a no-op). To reload NGINX after the agent deploys a cert, run: - -```bash -docker exec certctl-test-nginx nginx -s reload -``` - -In production, you'd either: (a) run the agent on the same host as NGINX so reload works directly, or (b) use inotify/polling inside the NGINX container to watch the cert directory for changes. - -### Trust Store Setup - -The `setup-trust.sh` script runs inside the certctl-server container at startup: - -1. Fetches Pebble's root CA from its management API (`https://pebble:15000/roots/0`) — this is container-to-container only, port 15000 is **not** exposed to the host -2. Copies step-ca's root CA from the shared volume (`/stepca-data/certs/root_ca.crt`) -3. Runs `update-ca-certificates` to add both to Alpine's trust store -4. 
Execs the certctl server binary - -This is needed because the ACME and step-ca connectors use Go's default HTTP client (which validates TLS). As a fallback, `CERTCTL_ACME_INSECURE=true` skips TLS verification for the ACME directory specifically. - -### Deployment Job Routing - -Deployment jobs have an `agent_id` field set at creation time (resolved from target → agent relationship). The server's job processor (`ProcessPendingJobs`) skips deployment jobs that have an `agent_id` — those are exclusively for the agent to pick up via `GetPendingWork()`. This prevents a race condition where the server would set the job to "Running" before the agent could see it (the agent's `ListPendingByAgentID` only returns jobs in "Pending" status). - ---- - -## Troubleshooting - -### The server keeps restarting - -**Symptom**: `docker compose ps` shows certctl-test-server with status "Restarting". - -**Why**: The server tried to start before Pebble or step-ca were ready, and the trust store setup failed. - -**Fix**: Wait 30 seconds for Pebble and step-ca to finish starting, then restart just the server: - -```bash -docker compose -f docker-compose.test.yml restart certctl-server -``` - -Then check: - -```bash -docker compose -f docker-compose.test.yml ps -``` - -The server should now show "Up (healthy)". - -### "x509: certificate signed by unknown authority" - -**Symptom**: You see this error in the server logs when trying to issue a cert via ACME. - -**Why**: The server doesn't trust Pebble's CA certificate. The `setup-trust.sh` script should have added it, but Pebble wasn't ready when the server started. - -**Diagnose**: - -```bash -# Check if the CA certs were added to the trust store -docker exec certctl-test-server ls -la /usr/local/share/ca-certificates/ -``` - -You should see `pebble-ca.crt` and `step-ca-root.crt`. 
If either is missing: - -```bash -# Check if Pebble is reachable from the server container -docker exec certctl-test-server curl -sk https://pebble:15000/roots/0 -``` - -If this prints a PEM certificate, the fetch works but the trust store wasn't updated. Restart the server: - -```bash -docker compose -f docker-compose.test.yml restart certctl-server -``` - -**Fallback**: The `CERTCTL_ACME_INSECURE=true` environment variable is set on the server, which skips TLS verification for the ACME directory. This should prevent this error for ACME. If you still see it, the issue is likely with step-ca's TLS. - -### step-ca issuance fails with "provisioner not found" - -**Symptom**: Server logs show `"provisioner not found or invalid audience"` when trying to issue via step-ca. - -**Why**: The provisioner key path (`CERTCTL_STEPCA_KEY_PATH`) doesn't point to the correct JWE-encrypted key file, or the password (`CERTCTL_STEPCA_PASSWORD`) doesn't match. In the test environment, step-ca auto-bootstraps and stores the provisioner key inside the `stepca_data` Docker volume. The certctl-server mounts this volume read-only at `/stepca-data/`. - -**Fix**: Verify the provisioner key exists at the configured path inside the server container: - -```bash -docker exec certctl-test-server ls -la /stepca-data/secrets/ -``` - -You should see a `provisioner_key` file. If it's missing, step-ca hasn't finished bootstrapping yet — restart the server after step-ca is healthy. - -### Agent isn't picking up jobs - -**Symptom**: You triggered issuance but nothing happens. No deployment, no cert on NGINX. - -**Step 1**: Check agent logs: - -```bash -docker logs certctl-test-agent --tail 50 -``` - -Look for error messages. 
Common ones: -- "401 Unauthorized" — API key mismatch -- "connection refused" — server isn't running -- "no pending work" — jobs exist but aren't assigned to this agent - -**Step 2**: Verify the agent is registered: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - https://localhost:8443/api/v1/agents/agent-test-01 | python3 -m json.tool -``` - -**Step 3**: Check for pending jobs: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - "https://localhost:8443/api/v1/jobs?status=Pending&status=AwaitingCSR" | python3 -m json.tool -``` - -If there are pending jobs but the agent isn't picking them up, check that the job's `agent_id` matches `agent-test-01`. - -**Step 4**: Check if the server's job processor is stealing deployment jobs. Look in server logs for `"skipping agent-routed deployment job"`. If you DON'T see this message but see deployment jobs going to "Running" status, there's a bug in the job processor skip logic. - -### NGINX still shows the placeholder cert - -**Symptom**: After issuance, `openssl s_client` still shows the self-signed placeholder cert (issuer says `CN=placeholder.certctl.test`). - -**Why**: Either the deployment job hasn't run yet, or NGINX needs reloading. - -**Step 1**: Check if the cert files exist with recent timestamps: - -```bash -docker exec certctl-test-nginx ls -la /etc/nginx/certs/ -``` - -You should see `cert.pem`, `key.pem`, and `chain.pem` with recent timestamps (not from when the container first started). - -**Step 2**: If the files are there but NGINX is serving the old cert, force a reload: - -```bash -docker exec certctl-test-nginx nginx -s reload -``` - -**Step 3**: If the files aren't there, the deployment job hasn't completed. Check the jobs: - -```bash -curl --cacert "$CA" -s -H "Authorization: Bearer test-key-2026" \ - "https://localhost:8443/api/v1/jobs?type=Deployment" | python3 -m json.tool -``` - -Look at the job status. 
If it's "Running" and stuck, the server's job processor may have picked it up instead of the agent (this was a known bug — the fix skips deployment jobs with `agent_id` in the server's `ProcessPendingJobs`). - -### ACME challenge validation fails - -**Symptom**: Server logs show ACME challenge failed or timed out. - -**Diagnose**: - -```bash -# Check that challtestsrv is resolving to certctl-server -docker exec certctl-test-pebble curl -s http://10.30.50.3:8055/dns-request-history -``` - -The challenge server runs on port 80 inside the certctl-server container. Verify it's listening: - -```bash -docker exec certctl-test-server netstat -tlnp 2>/dev/null | grep :80 || \ - docker exec certctl-test-server ss -tlnp | grep :80 -``` - -If the ACME connector hasn't started the challenge server yet (it only starts during issuance), you won't see port 80 listening. Trigger issuance and check again. - -### Port conflict (address already in use) - -**Symptom**: `docker compose up` fails with "Bind for 0.0.0.0:8443 failed: port is already allocated". - -**Why**: Another process is using port 8443 (maybe a previous test run, or another service). - -**Fix**: Either stop the other process, or change the port in docker-compose.test.yml. Find the line: - -```yaml - ports: - - "8443:8443" -``` - -Change it to a different port, like: - -```yaml - ports: - - "9443:8443" -``` - -Then access the dashboard at https://localhost:9443 instead. - -### Starting completely fresh - -If something is really broken, nuke everything and start over: - -```bash -# Stop everything and delete ALL data (database, step-ca state, certs, everything) -docker compose -f docker-compose.test.yml down -v - -# Rebuild from scratch -docker compose -f docker-compose.test.yml up --build -``` - -The `-v` flag deletes all Docker volumes. step-ca will regenerate its root CA. The database will re-seed from scratch. You'll need to redo Steps 4-11. 
- ---- - -## How to Stop - -When you're done testing: - -```bash -# Stop all containers (keeps data for next time) -docker compose -f docker-compose.test.yml down -``` - -To start again later (without rebuilding): - -```bash -docker compose -f docker-compose.test.yml up -``` - -To start fresh (wipe all data): - -```bash -docker compose -f docker-compose.test.yml down -v -docker compose -f docker-compose.test.yml up --build -``` - ---- - -## Quick Reference - -| What | Value | -|---|---| -| Dashboard URL | https://localhost:8443 (use `--cacert ./test/certs/ca.crt`) | -| API key | `test-key-2026` | -| NGINX HTTP | http://localhost:8080 | -| NGINX HTTPS | https://localhost:8444 | -| Agent ID | `agent-test-01` | -| Local CA issuer | `iss-local` | -| ACME issuer | `iss-acme-staging` | -| step-ca issuer | `iss-stepca` | -| NGINX target | `target-test-nginx` | -| TLS profile | `prof-test-tls` | -| S/MIME profile | `prof-test-smime` | -| Renewal policy | `rp-default` | -| Owner | `owner-test-admin` | -| Team | `team-test-ops` | -| Docker subnet | `10.30.50.0/24` | diff --git a/docs/contributor/testing-strategy.md b/docs/contributor/testing-strategy.md deleted file mode 100644 index 7c2d7cc..0000000 --- a/docs/contributor/testing-strategy.md +++ /dev/null @@ -1,200 +0,0 @@ -# certctl Testing Strategy & Deep-Scan Operator Runbook - -> Last reviewed: 2026-05-05 - -This doc covers the **testing topology** (per-PR fast gates vs. daily deep-scan -gates), and the **operator runbook** for re-running each deep-scan tool locally -when the CI receipt is ambiguous or when an operator wants to validate a fix -before the next scheduled scan. - -For the manual end-to-end QA playbook, see [`testing-guide.md`](../testing-guide.md). -For the security posture / per-finding closure log, see [`security.md`](../operator/security.md). - -## CI workflow split - -certctl runs two GitHub Actions workflows: - -- **`.github/workflows/ci.yml`** — runs on every push/PR. Fast feedback only. 
- Includes `gofmt`, `go vet`, `golangci-lint`, `go test -short -count=1`, - `govulncheck`, the per-layer coverage gates, and the regression-grep guards - (the M-009 mutation budget, the L-001 InsecureSkipVerify guard, the H-001 - Dockerfile SHA-pin guard, the M-012 USER-directive guard, etc.). -- **`.github/workflows/security-deep-scan.yml`** — runs daily 06:00 UTC and on - manual dispatch. Heavyweight tools that need docker, network egress to - scanner registries, or wall-clock budgets the per-PR check can't tolerate. - Includes `gosec`, `osv-scanner`, the `-race -count=10` full-suite run, - `trivy` image scan, `syft` SBOM, ZAP baseline DAST, `nuclei`, - `schemathesis` OpenAPI fuzz, `testssl.sh`, `go-mutesting` mutation testing, - and `semgrep p/react-security`. - -Receipts from each scheduled run are uploaded as a 30-day-retention artefact -named `security-deep-scan-`. Audit them via the GitHub Actions UI; -download the artefact zip for any scan that surfaces a finding. - -## Operator runbook — local re-run procedures - -These are the same commands the workflow runs, intended for an operator with -a workstation that has docker + the Go toolchain installed. The local-run -shape is identical to CI; the difference is wall-clock and the artefact -location (CI uploads; local writes to `$PWD`). - -### Mutation testing (D-003) - -**Tool:** [`go-mutesting`](https://github.com/zimmski/go-mutesting). Mutates -each AST node in turn (flips comparisons, swaps return values, removes -statements) and re-runs the package's tests. A mutant is **killed** if any -test fails; **surviving** mutants indicate a coverage gap (no test caught -the bug the mutant introduced). - -**Targets:** the three security-critical packages whose coverage gate is -**85%** in `ci.yml`: - -- `internal/crypto/` -- `internal/pkcs7/` -- `internal/connector/issuer/local/` - -**Acceptance threshold:** ≥80% mutation kill ratio per package. 
Surviving -mutants below that threshold get triaged in -the project's 2026-04-25 mutation-results notes — either -ship a targeted unit test that kills the mutant, or document an -equivalent-mutation justification. - -**Local run:** - -``` -go install github.com/zimmski/go-mutesting/cmd/go-mutesting@latest -for pkg in ./internal/crypto/... ./internal/pkcs7/... ./internal/connector/issuer/local/...; do - echo "=== $pkg ===" - $(go env GOPATH)/bin/go-mutesting "$pkg" -done -``` - -The tool prints one line per mutant (`PASS` = killed, `FAIL` = surviving) -plus a per-package summary `The mutation score is X.YZ`. CPU-bound, single -core, takes ~10 minutes on a 2024-era laptop for the three packages combined. - -**Sandbox note:** `go-mutesting` writes a mutant copy of the source tree to -`/tmp/go-mutesting/` per run; needs ≥2 GB free disk. Sandboxed CI runners -are sized for this; constrained dev sandboxes are not. - -### DAST baseline (D-004) - -**Tool:** [OWASP ZAP `baseline`](https://www.zaproxy.org/docs/docker/baseline-scan/). -Spiders the running server's URL surface and runs the OWASP-ZAP active+passive -rule pack. **Baseline** mode skips the destructive active-scan rules; it's safe -against a non-throwaway environment. - -**Target:** the live `deploy/docker-compose.yml` stack on `https://localhost:8443`. - -**Acceptance:** zero HIGH/CRITICAL alerts. WARN/INFO alerts get triaged in the -ZAP report; some are unavoidable (e.g., HSTS preload-list nag is a deployment -recommendation, not a server defect). 
- -**Local run:** - -``` -docker compose -f deploy/docker-compose.yml up -d -sleep 20 # wait for /ready to flip OK; check `curl --cacert deploy/test/certs/ca.crt https://localhost:8443/ready` -docker run --rm --network host \ - -v "$PWD":/zap/wrk \ - ghcr.io/zaproxy/zaproxy:stable \ - zap-baseline.py -t https://localhost:8443 \ - -r zap-report.html -J zap-report.json -docker compose -f deploy/docker-compose.yml down -``` - -The HTML report opens in a browser; the JSON is machine-readable for triage. - -### TLS audit (D-005) - -**Tool:** [`testssl.sh`](https://testssl.sh/). Probes the TLS handshake and -each enabled cipher suite; reports protocol-version weaknesses, cipher -weaknesses, certificate-chain issues, and known CVE patterns (Heartbleed, -ROBOT, BEAST, etc.). - -**Target:** the live stack on `https://localhost:8443`. - -**Acceptance:** zero HIGH/CRITICAL findings. certctl pins -`tls.Config.MinVersion = tls.VersionTLS13` (`cmd/server/tls.go`), so anything -that surfaces is either (a) a real defect, (b) a testssl false positive, or -(c) a deployment-config issue worth documenting in the operator runbook. - -**Local run:** - -``` -docker compose -f deploy/docker-compose.yml up -d -sleep 20 -docker run --rm --network host \ - -v "$PWD":/data \ - drwetter/testssl.sh:latest \ - --jsonfile /data/testssl.json https://localhost:8443 -docker compose -f deploy/docker-compose.yml down - -# Filter to actionable severities -jq '[.scanResult[] | select(.severity == "HIGH" or .severity == "CRITICAL")]' testssl.json -``` - -### Frontend semgrep (D-007) - -**Tool:** [`semgrep`](https://semgrep.dev/) with the maintained -[`p/react-security` ruleset](https://semgrep.dev/p/react-security). Catches -React-specific XSS / injection patterns: `dangerouslySetInnerHTML` without -sanitization, `target="_blank"` without `rel="noopener noreferrer"`, -`href={userInput}`, `eval`, `document.write`, etc. - -**Target:** the frontend source tree at `web/src/`. 
- -**Acceptance:** zero findings. Bundle 8 already verified -`dangerouslySetInnerHTML` count at zero and the `target="_blank"` -rel-noopener pin via simple grep guards in `ci.yml`; semgrep adds defence -in depth — it catches escape patterns the greps don't see (e.g., -`href={user_input}`, runtime `eval`, `document.write`). - -**Local run:** - -``` -docker run --rm -v "$PWD":/src returntocorp/semgrep:latest \ - semgrep --config=p/react-security --json /src/web/src \ - > semgrep-react.json - -# Count findings -jq '.results | length' semgrep-react.json - -# Pretty-print findings -jq '.results[] | {rule_id: .check_id, path, line: .start.line, message: .extra.message}' semgrep-react.json -``` - -If the count is non-zero, every result has a `check_id` (e.g. -`react.dangerouslySetInnerHTML`) and a `message` describing the escape -pattern. Triage each: either fix the call site, or — for legitimate edge -cases — add a `// nosem: ` directive on the -preceding line. - -## Cadence - -| Tool | Trigger | Wall-clock | Owner | -|----------------------|------------------------------------|------------|----------------| -| go-mutesting | daily deep-scan + manual dispatch | ~10 min | maintainers | -| ZAP baseline (DAST) | daily deep-scan + manual dispatch | ~5 min | maintainers | -| testssl.sh | daily deep-scan + manual dispatch | ~3 min | maintainers | -| semgrep react | daily deep-scan + manual dispatch | ~1 min | maintainers | -| `make verify` | every commit (pre-push) | ~1 min | every developer | -| ci.yml fast gates | every push/PR | ~3 min | every developer | - -Re-run any of the deep-scan tools locally when: - -- A CI receipt surfaces an unexpected finding and you want to bisect against - a local change before pushing. -- You're cutting a release tag and want belt-and-suspenders evidence beyond - the most recent scheduled scan. 
-- You're adding a new feature in the relevant surface (crypto code → - re-run mutation testing; new HTTP handler → re-run schemathesis + ZAP; - new TLS-config knob → re-run testssl). - -## Related docs - -- [`docs/operator/security.md`](../operator/security.md) — security posture, per-finding closure log. -- [`docs/testing-guide.md`](../testing-guide.md) — manual end-to-end QA playbook. -- [`.github/workflows/ci.yml`](../.github/workflows/ci.yml) — per-PR fast gates. -- [`.github/workflows/security-deep-scan.yml`](../.github/workflows/security-deep-scan.yml) — daily deep-scan gates. -- [`scripts/install-security-tools.sh`](../scripts/install-security-tools.sh) — Go-host-installed tools (the docker-based tools are not in this script). diff --git a/docs/operator/performance-baselines.md b/docs/operator/performance-baselines.md index 72267d2..a45687a 100644 --- a/docs/operator/performance-baselines.md +++ b/docs/operator/performance-baselines.md @@ -101,6 +101,5 @@ Capture timing in your own loadtest-baselines log so future regressions surface ## Related docs -- [`docs/contributor/ci-pipeline.md`](../contributor/ci-pipeline.md) — CI guard for performance regression - [`docs/operator/security.md`](security.md) — rate limit tuning - [`docs/reference/architecture.md`](../reference/architecture.md) — request path through handler → service → repository diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 8c66ad3..b463161 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -153,4 +153,4 @@ The `--wait` flag blocks until the job reaches a terminal state (Completed / Fai - [`docs/reference/api.md`](api.md) — the OpenAPI 3.1 spec the CLI wraps - [`docs/reference/mcp.md`](mcp.md) — the MCP server that exposes the same surface to AI assistants -- [`docs/contributor/qa-prerequisites.md`](../contributor/qa-prerequisites.md) — local environment setup before the CLI can talk to a server +- 
[`docs/getting-started/quickstart.md`](../getting-started/quickstart.md) — local environment setup before the CLI can talk to a server diff --git a/scripts/ci-guards/G-3-env-docs-drift.sh b/scripts/ci-guards/G-3-env-docs-drift.sh index 2d64b21..9097276 100755 --- a/scripts/ci-guards/G-3-env-docs-drift.sh +++ b/scripts/ci-guards/G-3-env-docs-drift.sh @@ -24,14 +24,19 @@ # cat-g-* for closure rationale. set -e -# Defined: config.go + agent + cli + mcp-server + server cmds + test fixtures + ACME DNS export +# Defined: any CERTCTL_* env-var name appearing in production Go sources +# (cmd/ + internal/, excluding *_test.go) plus the ACME DNS-01 script- +# export surface. Test files use `t.Setenv` on env-var names that aren't +# necessarily operator config; harness-only names should not flag. { - grep -nE '"CERTCTL_[A-Z_]+"' internal/config/config.go | sed -E 's/.*"(CERTCTL_[A-Z_]+)".*/\1/' - grep -rhoE '"CERTCTL_[A-Z_]+"' cmd/agent/*.go cmd/cli/*.go cmd/mcp-server/*.go cmd/server/*.go 2>/dev/null | sed -E 's/"(CERTCTL_[A-Z_]+)"/\1/' - grep -rhoE 'CERTCTL_[A-Z_]+' deploy/test/qa_test.go internal/connector/issuer/acme/dns.go 2>/dev/null + grep -rhoE '"CERTCTL_[A-Z_]+"' --include='*.go' --exclude='*_test.go' cmd/ internal/ 2>/dev/null | sed -E 's/"(CERTCTL_[A-Z_]+)"/\1/' + grep -rhoE 'CERTCTL_[A-Z_]+' internal/connector/issuer/acme/dns.go 2>/dev/null } | grep -E '^CERTCTL_' | sort -u > /tmp/g3-defined.txt -# Documented: README + docs + helm -grep -rhoE '\bCERTCTL_[A-Z_]+\b' README.md docs/ deploy/helm/ 2>/dev/null | sort -u > /tmp/g3-docs.txt +# Documented: README + docs + helm + deploy/ENVIRONMENTS.md. +# (ENVIRONMENTS.md is the canonical env-var inventory; the rest of +# deploy/ contains compose/test fixtures whose env-var mentions are +# implementation noise, not operator documentation.) 
+grep -rhoE '\bCERTCTL_[A-Z_]+\b' README.md docs/ deploy/helm/ deploy/ENVIRONMENTS.md 2>/dev/null | sort -u > /tmp/g3-docs.txt # Allowlist of env vars documented as external integration contracts. # Each entry justifies itself in one line; if you add to this list, # add the justification. @@ -59,6 +64,8 @@ CERTCTL_AUDIT_EXCLUDE_PATHS| CERTCTL_TLS_| CERTCTL_TLS_INSECURE_SKIP_VERIFY| CERTCTL_SCEP_| +CERTCTL_SCEP_PROFILE_[A-Z_]+| +CERTCTL_EST_PROFILE_[A-Z_]+| CERTCTL_SERVER_CA_BUNDLE_PATH| CERTCTL_SERVER_TLS_INSECURE_SKIP_VERIFY| CERTCTL_QA_[A-Z_]+| @@ -89,7 +96,11 @@ CERTCTL_RATE_LIMIT_ # the documented external contracts here. ALLOWED_FLAT=$(echo "$ALLOWED" | tr -d '\n ') DOCS_ONLY=$(comm -13 /tmp/g3-defined.txt /tmp/g3-docs.txt | grep -vE "$ALLOWED_FLAT" || true) -CONFIG_ONLY=$(comm -23 /tmp/g3-defined.txt /tmp/g3-docs.txt || true) +# Apply the same allowlist to the CONFIG_ONLY direction so dynamic +# per-profile dispatch surfaces (CERTCTL_SCEP_PROFILE_<NAME>_*, etc.) +# aren't flagged as "defined but never documented" — they can't all +# be enumerated in a static doc. 
+CONFIG_ONLY=$(comm -23 /tmp/g3-defined.txt /tmp/g3-docs.txt | grep -vE "$ALLOWED_FLAT" || true) if [ -n "$DOCS_ONLY" ]; then echo "::error::G-3 regression: env var(s) mentioned in docs but not defined in Go source AND not in the documented integration-surface allowlist:" echo "$DOCS_ONLY" diff --git a/scripts/ci-guards/README.md b/scripts/ci-guards/README.md index 0494a79..0a25aa3 100644 --- a/scripts/ci-guards/README.md +++ b/scripts/ci-guards/README.md @@ -41,8 +41,6 @@ Current helpers: `PR_NUMBER` + `GH_TOKEN` env from the go-build-and-test job - `scripts/check-coverage-thresholds.sh` — consumes `coverage.out` + `.github/coverage-thresholds.yml` -- `scripts/qa-doc-part-count.sh` + `scripts/qa-doc-seed-count.sh` — - invoked via `make verify-docs` pre-tag, not in CI ## Adding a new guard @@ -97,12 +95,6 @@ The cold-DB compose smoke (post-v2.1.0 / item-6) is NOT a script in this directo The fourth Bundle artifact (`internal/ciparity/`) is Go tests, not shell guards — runs under the standard Go test step. Pins the MCP tool catalogue floor + naming convention; reports CLI/MCP/OpenAPI surface counts as a trend metric. -## Guards explicitly NOT here - -- **`QA-doc Part-count drift`** + **`QA-doc seed-count drift`** — these - protect docs-the-operator-reads, not anything the product depends on. - Moved to `make verify-docs` (operator runs pre-tag, not on every push). - See the ci-pipeline-cleanup spec, Phase 11. 
## Running the full set locally diff --git a/scripts/ci-guards/S-1-hardcoded-source-counts.sh b/scripts/ci-guards/S-1-hardcoded-source-counts.sh index efd18fe..9416d4d 100755 --- a/scripts/ci-guards/S-1-hardcoded-source-counts.sh +++ b/scripts/ci-guards/S-1-hardcoded-source-counts.sh @@ -14,8 +14,7 @@ # # Allowed surfaces: demo-fixture prose in README ("32 # certificates" — those are seed_demo.sql facts, not live -# source counts), historical-milestone counts in -# WORKSPACE-CHANGELOG.md, the testing-guide example phrasing +# source counts), the testing-guide example phrasing # ("README claims 8 issuer connectors but only 6 exist"), # and any number that quotes the source command immediately # adjacent. @@ -27,7 +26,7 @@ set -e BAD=$(grep -rnE '\b[0-9]+\s+(issuer connectors?|target connectors?|notifier connectors?|discovery connectors?|MCP tools|OpenAPI operations|migrations|database tables|frontend pages|HTTP routes)\b' \ README.md docs/ 2>/dev/null \ - | grep -vE 'WORKSPACE-CHANGELOG|seed_demo|demo override' \ + | grep -vE 'seed_demo|demo override' \ | grep -vE 'DRIFT HAZARD|Source: |Rebuild|rebuild via|grep -|wc -l|ls -d|find ' \ | grep -vE 'README claims [0-9]+ issuer connectors but only [0-9]+ exist' \ || true) diff --git a/scripts/qa-doc-seed-count.sh b/scripts/qa-doc-seed-count.sh deleted file mode 100755 index 245ca07..0000000 --- a/scripts/qa-doc-seed-count.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# scripts/qa-doc-seed-count.sh -# -# Bundle P / Strengthening #6 — QA-doc seed-count drift guard. -# Forces every PR that adds a seed row to migrations/seed_demo.sql -# to keep docs/contributor/qa-test-suite.md::Seed Data Reference in sync. -# -# Per ci-pipeline-cleanup bundle Phase 11 / frozen decision 0.13: -# moved out of CI (was in ci.yml) — operator runs via 'make verify-docs' -# pre-tag. - -set -e -# Seed-cert count: agnostic to documented header format. 
The current -# documented count lives in `### Certificates (32 total in ...` — -# extract the first integer in that header. -DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' docs/contributor/qa-test-suite.md | grep -oE '[0-9]+' | head -1) -# Authoritative count: unique mc-* IDs in seed_demo.sql. -SEED_CERTS=$(grep -oE 'mc-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') -if [ -z "$DOC_CERTS" ]; then - echo "::warning::Could not extract documented cert count from docs/contributor/qa-test-suite.md." - echo " Skipping cert-count drift check (header format may have changed)." -elif [ "$DOC_CERTS" != "$SEED_CERTS" ]; then - echo "::error::DRIFT — qa-test-suite.md says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." - echo " Update docs/contributor/qa-test-suite.md::Seed Data Reference to match." - exit 1 -fi -# Issuers: seed-table count vs doc claim. -DOC_ISS=$(grep -oE '### Issuers \([0-9]+' docs/contributor/qa-test-suite.md | grep -oE '[0-9]+' | head -1) -# Authoritative: unique iss-* IDs (close enough proxy; the issuers -# table count IS the unique-ID count for this prefix). -SEED_ISS=$(grep -oE 'iss-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') -if [ -z "$DOC_ISS" ]; then - echo "::warning::Could not extract documented issuer count." -elif [ "$DOC_ISS" != "$SEED_ISS" ] && [ "$((SEED_ISS - DOC_ISS))" -gt 5 ]; then - # Allow up to 5pp slack — iss-* IDs appear in audit_events and - # other reference tables that aren't issuer-table rows. Drift - # only flags when the spread grows large. - echo "::error::DRIFT — qa-test-suite.md says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." - exit 1 -fi -echo "qa-doc-seed-count: clean."