From 3275f9f1e0326b82df21b80b9753bb5d15062272 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 5 May 2026 04:56:26 +0000 Subject: [PATCH] ci: post-Phase-2-docs-overhaul cleanup of stale guards + missing config doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI run on the ecb8896 push surfaced two real failures rooted in the 2026-05-04 docs overhaul: 1. G-3 env-docs-drift caught two phantom CERTCTL_* env vars I'd introduced in the Phase 4 follow-on connector pages (CERTCTL_CA_CERT_PATH_NEW in adcs.md was a placeholder I made up; CERTCTL_EJBCA_POLL_MAX_WAIT_SECONDS in ejbca.md does not exist in source). Both removed. 2. QA-doc Part-count drift guard tried to grep docs/qa-test-guide.md and docs/testing-guide.md, both of which were renamed/deleted in Phase 2/Phase 5. The Part-count drift class died with testing-guide.md (Phase 5 prune dispersed its content); the seed-count drift class is still live but pointed at the wrong path. Fixes: - Removed the QA-doc Part-count drift guard from ci.yml (premise dead) plus its standalone scripts/qa-doc-part-count.sh peer. - Retargeted the QA-doc seed-count drift guard from docs/qa-test-guide.md → docs/contributor/qa-test-suite.md (the Phase 2 target). Updated both ci.yml inline copy and scripts/qa-doc-seed-count.sh. - Updated Makefile qa-stats: target to drop the testing-guide.md Parts metric (file is gone). - Updated Makefile verify-docs: target to drop the part-count step. G-3 was also failing in the second direction (env vars defined in config.go but never documented anywhere). 16 vars surfaced — features.md (deleted Phase 6) and testing-guide.md (deleted Phase 5) had been their canonical home. Created docs/reference/configuration.md as the new home: a compact operator-facing env-var reference covering scheduler intervals, job lifecycle, rate limiting, audit, deploy verify, database, agent-side, and SCEP profile binding. Added to docs/README.md Reference table. 
Doc-side updates to qa-test-suite.md to reframe its references to the deleted testing-guide.md (it's now self-contained: the Part-by-Part Coverage Map IS the canonical Part inventory). Cosmetic comment and message-text updates in ci.yml + scripts/ci-guards/*.sh + scripts/dev-setup.sh to point at the new audience-organized doc paths (docs/operator/security.md, docs/operator/tls.md, docs/reference/architecture.md, etc.) instead of the pre-Phase-2 flat layout. Verified: all 24 ci-guards/*.sh pass locally; qa-doc-seed-count.sh clean. Net diff: 178 additions / 112 deletions across 14 files. One file deleted (qa-doc-part-count.sh) and one file added (docs/reference/configuration.md). --- .github/workflows/ci.yml | 54 +++++----- Makefile | 29 +++--- docs/README.md | 1 + docs/contributor/qa-test-suite.md | 33 +++---- docs/reference/configuration.md | 98 +++++++++++++++++++ docs/reference/connectors/adcs.md | 7 +- docs/reference/connectors/ejbca.md | 7 +- scripts/ci-guards/G-3-env-docs-drift.sh | 2 +- scripts/ci-guards/H-009-readme-jwt.sh | 4 +- .../ci-guards/L-001-insecure-skip-verify.sh | 6 +- .../ci-guards/U-2-plaintext-healthcheck.sh | 2 +- scripts/dev-setup.sh | 6 +- scripts/qa-doc-part-count.sh | 27 ----- scripts/qa-doc-seed-count.sh | 14 +-- 14 files changed, 178 insertions(+), 112 deletions(-) create mode 100644 docs/reference/configuration.md mode change 100755 => 100644 scripts/ci-guards/G-3-env-docs-drift.sh mode change 100755 => 100644 scripts/ci-guards/H-009-readme-jwt.sh mode change 100755 => 100644 scripts/ci-guards/L-001-insecure-skip-verify.sh mode change 100755 => 100644 scripts/ci-guards/U-2-plaintext-healthcheck.sh mode change 100755 => 100644 scripts/dev-setup.sh delete mode 100755 scripts/qa-doc-part-count.sh mode change 100755 => 100644 scripts/qa-doc-seed-count.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e47b73..7f16aca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: # does call, this 
step fails the build until either upstream # ships a fix OR we cut the dep. Deferred-call advisories that # legitimately can't be remediated yet should be added to the - # NIST SSDF deviation log in docs/security.md, not silenced here. + # NIST SSDF deviation log in docs/operator/security.md, not silenced here. run: govulncheck ./... - name: Install staticcheck (Bundle-7 / D-001) @@ -135,48 +135,38 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} run: bash scripts/coverage-pr-comment.sh - # Bundle P / Strengthening #6 — QA-doc drift guards. Forces every PR - # that adds a Part to docs/testing-guide.md OR a seed row to - # migrations/seed_demo.sql to keep docs/qa-test-guide.md in sync. This - # eliminates the doc-drift class structurally — the symptom Bundle I - # had to clean up by hand becomes a CI-time error going forward. - - name: QA-doc Part-count drift guard - run: | - set -e - DOC_PARTS=$(grep -oE '49 of [0-9]+ Parts' docs/qa-test-guide.md | grep -oE '[0-9]+' | tail -1) - GUIDE_PARTS=$(grep -cE '^## Part [0-9]+:' docs/testing-guide.md) - if [ -z "$DOC_PARTS" ]; then - echo "::error::Could not extract Part count from docs/qa-test-guide.md headline." - echo " Expected pattern: '49 of Parts'" - exit 1 - fi - if [ "$DOC_PARTS" != "$GUIDE_PARTS" ]; then - echo "::error::DRIFT — qa-test-guide.md headline claims $DOC_PARTS Parts; testing-guide.md has $GUIDE_PARTS Parts." - echo " Update docs/qa-test-guide.md to match. Bundle I patched this once;" - echo " Bundle P added this guard so the drift cannot recur silently." - exit 1 - fi - echo "QA-doc Part-count drift guard: clean ($DOC_PARTS == $GUIDE_PARTS)." - + # Bundle P / Strengthening #6 — QA-doc seed-count drift guard. Forces + # every PR that adds a seed row to migrations/seed_demo.sql to keep + # docs/contributor/qa-test-suite.md::Seed Data Reference in sync. 
+ # + # Phase 5 of the 2026-05-04 docs overhaul (commit c64777f) deleted + # docs/testing-guide.md (its content dispersed across the new + # audience-organized doc tree); the previous QA-doc Part-count drift + # guard tracked Part counts between testing-guide.md and the old + # qa-test-guide.md headline. With testing-guide.md gone, that guard's + # premise is dead and it has been removed. The seed-count drift class + # is still live: qa-test-suite.md::Seed Data Reference enumerates + # certs/issuers and seed_demo.sql is the source of truth. - name: QA-doc seed-count drift guard run: | set -e + DOC=docs/contributor/qa-test-suite.md # Seed-cert count: agnostic to documented header format. The current # documented count lives in `### Certificates (32 total in ...` — # extract the first integer in that header. - DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' docs/qa-test-guide.md | grep -oE '[0-9]+' | head -1) + DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1) # Authoritative count: unique mc-* IDs in seed_demo.sql. SEED_CERTS=$(grep -oE 'mc-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') if [ -z "$DOC_CERTS" ]; then - echo "::warning::Could not extract documented cert count from docs/qa-test-guide.md." + echo "::warning::Could not extract documented cert count from $DOC." echo " Skipping cert-count drift check (header format may have changed)." elif [ "$DOC_CERTS" != "$SEED_CERTS" ]; then - echo "::error::DRIFT — qa-test-guide.md says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." - echo " Update docs/qa-test-guide.md::Seed Data Reference to match." + echo "::error::DRIFT — $DOC says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." + echo " Update $DOC::Seed Data Reference to match." exit 1 fi # Issuers: seed-table count vs doc claim. 
- DOC_ISS=$(grep -oE '### Issuers \([0-9]+' docs/qa-test-guide.md | grep -oE '[0-9]+' | head -1) + DOC_ISS=$(grep -oE '### Issuers \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1) # Authoritative: unique iss-* IDs (close enough proxy; the issuers # table count IS the unique-ID count for this prefix). SEED_ISS=$(grep -oE 'iss-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') @@ -186,7 +176,7 @@ jobs: # Allow up to 5pp slack — iss-* IDs appear in audit_events and # other reference tables that aren't issuer-table rows. Drift # only flags when the spread grows large. - echo "::error::DRIFT — qa-test-guide.md says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." + echo "::error::DRIFT — $DOC says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." exit 1 fi echo "QA-doc seed-count drift guard: clean." @@ -209,7 +199,7 @@ jobs: # 167 legitimate tests for no observable behavior change. The # Test__ form remains documented as # the recommended pattern for parameterized scenarios in - # docs/qa-test-guide.md, but is not gated. + # docs/contributor/qa-test-suite.md, but is not gated. - name: Regression guards (extracted to scripts/ci-guards/) # All named regression guards live at scripts/ci-guards/.sh per # ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally: @@ -289,7 +279,7 @@ jobs: # HTTPS-Everywhere (v2.0.47): the chart fails render when no TLS source is # configured. Every lint/template invocation below must pick exactly one # provisioning mode — see deploy/helm/certctl/templates/_helpers.tpl - # (certctl.tls.required) and docs/tls.md. + # (certctl.tls.required) and docs/operator/tls.md. - name: Lint Helm Chart run: | helm lint deploy/helm/certctl/ \ diff --git a/Makefile b/Makefile index 885f962..76d9ac0 100644 --- a/Makefile +++ b/Makefile @@ -119,15 +119,18 @@ verify: @echo "" @echo "verify: PASS — safe to commit" -# verify-docs: pre-tag gate. 
Runs the QA-doc Part-count + seed-count -# drift guards that ci-pipeline-cleanup Phase 11 / frozen decision 0.13 -# moved out of CI (was per-push blocking; now operator-runs pre-tag). -# These guards protect docs/qa-test-guide.md headlines from drifting -# vs the underlying source-of-truth (testing-guide Part count, seed -# row count). Operator-facing docs only — not product-affecting. +# verify-docs: pre-tag gate. Runs the QA-doc seed-count drift guard +# that ci-pipeline-cleanup Phase 11 / frozen decision 0.13 moved out +# of CI (was per-push blocking; now operator-runs pre-tag). Protects +# docs/contributor/qa-test-suite.md::Seed Data Reference from +# drifting vs migrations/seed_demo.sql. Operator-facing docs only — +# not product-affecting. +# +# The QA-doc Part-count drift guard retired in the 2026-05-04 docs +# overhaul Phase 5 when docs/testing-guide.md was pruned (its content +# dispersed across the audience-organized doc tree); the Part-count +# class no longer exists outside the qa_test.go file itself. verify-docs: - @echo "==> QA-doc Part-count drift" - @bash scripts/qa-doc-part-count.sh @echo "==> QA-doc seed-count drift" @bash scripts/qa-doc-seed-count.sh @echo "" @@ -263,9 +266,12 @@ frontend-build: @echo "Frontend build complete" # QA Suite Stats — Bundle P / Strengthening #8. -# Single source-of-truth for every count claim in docs/qa-test-guide.md + -# docs/testing-guide.md. The Strengthening #6 CI drift guards consume the -# same numbers, eliminating the doc-drift class structurally. +# Single source-of-truth for every count claim in +# docs/contributor/qa-test-suite.md. The Strengthening #6 CI drift guards +# (now scoped to the seed-count class only — the Part-count class retired +# in the 2026-05-04 docs overhaul Phase 5 when testing-guide.md was +# pruned) consume the same numbers, eliminating the doc-drift class +# structurally. 
qa-stats: @echo "=== certctl QA Suite Stats ===" @echo "Date: $$(date +%Y-%m-%d)" @@ -278,7 +284,6 @@ qa-stats: @echo "Fuzz targets: $$(grep -rE 'func Fuzz[A-Z]' --include='*_test.go' . 2>/dev/null | wc -l | tr -d ' ')" @echo "t.Skip sites: $$(grep -rE 't\.Skip(Now|f)?\(' --include='*_test.go' . 2>/dev/null | wc -l | tr -d ' ')" @echo "qa_test.go Part_ subtests: $$(grep -cE 't\.Run\(\"Part[0-9]+_' deploy/test/qa_test.go 2>/dev/null || echo 0)" - @echo "testing-guide.md Parts: $$(grep -cE '^## Part [0-9]+:' docs/testing-guide.md 2>/dev/null || echo 0)" @echo "Seed unique mc-* IDs: $$(grep -oE "mc-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ')" @echo "Seed unique ag-* IDs: $$(grep -oE "ag-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ') (incl. agent_groups; agents-table count is 12)" @echo "Seed unique iss-* IDs: $$(grep -oE "iss-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ') (issuers table count is 13)" diff --git a/docs/README.md b/docs/README.md index c77f89a..f8101d9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,6 +29,7 @@ You're operating certctl in production or building integrations and need authori | [Architecture](reference/architecture.md) | System design, data flow, security model, deployment topologies | | [API](reference/api.md) | OpenAPI 3.1 spec, integration patterns, client SDK generation | | [CLI](reference/cli.md) | certctl-cli command reference and CI/CD integration patterns | +| [Configuration](reference/configuration.md) | `CERTCTL_*` environment variable reference (scheduler, rate limits, deploy verify, audit, agent) | | [MCP server](reference/mcp.md) | Model Context Protocol integration for AI assistants | | [Release verification](reference/release-verification.md) | Cosign / SLSA / SBOM verification procedure | | [Intermediate CA hierarchy](reference/intermediate-ca-hierarchy.md) | Multi-level CA tree management — RFC 5280 
§3.2/§4.2.1.9/§4.2.1.10 enforcement | diff --git a/docs/contributor/qa-test-suite.md b/docs/contributor/qa-test-suite.md index 9ea1ceb..c75dcbc 100644 --- a/docs/contributor/qa-test-suite.md +++ b/docs/contributor/qa-test-suite.md @@ -4,13 +4,13 @@ > **Audience:** Anyone running release QA for certctl — whether you're a first-time contributor or the maintainer cutting a release tag. > -> **Companion to:** `docs/testing-guide.md` (the *what* to test). This document explains the *how* — the automated test file, what it covers, what it skips, and how to fill the gaps manually. +> **Self-contained.** Through 2026-05-04 this doc was a companion to a separate `docs/testing-guide.md` (the *what* to test) — that companion was pruned during the Phase 5 docs overhaul (its content dispersed across the audience-organized doc tree). The Part-by-Part Coverage Map below is now the canonical inventory of QA Parts. --- ## Test Suite Health (regenerate via `make qa-stats`) -> Snapshot at HEAD. Re-run `make qa-stats` to refresh; CI's QA-doc drift guards (`.github/workflows/ci.yml`) catch out-of-date Part / cert / issuer counts on every PR. **Last regenerated: 2026-04-27 (Bundle P).** +> Snapshot at HEAD. Re-run `make qa-stats` to refresh; the QA-doc seed-count drift guard (`.github/workflows/ci.yml::QA-doc seed-count drift guard`) catches out-of-date cert / issuer counts on every PR. The Part-count drift guard retired in the 2026-05-04 docs overhaul Phase 5 (testing-guide.md was pruned; Part counts are now tracked inside `qa_test.go` itself, not against an external doc). 
**Last regenerated: 2026-04-27 (Bundle P).** | Metric | Value | Target | Status | |---|---|---|---| @@ -20,23 +20,22 @@ | Frontend test files | 38 | n/a | ℹ | | Fuzz targets | 11 | ≥10 (one per hand-rolled parser) | ✓ | | `t.Skip` sites | 60 | each carries valid rationale (Bundle O audit) | ✓ | -| `qa_test.go` Part_* subtests | 53 | tracks `testing-guide.md` Parts (3 `## Part 15-17` covered indirectly via Parts 42–46) | ✓ | -| `testing-guide.md` Parts | 56 | n/a | ℹ | +| `qa_test.go` Part_* subtests | 53 | covers 49 of 56 historical QA Parts directly + Parts 15–17 indirectly via Parts 42–46 | ✓ | | Existential cluster line cov (post-Bundle-J + L.B + Bundle 0.7) | acme 55.6%, stepca 90.4%, local-issuer ≥86%, crypto ≥85% | ≥95% | △ ACME below; tracked in `coverage-matrix.md` | | Mutation kill rate (Existential) | unmeasured (operator-runnable per Strengthening #5) | ≥90% | ⚠ | | Race detector clean (`-count=10`) | partial (`-count=3` clean per Phase 0) | 0 races | ⚠ | ## What Is This File? -`deploy/test/qa_test.go` is a single Go test file (~1700 lines) that automates as much of `docs/testing-guide.md` as possible against a running certctl Docker Compose demo stack. It replaces the legacy `qa-smoke-test.sh` bash script. +`deploy/test/qa_test.go` is a single Go test file (~1700 lines) that automates the historical QA Part inventory (preserved in the Part-by-Part Coverage Map below) against a running certctl Docker Compose demo stack. It replaces the legacy `qa-smoke-test.sh` bash script. It covers **49 of 56 Parts** of the testing guide as automation; the remaining 7 are either manual-only by design or pending QA-suite coverage: - **49 `Part_*` automation wrappers**, **~159 leaf subtests** — API calls, database queries, source file checks, performance benchmarks - **11 fully skipped Parts** — with documented reasons (external CAs, Windows, browser-only, etc.) 
— see "What This Test Does NOT Cover" below -- **4 Parts NOT YET AUTOMATED** — Parts 23 (S/MIME & EKU), 24 (OCSP/CRL), 55 (Agent Soft-Retirement), 56 (Notification Retry & Dead-Letter) — must be tested manually per `docs/testing-guide.md` until QA-suite automation lands -- **Manual-only flows** in addition: GUI flows, scheduler timing, Docker log inspection — must be done by a human following `docs/testing-guide.md` +- **4 Parts NOT YET AUTOMATED** — Parts 23 (S/MIME & EKU), 24 (OCSP/CRL), 55 (Agent Soft-Retirement), 56 (Notification Retry & Dead-Letter) — must be tested manually until QA-suite automation lands; the Part-by-Part Coverage Map below describes the surface area each Part covers +- **Manual-only flows** in addition: GUI flows, scheduler timing, Docker log inspection — must be done by a human (Coverage Map below describes each) ## Architecture @@ -149,8 +148,8 @@ This table shows what each Part tests and what's left for manual verification. | 20 | Post-Deployment Verification | 1 | 404 on nonexistent job verification | TLS probing, fingerprint comparison | | 21 | EST Server | 2 | CACerts (200 + content-type), CSRAttrs (200/204) | simpleenroll with CSR, simplereenroll, PKCS#7 parsing | | 22 | Certificate Export | 3 | PEM export, PKCS#12 export, 404 on nonexistent | Download mode, file content validation | -| 23 | S/MIME & EKU Support | 0 (NOT AUTOMATED) | — | S/MIME profile creation; EKU enforcement on issuance; SMIMECapabilities extension presence in issued cert; rejection of profile-violating EKU on CSR. Test manually per `docs/testing-guide.md::Part 23` | -| 24 | OCSP Responder & DER CRL | 0 (NOT AUTOMATED) | — | OCSP request/response (RFC 6960), DER CRL generation, status (Good/Revoked/Unknown), Must-Staple coordination. 
Test manually per `docs/testing-guide.md::Part 24` | +| 23 | S/MIME & EKU Support | 0 (NOT AUTOMATED) | — | S/MIME profile creation; EKU enforcement on issuance; SMIMECapabilities extension presence in issued cert; rejection of profile-violating EKU on CSR. Test manually, using this row as the checklist | +| 24 | OCSP Responder & DER CRL | 0 (NOT AUTOMATED) | — | OCSP request/response (RFC 6960), DER CRL generation, status (Good/Revoked/Unknown), Must-Staple coordination. Test manually, using this row as the checklist | | 25 | Certificate Discovery | 5 | List discovered, summary, list scan targets, create target, invalid CIDR 400 | Agent filesystem scan, claim/dismiss workflow | | 26 | Enhanced Query API | 4 | Sort descending, cursor pagination, time-range filter, invalid sort field | Field projection correctness, cursor token cycling | | 27 | Request Body Size Limits | 1 | 2MB body rejected (413/400) | Exact limit boundary (1MB) | @@ -180,12 +179,12 @@ This table shows what each Part tests and what's left for manual verification. 
| 52 | Helm Chart | 5 | Chart.yaml, values.yaml, 4 templates exist, securityContext, health probes | `helm template` rendering, `helm install` | | 53 | Kubernetes Secrets Target Connector (M47) | 18 | Config validation (namespace DNS-1123, secret name DNS subdomain, label keys, required fields), deployment (create/update Secret, chain concatenation, error propagation), validation (serial comparison, not-found, empty cert) | GUI target wizard KubernetesSecrets fields (namespace, secret_name, labels, kubeconfig_path), Helm RBAC toggle, TargetDetailPage type label | | 54 | AWS ACM Private CA Issuer Connector (M47) | 23 | Config validation (region, CA ARN regex, signing algorithm whitelist, validity_days, defaults), issuance (full flow, empty CSR, errors), renewal (reuses issuance), revocation (reason mapping, default, errors), GetOrderStatus completed, GetCACertPEM (success/chain/error), GetRenewalInfo nil | GUI issuer wizard AWSACMPCA fields (region, ca_arn, signing_algorithm, validity_days, template_arn), seed data visibility, create issuer flow | -| 55 | Agent Soft-Retirement (I-004) | 0 (NOT AUTOMATED) | — | Soft-retire vs hard-retire; force flag; reason capture; foreign-key cascade behavior on retired-agent cert ownership; reactivation. Test manually per `docs/testing-guide.md::Part 55` | -| 56 | Notification Retry & Dead-Letter Queue (I-005) | 0 (NOT AUTOMATED) | — | Retry loop with exponential backoff, dead-letter transition after N retries, requeue endpoint (`POST /api/v1/notifications/{id}/requeue`), idempotency on retry. Test manually per `docs/testing-guide.md::Part 56` | +| 55 | Agent Soft-Retirement (I-004) | 0 (NOT AUTOMATED) | — | Soft-retire vs hard-retire; force flag; reason capture; foreign-key cascade behavior on retired-agent cert ownership; reactivation. 
Test manually, using this row as the checklist | +| 56 | Notification Retry & Dead-Letter Queue (I-005) | 0 (NOT AUTOMATED) | — | Retry loop with exponential backoff, dead-letter transition after N retries, requeue endpoint (`POST /api/v1/notifications/{id}/requeue`), idempotency on retry. Test manually, using this row as the checklist | **Totals (verified 2026-04-27):** 49 `Part_*` automation wrappers, ~159 leaf subtests, 11 fully skipped Parts, 4 Parts not yet automated (23, 24, 55, 56), and an unspecified count of manual-only -flows (GUI, scheduler timing, Docker log inspection). Run `grep -cE '^## Part [0-9]+:' docs/testing-guide.md` +flows (GUI, scheduler timing, Docker log inspection). Run `make qa-stats` to count Part_* automation wrappers and `grep -cE 't\.Run\("Part[0-9]+_' deploy/test/qa_test.go` to re-verify. ## Coverage by Risk Class @@ -194,14 +193,14 @@ A buyer's QA lead reading this doc wants "where are the existential bugs caught? | Risk class | Description | Parts in scope | Automation status | |---|---|---|---| -| **Existential** (Critical paths — bugs would compromise CA, leak keys, mis-issue, bypass revocation) | Crypto, PKCS#7, local-issuer, OCSP/CRL, agent keygen, CSR validation | 5 (Revocation), 21 (EST), 23 (S/MIME EKU), 24 (OCSP/CRL), 47 (Digest with cert content), 53 (K8s Secrets), 54 (AWS PCA) | 5/7 automated; Parts 23 + 24 pending (Bundle I Skip stubs in `qa_test.go`; manual playbook in `testing-guide.md`) | +| **Existential** (Critical paths — bugs would compromise CA, leak keys, mis-issue, bypass revocation) | Crypto, PKCS#7, local-issuer, OCSP/CRL, agent keygen, CSR validation | 5 (Revocation), 21 (EST), 23 (S/MIME EKU), 24 (OCSP/CRL), 47 (Digest with cert content), 53 (K8s Secrets), 54 (AWS PCA) | 5/7 automated; Parts 23 + 24 pending (Bundle I Skip stubs in `qa_test.go`; manual playbook in the Coverage Map above) | | **High** (FSM corruption, credential leak, authn/z weakening) | Renewal, jobs, agents, 
issuers, deployment, scheduler | 4, 7, 8, 9, 18, 19, 20, 22, 25, 28, 29, 32, 33, 48, 49, 55, 56 | 14/17 automated; CLI / MCP / scheduler-loop are inherently SKIP (require compiled binaries / Docker logs); Parts 55 + 56 pending | | **Medium** (Operational pain or silent data drift) | Targets, notifiers, observability, error handling, performance, regression | 14, 15-17, 30, 31, 38, 39, 40, 41, 42, 43, 44, 45, 46 | 14/14 automated (15-17 indirect via Parts 42–46) | | **Low** (Hygiene) | Documentation, docs verification | 40 (Documentation), 50 (Onboarding) | 2/2 automated | | **Frontend** (XSS, render correctness, mutation contracts) | GUI testing | 35, 36-37 | 0/3 automated in this suite (Vitest covers separately under `web/`); this doc punts to manual + Vitest | | **Compliance** (PCI / SOC2 / HIPAA-relevant) | Audit trail, body-size limits, request limits, Helm chart deploy posture | 27, 32, 51, 52 | 4/4 automated | -This is the table acquisition reviewers screenshot for their report. When a new Part lands in `testing-guide.md`, classify it here; the QA-doc Part-count drift guard (`.github/workflows/ci.yml::QA-doc Part-count drift guard`) catches the count mismatch. +This is the table acquisition reviewers screenshot for their report. When a new Part_* subtest lands in `qa_test.go`, classify it here. ## Test Categories @@ -233,11 +232,11 @@ Timed API requests with threshold assertions: ## What This Test Does NOT Cover -These gaps must be filled by manual testing per `docs/testing-guide.md`: +These gaps must be filled by manual testing — see each Coverage Map row for a surface-area description: ### Not Yet Automated (Parts 23, 24, 55, 56) -These Parts are documented in `docs/testing-guide.md` but have no `Part_*` automation +These historical QA Parts are listed in the Coverage Map above but have no `Part_*` automation in `qa_test.go` yet. They are operator-runnable from the manual playbook; QA-suite automation should land before the next acquisition-grade release. 
@@ -431,7 +430,7 @@ grep -oE 'mutation score is [0-9.]+' tool-output/mutation-crypto.txt | tail -1 When a new feature ships: -1. **Add a Part section** in `qa_test.go` following the numbering in `docs/testing-guide.md` +1. **Add a Part section** in `qa_test.go` following the numbering convention in the Coverage Map above 2. **API tests**: use `c.get()`, `c.post()`, `c.bodyStr()`, `c.getJSON()`, `c.timedGet()` 3. **Source checks**: use `fileExists(t, "relative/path")` and `fileContains(t, "path", "substring")` 4. **DB checks**: use `openQADB(t)` and `db.queryInt(t, "SELECT ...")` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md new file mode 100644 index 0000000..bf0c2f6 --- /dev/null +++ b/docs/reference/configuration.md @@ -0,0 +1,98 @@ +# Configuration Reference + +> Last reviewed: 2026-05-05 + +Compact reference for `CERTCTL_*` environment variables consumed by +`certctl-server` and `certctl-agent`. Most operators don't need to +touch these — defaults are tuned for the common case. Reach for them +when the system's behaviour needs tuning beyond what's exposed in the +GUI / API. + +This page enumerates the operator-tunable knobs that don't have a +dedicated home elsewhere. Connector-specific env vars are documented +on the per-connector pages under +[`docs/reference/connectors/`](connectors/index.md). Protocol env +vars (ACME server, EST, SCEP) are documented under +[`docs/reference/protocols/`](protocols/). TLS env vars are +documented in [`docs/operator/tls.md`](../operator/tls.md). + +## Scheduler intervals + +The scheduler runs several background loops; intervals are tunable for +performance / contention tuning. + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL` | `2m` | How often the agent-health loop scans for stale heartbeats and transitions agents to `Unhealthy` / `Offline`. 
| +| `CERTCTL_SCHEDULER_JOB_PROCESSOR_INTERVAL` | `30s` | How often the job-processor loop dispatches `Pending` jobs to agents. | +| `CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL` | `1m` | How often the notification-dispatcher loop fans out queued alerts to channels. | +| `CERTCTL_SHORT_LIVED_EXPIRY_CHECK_INTERVAL` | `5m` | How often the short-lived-expiry loop watches certs whose TTL is less than 1h for imminent expiry. | + +For the full scheduler topology (12 loops, 8 always-on + 4 opt-in) +see [`architecture.md`](architecture.md) "Scheduler topology". + +## Job lifecycle + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_JOB_AWAITING_CSR_TIMEOUT` | `24h` | How long a job stays in `AwaitingCSR` before the scheduler marks it `Failed` (the agent never picked it up). | + +## Rate limiting + +The control plane API is rate-limited by default; tune for +high-volume environments (mass-rotation events, bulk imports). + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_RATE_LIMIT_ENABLED` | `true` | Master toggle. Disable only for trusted-network single-tenant deploys where the API is firewall-protected. | +| `CERTCTL_RATE_LIMIT_PER_USER_RPS` | `0` (= use global default) | Per-user requests-per-second cap. Zero opts each user into the global default in `internal/api/middleware`. | +| `CERTCTL_RATE_LIMIT_PER_USER_BURST` | `0` (= use global default) | Per-user token-bucket burst size. Same opt-in semantics. | + +## Audit trail + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS` | `30` | How long the audit-event flush worker waits for the buffered batch to drain before forcing a flush at shutdown. | + +## Deploy verification + +The deploy-hardening primitive wraps every cert deploy in +atomic-write + post-verify + rollback. These env vars tune the +post-deploy TLS verification phase. 
+ +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_VERIFY_DEPLOYMENT` | `true` | Master toggle for post-deploy TLS verify. Disable only for connectors / environments where the verify endpoint is not reachable from the agent. | +| `CERTCTL_VERIFY_DELAY` | `2s` | How long to wait after the reload command completes before the first verify-handshake attempt (gives the daemon time to pick up new keys). | +| `CERTCTL_VERIFY_TIMEOUT` | `10s` | Per-attempt TLS-handshake timeout. | +| `CERTCTL_DEPLOY_BACKUP_RETENTION` | `3` | How many `.certctl-bak..` rollback snapshots to keep per target after a successful deploy. `0` uses the default of 3; `-1` opts out of pruning entirely. | + +For the full deploy contract see +[`deployment-model.md`](deployment-model.md). + +## Database + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_DATABASE_MIGRATIONS_PATH` | `./migrations` | Filesystem path to the `*.up.sql` / `*.down.sql` migration set. Override only when running `certctl-server` from a non-standard layout. | + +## Agent + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_AGENT_ID` | (none — required) | The agent's unique ID, issued by `POST /api/v1/agents/register` and bundled into the agent's registration response. Pass via this env var when the agent runs as a systemd unit / container without the `-agent-id` CLI flag. | + +## SCEP profile binding (single-profile back-compat) + +| Variable | Default | Description | +|---|---|---| +| `CERTCTL_SCEP_PROFILE_ID` | (empty) | Optional certificate profile ID for the legacy single-profile SCEP path. The multi-profile path uses `CERTCTL_SCEP_PROFILES=` + `CERTCTL_SCEP_PROFILE__PROFILE_ID` instead — see [`scep-server.md`](protocols/scep-server.md). 
| + +## Related references + +- [`architecture.md`](architecture.md) — scheduler topology, system design, security model +- [`deployment-model.md`](deployment-model.md) — atomic write + verify + rollback contract +- [`operator/security.md`](../operator/security.md) — full security posture (auth, rate limits, encryption at rest) +- [`operator/tls.md`](../operator/tls.md) — control-plane TLS env vars +- Per-connector pages under [`reference/connectors/`](connectors/index.md) for connector-specific config +- Per-protocol pages under [`reference/protocols/`](protocols/) for ACME / SCEP / EST / CRL+OCSP / async-CA polling diff --git a/docs/reference/connectors/adcs.md b/docs/reference/connectors/adcs.md index 303701a..7b5b234 100644 --- a/docs/reference/connectors/adcs.md +++ b/docs/reference/connectors/adcs.md @@ -87,10 +87,11 @@ When the certctl sub-CA cert is approaching expiry: 1. Generate a new keypair (re-keying is recommended at sub-CA rotation time). 2. CSR + ADCS signing cycle as above. -3. Stage the new cert and key at fresh paths - (`CERTCTL_CA_CERT_PATH_NEW` etc.) and follow the +3. Stage the new cert and key at fresh on-disk paths and follow the [intermediate-CA hierarchy - runbook](../intermediate-ca-hierarchy.md) for the cutover. The + runbook](../intermediate-ca-hierarchy.md) for the cutover (rotate + `CERTCTL_CA_CERT_PATH` / `CERTCTL_CA_KEY_PATH` to the new files + when ready). The key concern is overlap: both the old and new sub-CA certs must chain to the ADCS root during the rollover so existing leaves keep validating. diff --git a/docs/reference/connectors/ejbca.md b/docs/reference/connectors/ejbca.md index f506336..4f8cfc5 100644 --- a/docs/reference/connectors/ejbca.md +++ b/docs/reference/connectors/ejbca.md @@ -103,11 +103,10 @@ replaces the connector without restart. Prior issuance state ### Diagnosing approval-pending hangs If `GetOrderStatus` consistently times out, the operator approval -queue in EJBCA is the most common cause. 
Bump -`CERTCTL_EJBCA_POLL_MAX_WAIT_SECONDS` so a single tick can wait -through the full approval window — see +queue in EJBCA is the most common cause. The connector consumes +the shared bounded-polling primitive — see [async-ca-polling.md](../protocols/async-ca-polling.md) for the -schedule shape. +schedule shape and tuning approach. ## Related docs diff --git a/scripts/ci-guards/G-3-env-docs-drift.sh b/scripts/ci-guards/G-3-env-docs-drift.sh old mode 100755 new mode 100644 index 233f371..d95ab9a --- a/scripts/ci-guards/G-3-env-docs-drift.sh +++ b/scripts/ci-guards/G-3-env-docs-drift.sh @@ -101,7 +101,7 @@ if [ -n "$CONFIG_ONLY" ]; then echo "::error::G-3 regression: env var(s) defined in Go source but never documented:" echo "$CONFIG_ONLY" echo "" - echo "Add an entry to docs/features.md (or another canonical doc) so operators can find it." + echo "Add an entry to the canonical config doc (docs/reference/architecture.md or the per-connector pages under docs/reference/connectors/) (or another canonical doc) so operators can find it." exit 1 fi echo "G-3 env-docs-drift: clean." diff --git a/scripts/ci-guards/H-009-readme-jwt.sh b/scripts/ci-guards/H-009-readme-jwt.sh old mode 100755 new mode 100644 index dcea6d3..5ed890f --- a/scripts/ci-guards/H-009-readme-jwt.sh +++ b/scripts/ci-guards/H-009-readme-jwt.sh @@ -4,7 +4,7 @@ # H-009 closed by Bundle D as verified-already-clean: at audit time # the README does NOT advertise JWT support (certctl does not ship # in-process JWT middleware; JWT/OIDC integration is via an -# authenticating gateway, see docs/architecture.md "Authenticating- +# authenticating gateway, see docs/reference/architecture.md "Authenticating- # gateway pattern"). This script grep-fails the build if README ever # re-introduces a sentence advertising JWT as a supported auth mode. 
# Pattern: "JWT" within ~6 words of "support|auth|enabled|mode" in @@ -20,7 +20,7 @@ if grep -inE 'JWT.{0,40}(support|auth|enabled|mode|provider)' README.md \ echo "::error::H-009 regression: README.md appears to advertise JWT auth support." echo "certctl does NOT ship in-process JWT middleware. JWT/OIDC" echo "integration is via an authenticating gateway — see" - echo "docs/architecture.md::Authenticating-gateway pattern." + echo "docs/reference/architecture.md::Authenticating-gateway pattern." echo "If you added a sentence about JWT to README, either remove" echo "it or rewrite it to point at the gateway pattern." exit 1 diff --git a/scripts/ci-guards/L-001-insecure-skip-verify.sh b/scripts/ci-guards/L-001-insecure-skip-verify.sh old mode 100755 new mode 100644 index 8aab1e1..8652030 --- a/scripts/ci-guards/L-001-insecure-skip-verify.sh +++ b/scripts/ci-guards/L-001-insecure-skip-verify.sh @@ -2,11 +2,11 @@ # scripts/ci-guards/L-001-insecure-skip-verify.sh # # L-001 audited every production InsecureSkipVerify=true call site -# and documented the justification per site in docs/tls.md. This +# and documented the justification per site in docs/operator/tls.md. This # script grep-fails the build if any new `InsecureSkipVerify: true` # lands in a non-test Go file without a `//nolint:gosec` comment # carrying the justification. Test files (_test.go) are exempt. -# Updating the documented surface goes through the docs/tls.md +# Updating the documented surface goes through the docs/operator/tls.md # table — net-new sites must be reasoned about before merge. set -e @@ -32,7 +32,7 @@ if [ -n "$BAD" ]; then echo -e "$BAD" echo "" echo "Add a //nolint:gosec comment with justification on the same" - echo "or preceding line, AND add a row to the docs/tls.md table." + echo "or preceding line, AND add a row to the docs/operator/tls.md table." exit 1 fi echo "L-001 insecure-skip-verify: clean." 
diff --git a/scripts/ci-guards/U-2-plaintext-healthcheck.sh b/scripts/ci-guards/U-2-plaintext-healthcheck.sh old mode 100755 new mode 100644 index 4bc15a1..18e9cbd --- a/scripts/ci-guards/U-2-plaintext-healthcheck.sh +++ b/scripts/ci-guards/U-2-plaintext-healthcheck.sh @@ -11,7 +11,7 @@ # HEALTHCHECK that targets `http://` against the certctl server # port. # -# Comment lines and the docs/upgrade-to-tls.md:182 expected-to- +# Comment lines and the docs/archive/upgrades/to-tls-v2.2.md:182 expected-to- # fail invariant ("plaintext is gone, expect Connection refused") # are intentionally exempt — we DO want the upgrade-doc string # `http://localhost:8443/health` to remain there, since it diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh old mode 100755 new mode 100644 index 4c38632..f1b3ead --- a/scripts/dev-setup.sh +++ b/scripts/dev-setup.sh @@ -135,7 +135,7 @@ echo " 3. Test the API:" echo " curl --cacert ./deploy/test/certs/ca.crt https://localhost:8443/health" echo "" echo " 4. Try the quick start guide:" -echo " cat docs/quickstart.md" +echo " cat docs/getting-started/quickstart.md" echo "" echo " 5. Access PgAdmin (optional):" echo " make docker-up-dev" @@ -150,6 +150,6 @@ echo " make docker-logs - View service logs" echo "" echo "For more information, see:" echo " • README.md" -echo " • docs/architecture.md" -echo " • docs/quickstart.md" +echo " • docs/reference/architecture.md" +echo " • docs/getting-started/quickstart.md" echo "" diff --git a/scripts/qa-doc-part-count.sh b/scripts/qa-doc-part-count.sh deleted file mode 100755 index b659a8e..0000000 --- a/scripts/qa-doc-part-count.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -# scripts/qa-doc-part-count.sh -# -# Bundle P / Strengthening #6 — QA-doc Part-count drift guard. -# Forces every PR that adds a Part to docs/testing-guide.md to keep -# docs/qa-test-guide.md headline in sync. 
-# -# Per ci-pipeline-cleanup bundle Phase 11 / frozen decision 0.13: -# moved out of CI (was in ci.yml) — operator runs via 'make verify-docs' -# pre-tag. Protects docs-the-operator-reads, not anything the product -# depends on; CI-blocking on every push was overkill. - -set -e -DOC_PARTS=$(grep -oE '49 of [0-9]+ Parts' docs/qa-test-guide.md | grep -oE '[0-9]+' | tail -1) -GUIDE_PARTS=$(grep -cE '^## Part [0-9]+:' docs/testing-guide.md) -if [ -z "$DOC_PARTS" ]; then - echo "::error::Could not extract Part count from docs/qa-test-guide.md headline." - echo " Expected pattern: '49 of Parts'" - exit 1 -fi -if [ "$DOC_PARTS" != "$GUIDE_PARTS" ]; then - echo "::error::DRIFT — qa-test-guide.md headline claims $DOC_PARTS Parts; testing-guide.md has $GUIDE_PARTS Parts." - echo " Update docs/qa-test-guide.md to match. Bundle I patched this once;" - echo " Bundle P added this guard so the drift cannot recur silently." - exit 1 -fi -echo "qa-doc-part-count: clean ($DOC_PARTS == $GUIDE_PARTS)." diff --git a/scripts/qa-doc-seed-count.sh b/scripts/qa-doc-seed-count.sh old mode 100755 new mode 100644 index 04d364b..245ca07 --- a/scripts/qa-doc-seed-count.sh +++ b/scripts/qa-doc-seed-count.sh @@ -3,7 +3,7 @@ # # Bundle P / Strengthening #6 — QA-doc seed-count drift guard. # Forces every PR that adds a seed row to migrations/seed_demo.sql -# to keep docs/qa-test-guide.md::Seed Data Reference in sync. +# to keep docs/contributor/qa-test-suite.md::Seed Data Reference in sync. # # Per ci-pipeline-cleanup bundle Phase 11 / frozen decision 0.13: # moved out of CI (was in ci.yml) — operator runs via 'make verify-docs' @@ -13,19 +13,19 @@ set -e # Seed-cert count: agnostic to documented header format. The current # documented count lives in `### Certificates (32 total in ...` — # extract the first integer in that header. 
-DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' docs/qa-test-guide.md | grep -oE '[0-9]+' | head -1) +DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' docs/contributor/qa-test-suite.md | grep -oE '[0-9]+' | head -1) # Authoritative count: unique mc-* IDs in seed_demo.sql. SEED_CERTS=$(grep -oE 'mc-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') if [ -z "$DOC_CERTS" ]; then - echo "::warning::Could not extract documented cert count from docs/qa-test-guide.md." + echo "::warning::Could not extract documented cert count from docs/contributor/qa-test-suite.md." echo " Skipping cert-count drift check (header format may have changed)." elif [ "$DOC_CERTS" != "$SEED_CERTS" ]; then - echo "::error::DRIFT — qa-test-guide.md says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." - echo " Update docs/qa-test-guide.md::Seed Data Reference to match." + echo "::error::DRIFT — qa-test-suite.md says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs." + echo " Update docs/contributor/qa-test-suite.md::Seed Data Reference to match." exit 1 fi # Issuers: seed-table count vs doc claim. -DOC_ISS=$(grep -oE '### Issuers \([0-9]+' docs/qa-test-guide.md | grep -oE '[0-9]+' | head -1) +DOC_ISS=$(grep -oE '### Issuers \([0-9]+' docs/contributor/qa-test-suite.md | grep -oE '[0-9]+' | head -1) # Authoritative: unique iss-* IDs (close enough proxy; the issuers # table count IS the unique-ID count for this prefix). SEED_ISS=$(grep -oE 'iss-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ') @@ -35,7 +35,7 @@ elif [ "$DOC_ISS" != "$SEED_ISS" ] && [ "$((SEED_ISS - DOC_ISS))" -gt 5 ]; then # Allow up to 5pp slack — iss-* IDs appear in audit_events and # other reference tables that aren't issuer-table rows. Drift # only flags when the spread grows large. - echo "::error::DRIFT — qa-test-guide.md says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." 
+ echo "::error::DRIFT — qa-test-suite.md says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)." exit 1 fi echo "qa-doc-seed-count: clean."