name: CI

# Continuous-integration workflow. Jobs:
#   go-build-and-test      — Go build, format/vet/lint/vuln gates, tests,
#                            coverage thresholds, doc-drift + regression guards
#   frontend-build         — TypeScript check, frontend tests, Vite build
#   helm-lint              — Helm chart lint + template render checks
#   deploy-vendor-e2e      — vendor-edge e2e suite against compose sidecars
#   image-and-supply-chain — digest validity, Docker build smoke, OpenAPI parity

on:
  push:
    branches:
      - master
      - v2-dev
  pull_request:
    branches:
      - master

jobs:
  go-build-and-test:
    name: Go Build & Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.25.10'

      - name: Go Build
        run: |
          go build ./cmd/server/...
          go build ./cmd/agent/...
          go build ./cmd/mcp-server/...
          go build ./cmd/cli/...

      - name: gofmt drift (Makefile::verify parity)
        # ci-pipeline-cleanup Phase 4 / frozen decision 0.13: Makefile::verify
        # checks gofmt + vet + golangci-lint + go test. CI runs vet, lint, test
        # already — but NOT gofmt. This step closes the parity gap.
        # Mirrors the Makefile::verify shape: any gofmt output means the
        # source needs reformatting.
        run: |
          out=$(gofmt -l .)
          if [ -n "$out" ]; then
            echo "::error::gofmt would reformat these files (run 'gofmt -w' locally):"
            echo "$out"
            exit 1
          fi

      - name: go mod tidy drift
        # ci-pipeline-cleanup Phase 4: catches PRs that import a package
        # without committing the go.mod / go.sum update. Standard Go-CI
        # gate; absent before this bundle.
        run: |
          go mod tidy
          git diff --exit-code go.mod go.sum

      - name: Go Vet
        run: go vet ./...

      - name: Install golangci-lint
        # The install directory is quoted so a GOPATH containing whitespace
        # cannot split into multiple arguments.
        run: |
          curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b "$(go env GOPATH)/bin" v2.11.4

      - name: Run golangci-lint
        run: golangci-lint run ./... --timeout 5m

      - name: Install govulncheck
        run: go install golang.org/x/vuln/cmd/govulncheck@latest

      - name: Run govulncheck (M-024 hard gate)
        # Bundle-7 / D-001 partial: govulncheck distinguishes called-vs-uncalled
        # advisories. Default exit code is non-zero only when YOUR code calls
        # the vulnerable function — deferred-call advisories show up in the
        # output but don't fail the gate.
        #
        # Bundle F / Audit M-024 (NIST SSDF PW.7.2): the govulncheck step
        # is now a hard CI gate (no `continue-on-error`). Bundle E's
        # transitive bumps (x/net 0.42→0.47, x/crypto 0.41→0.45) cleared
        # the 5 deferred-call advisories that were previously on the
        # exception list, so the carve-out the original Bundle F prompt
        # designed is unnecessary — a clean `govulncheck ./...` is the
        # right gate. If a future advisory lands in a function our code
        # does call, this step fails the build until either upstream
        # ships a fix OR we cut the dep. Deferred-call advisories that
        # legitimately can't be remediated yet should be added to the
        # NIST SSDF deviation log in docs/operator/security.md, not silenced here.
        run: govulncheck ./...

      - name: Install staticcheck (Bundle-7 / D-001)
        run: go install honnef.co/go/tools/cmd/staticcheck@latest

      - name: Run staticcheck
        # Bundle-7 / D-001: Go static analysis additive to vet. Suppressed
        # rules live in staticcheck.conf with documented justifications;
        # adding a new entry requires an explicit security review.
        #
        # ci-pipeline-cleanup Phase 3 / frozen decision 0.7: HARD gate.
        # M-028 SA1019 sites verified closed at HEAD 1de61e91:
        #   - middleware.NewAuth: zero callers (all migrated to
        #     NewAuthWithNamedKeys in cmd/server/{main,main_test}.go)
        #   - csr.Attributes (internal/api/handler/scep.go × 2): inline
        #     //lint:ignore SA1019 with load-bearing rationale (RFC 2985
        #     challengePassword has no non-deprecated stdlib API)
        #   - elliptic.Marshal: only in bundle9_coverage_test.go × 1 as
        #     deliberate byte-equivalence regression oracle, suppressed
        #     with //lint:ignore SA1019
        run: staticcheck ./...

      - name: Race Detection
        run: |
          go test -race \
            ./internal/service/... \
            ./internal/api/handler/... \
            ./internal/api/middleware/... \
            ./internal/scheduler/... \
            ./internal/connector/... \
            ./internal/crypto/... \
            ./internal/domain/... \
            ./internal/validation/... \
            ./internal/tlsprobe/... \
            -count=1 -timeout 300s

      - name: Go Test with Coverage
        # Writes coverage.out, which the threshold check, artifact upload and
        # PR-comment steps below all read.
        run: |
          go test \
            ./internal/service/... \
            ./internal/api/handler/... \
            ./internal/api/middleware/... \
            ./internal/api/router/... \
            ./internal/auth/... \
            ./internal/integration/... \
            ./internal/connector/issuer/... \
            ./internal/connector/target/... \
            ./internal/connector/notifier/... \
            ./internal/connector/discovery/... \
            ./internal/crypto/... \
            ./internal/mcp/... \
            ./internal/cli/... \
            ./internal/domain/... \
            ./internal/validation/... \
            ./internal/tlsprobe/... \
            -count=1 -cover -coverprofile=coverage.out

      - name: Check Coverage Thresholds
        # ci-pipeline-cleanup Phase 2: per-package floors moved to
        # .github/coverage-thresholds.yml. Each entry has `floor:` +
        # `why:` (load-bearing context). Logic in
        # scripts/check-coverage-thresholds.sh — operator runs the same
        # script locally via `make verify`-equivalent loop.
        run: bash scripts/check-coverage-thresholds.sh

      - name: Upload Coverage Report
        uses: actions/upload-artifact@v4
        with:
          name: go-coverage
          path: coverage.out
          retention-days: 30

      - name: Coverage PR comment
        # ci-pipeline-cleanup Phase 10 / frozen decision 0.9: self-hosted
        # alternative to Codecov / Coveralls. Posts a per-package coverage
        # delta as a PR comment; updates in place on subsequent pushes.
        if: github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: bash scripts/coverage-pr-comment.sh

      # Bundle P / Strengthening #6 — QA-doc seed-count drift guard. Forces
      # every PR that adds a seed row to migrations/seed_demo.sql to keep
      # docs/contributor/qa-test-suite.md::Seed Data Reference in sync.
      #
      # Phase 5 of the 2026-05-04 docs overhaul (commit c64777f) deleted
      # docs/testing-guide.md (its content dispersed across the new
      # audience-organized doc tree); the previous QA-doc Part-count drift
      # guard tracked Part counts between testing-guide.md and the old
      # qa-test-guide.md headline. With testing-guide.md gone, that guard's
      # premise is dead and it has been removed. The seed-count drift class
      # is still live: qa-test-suite.md::Seed Data Reference enumerates
      # certs/issuers and seed_demo.sql is the source of truth.
      - name: QA-doc seed-count drift guard
        run: |
          set -e
          DOC=docs/contributor/qa-test-suite.md

          # Seed-cert count: agnostic to documented header format. The current
          # documented count lives in `### Certificates (32 total in ...` —
          # extract the first integer in that header.
          DOC_CERTS=$(grep -oE '### Certificates \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1)
          # Authoritative count: unique mc-* IDs in seed_demo.sql.
          SEED_CERTS=$(grep -oE 'mc-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ')
          if [ -z "$DOC_CERTS" ]; then
            echo "::warning::Could not extract documented cert count from $DOC."
            echo " Skipping cert-count drift check (header format may have changed)."
          elif [ "$DOC_CERTS" != "$SEED_CERTS" ]; then
            echo "::error::DRIFT — $DOC says $DOC_CERTS certs; seed_demo.sql has $SEED_CERTS unique mc-* IDs."
            echo " Update $DOC::Seed Data Reference to match."
            exit 1
          fi

          # Issuers: seed-table count vs doc claim.
          DOC_ISS=$(grep -oE '### Issuers \([0-9]+' "$DOC" | grep -oE '[0-9]+' | head -1)
          # Authoritative: unique iss-* IDs (close enough proxy; the issuers
          # table count IS the unique-ID count for this prefix).
          SEED_ISS=$(grep -oE 'iss-[a-z0-9_-]+' migrations/seed_demo.sql | sort -u | wc -l | tr -d ' ')
          if [ -z "$DOC_ISS" ]; then
            echo "::warning::Could not extract documented issuer count."
          elif [ "$DOC_ISS" != "$SEED_ISS" ] && [ "$((SEED_ISS - DOC_ISS))" -gt 5 ]; then
            # Allow a slack of up to 5 IDs (an absolute count, not a
            # percentage) — iss-* IDs appear in audit_events and other
            # reference tables that aren't issuer-table rows. Drift only
            # flags when the seed count exceeds the documented count by
            # more than 5.
            echo "::error::DRIFT — $DOC says $DOC_ISS issuers; seed_demo.sql has $SEED_ISS unique iss-* IDs (spread > 5)."
            exit 1
          fi

          echo "QA-doc seed-count drift guard: clean."

      # Bundle Q / I-001 closure — test-naming convention guard.
      # Originally informational: the convention was
      # `Test<Function>_<Scenario>_<Result>` and the guard only printed
      # non-conformant tests without failing the build.
      # Bundle I-001-extended (2026-04-27) — promoted from informational
      # to hard-fail. The convention is now: every `func TestXxx(...)` MUST
      # match Go's standard test-runner pattern (`^func Test[A-Z]`). Tests
      # whose name starts with `func Test<lowercase>` are silently SKIPPED
      # by `go test` (Go only runs `Test[A-Z]...`) — those are the real
      # bugs this guard catches.
      #
      # The original audit's `Test<Function>_<Scenario>_<Result>` triple-
      # token prescription has been relaxed: single-function pin tests like
      # `TestNewAgent` or `TestSplitPEMChain` are valid Go convention, with
      # internal scenarios expressed via `t.Run` subtests. Requiring the
      # underscore-Scenario-Result triple repo-wide would mean renaming
      # 167 legitimate tests for no observable behavior change. The
      # `Test<Function>_<Scenario>_<Result>` form remains documented as
      # the recommended pattern for parameterized scenarios in
      # docs/contributor/qa-test-suite.md, but is not gated.
      - name: Regression guards (extracted to scripts/ci-guards/)
        # All named regression guards live at scripts/ci-guards/<name>.sh per
        # ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
        #   bash scripts/ci-guards/G-3-env-docs-drift.sh
        # Adding a new guard: drop a new <name>.sh; this loop auto-picks it up.
        # Contract: each guard MUST exit 0 on clean repo, non-zero with
        # ::error:: prefix on regression. See scripts/ci-guards/README.md.
        #
        # Every guard runs even after an earlier one fails (the failure is
        # recorded in $fail), so one CI run reports all regressions at once.
        run: |
          set -e
          fail=0
          for g in scripts/ci-guards/*.sh; do
            echo "::group::$(basename "$g")"
            if ! bash "$g"; then
              fail=1
            fi
            echo "::endgroup::"
          done
          exit $fail

  frontend-build:
    name: Frontend Build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Install Dependencies
        working-directory: web
        run: npm ci

      - name: TypeScript Check
        working-directory: web
        run: npx tsc --noEmit

      - name: Run Frontend Tests
        working-directory: web
        run: npx vitest run

      - name: Build Frontend
        working-directory: web
        run: npx vite build

      - name: Regression guards (extracted to scripts/ci-guards/)
        # All named regression guards live at scripts/ci-guards/<name>.sh per
        # ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
        #   bash scripts/ci-guards/G-3-env-docs-drift.sh
        # Adding a new guard: drop a new <name>.sh; this loop auto-picks it up.
        # Contract: each guard MUST exit 0 on clean repo, non-zero with
        # ::error:: prefix on regression. See scripts/ci-guards/README.md.
        #
        # NOTE(review): this step is identical to the one in
        # go-build-and-test — confirm whether any guard actually needs the
        # Node toolchain / installed web dependencies; if not, it is redundant.
        run: |
          set -e
          fail=0
          for g in scripts/ci-guards/*.sh; do
            echo "::group::$(basename "$g")"
            if ! bash "$g"; then
              fail=1
            fi
            echo "::endgroup::"
          done
          exit $fail

  helm-lint:
    name: Helm Chart Validation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install Helm
        uses: azure/setup-helm@v4
        with:
          version: '3.13.0'

      # HTTPS-Everywhere (v2.0.47): the chart fails render when no TLS source is
      # configured. Every lint/template invocation below must pick exactly one
      # provisioning mode — see deploy/helm/certctl/templates/_helpers.tpl
      # (certctl.tls.required) and docs/operator/tls.md.
      - name: Lint Helm Chart
        run: |
          helm lint deploy/helm/certctl/ \
            --set server.tls.existingSecret=certctl-tls-ci

      - name: Template Helm Chart (existingSecret mode)
        run: |
          helm template certctl deploy/helm/certctl/ \
            --set server.tls.existingSecret=certctl-tls-ci \
            > /dev/null

      - name: Template Helm Chart (cert-manager mode)
        run: |
          helm template certctl deploy/helm/certctl/ \
            --set server.tls.certManager.enabled=true \
            --set server.tls.certManager.issuerRef.name=letsencrypt-prod \
            > /dev/null

      - name: Template Helm Chart (guard fails without TLS)
        run: |
          # Inverse test: the chart MUST refuse to render when no TLS source is
          # configured. If this ever renders successfully, the fail-loud guard
          # in certctl.tls.required has regressed.
          if helm template certctl deploy/helm/certctl/ > /dev/null 2>&1; then
            echo "::error::Helm chart rendered without a TLS source — fail-loud guard regressed"
            exit 1
          fi

  # =============================================================================
  # deploy-vendor-e2e — single-job (collapsed from 12-job matrix)
  # =============================================================================
  # Per ci-pipeline-cleanup bundle Phase 5 / frozen decision 0.4 (revises
  # Bundle II decision 0.9): the per-vendor matrix produced 12 status-check
  # rows for ~1 real assertion (115/116 vendor-edge tests are t.Log
  # placeholders). Collapsed to one job that brings up all 11 sidecars
  # at once and runs the full VendorEdge_ test set.
  #
  # Skip-detection guard (scripts/vendor-e2e-skip-check.sh)
  # enforces that no test SKIPs except the documented allowlist
  # (windows-iis-requiring tests on Linux). If a sidecar fails to come
  # up, requireSidecar() in deploy/test/vendor_e2e_helpers.go calls
  # t.Skipf() — the guard catches that.
  #
  # RAM headroom on ubuntu-latest (16 GB ceiling) — operator-confirmed
  # in Phase 0 / frozen decision 0.14 prototype-branch run. If RAM
  # regresses, fall back to bucketed matrix per
  # the project's frozen-decisions log.
  #
  # The Windows matrix (deploy-vendor-e2e-windows) was deleted entirely
  # per Phase 6 / frozen decision 0.5 (revises Bundle II decision 0.4).
  # IIS + WinCertStore validation moved to the operator playbook at
  # docs/connector-iis.md::Operator validation playbook.
  deploy-vendor-e2e:
    name: deploy-vendor-e2e
    runs-on: ubuntu-latest
    needs: [go-build-and-test]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v5

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.25.10'
          cache: true

      - name: Build f5-mock-icontrol sidecar
        # The only sidecar without a published image; built from the in-tree
        # Go server at deploy/test/f5-mock-icontrol/.
        run: docker compose --profile deploy-e2e -f deploy/docker-compose.test.yml build f5-mock-icontrol

      - name: Bring up all vendor sidecars
        # Brings up the 11 deploy-e2e sidecars (apache-test, haproxy-test,
        # traefik-test, caddy-test, envoy-test, postfix-test, dovecot-test,
        # openssh-test, f5-mock-icontrol, k8s-kind-test, windows-iis-test
        # which is gated by a separate windows-only profile and won't
        # actually start) plus the always-on legacy nginx.
        run: |
          docker compose --profile deploy-e2e -f deploy/docker-compose.test.yml up -d
          sleep 15

      - name: Run all vendor-edge e2e
        # Captures test output for skip-count enforcement (next step).
        #
        # `set -o pipefail` is required: with no explicit `shell:`, GitHub
        # runs this script as `bash -e {0}` (no pipefail), so the pipeline's
        # exit status would be `tee`'s and a failing `go test` would be
        # reported as a passing step.
        env:
          INTEGRATION: "1"
        run: |
          set -o pipefail
          go test -tags integration -race -count=1 -run 'VendorEdge_' \
            ./deploy/test/... 2>&1 | tee test-output.log

      - name: Skip-count enforcement
        # ci-pipeline-cleanup Phase 5 / frozen decision 0.6:
        # requireSidecar uses t.Skipf (not t.Fatal) when a sidecar isn't
        # reachable — collapsing the per-vendor matrix removes the implicit
        # guard each per-job matrix entry provided. This step counts SKIP
        # lines in the test output and fails the build if it exceeds the
        # allowlist (windows-iis-requiring tests; legitimately skipped
        # on Linux per Phase 6 / frozen decision 0.5).
        run: bash scripts/vendor-e2e-skip-check.sh test-output.log

      - name: Diagnostic dump on failure
        # Prints container status + last 200 log lines from the certctl-server
        # and base-stack containers when ANY previous step in this job fails.
        # The matrix-collapse (Phase 5) brings up ~18 containers concurrently
        # (vs 1 vendor sidecar at a time pre-collapse); transient failures
        # surface most often as "container certctl-test-server is unhealthy"
        # without any visible reason because compose only reports the
        # dependency-chain symptom, not the root cause. Dumping logs here
        # makes the underlying error (DB migration crash, port bind failure,
        # entrypoint stall, OOM kill) visible in the GitHub Actions log
        # without requiring a workstation reproduction.
        #
        # Every command ends in `|| true` so a missing container cannot abort
        # the dump before the remaining logs are printed.
        if: failure()
        run: |
          echo "=== docker compose ps -a ==="
          docker compose --profile deploy-e2e -f deploy/docker-compose.test.yml ps -a || true
          echo ""
          echo "=== certctl-test-server logs (last 200 lines) ==="
          docker logs --tail 200 certctl-test-server 2>&1 || true
          echo ""
          echo "=== certctl-test-tls-init logs ==="
          docker logs certctl-test-tls-init 2>&1 || true
          echo ""
          echo "=== certctl-test-postgres logs (last 100 lines) ==="
          docker logs --tail 100 certctl-test-postgres 2>&1 || true
          echo ""
          echo "=== certctl-test-stepca logs (last 100 lines) ==="
          docker logs --tail 100 certctl-test-stepca 2>&1 || true
          echo ""
          echo "=== certctl-test-pebble logs (last 50 lines) ==="
          docker logs --tail 50 certctl-test-pebble 2>&1 || true
          echo ""
          echo "=== certctl-test-agent logs (last 100 lines) ==="
          docker logs --tail 100 certctl-test-agent 2>&1 || true

      - name: Tear down sidecars
        # Runs regardless of earlier step outcomes; `-v` also removes the
        # compose volumes.
        if: always()
        run: docker compose --profile deploy-e2e -f deploy/docker-compose.test.yml down -v

  # =============================================================================
  # image-and-supply-chain — digest validity + Docker build smoke + OpenAPI parity
  # =============================================================================
  # Per ci-pipeline-cleanup bundle Phases 7-9 / frozen decision 0.8.
  # Three checks bundled into one job (parallel to go-build-and-test):
  #   1. Digest validity — every @sha256 ref in deploy/* + Dockerfiles must
  #      resolve on its registry. Closes the H-001 lying-field gap (H-001
  #      verifies digest *presence* but not *resolution* — Bundle II shipped
  #      11 fabricated digests that passed H-001 and failed `docker pull`).
  #   2. Docker build smoke — all 4 Dockerfiles in the repo must build.
  #      Catches syntax errors / COPY path drift before tag-time release.yml.
  #   3. OpenAPI ↔ handler parity — every router route has a matching
  #      operationId or is documented in api/openapi-handler-exceptions.yaml.
  image-and-supply-chain:
    name: image-and-supply-chain
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v5

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.25.10'
          cache: true

      - name: Digest validity (every @sha256 ref must resolve)
        run: bash scripts/ci-guards/digest-validity.sh

      - name: Docker build smoke (all 4 Dockerfiles)
        # Per frozen decision 0.10: build all 4 Dockerfiles in the repo,
        # not just production server + agent. The test-sidecar Dockerfiles
        # are load-bearing for vendor-e2e — a syntax error there silently
        # breaks the e2e suite.
        run: |
          set -e
          docker build -f Dockerfile -t certctl:smoke .
          docker build -f Dockerfile.agent -t certctl-agent:smoke .
          docker build -f deploy/test/f5-mock-icontrol/Dockerfile -t f5-mock:smoke .
          docker build -f deploy/test/libest/Dockerfile -t libest:smoke .
          echo "All 4 Dockerfiles build clean."

      - name: OpenAPI ↔ handler operationId parity
        run: bash scripts/ci-guards/openapi-handler-parity.sh