# certctl/.github/workflows/ci.yml

# Workflow name as shown in the Actions UI.
name: CI

# Triggers: pushes to master and v2-dev, plus pull requests that target
# master.
on:
  push:
    branches: [master, v2-dev]
  pull_request:
    branches: [master]

jobs:
  go-build-and-test:
    name: Go Build & Test
    runs-on: ubuntu-latest
    # Least privilege: the steps of this job only read the repository
    # (build, lint, grep guards, tests, artifact upload), so the job's
    # GITHUB_TOKEN is restricted to read access instead of inheriting the
    # repository-wide default, which may be read-write.
    permissions:
      contents: read
    steps:
      # Check the repository out into the runner workspace.
      - uses: actions/checkout@v4

      # Install the pinned Go toolchain used by every later step in the job.
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.25.9'

      - name: Go Build
        # Compile the package tree of each binary under cmd/, in order.
        # The runner's shell stops at the first failing build.
        run: |
          for binary in server agent mcp-server cli; do
            go build "./cmd/${binary}/..."
          done

      - name: Go Vet
        # Run `go vet` across every package in the module.
        run: |
          go vet ./...

      - name: Install golangci-lint
        # The installer script is fetched from the same release tag as the
        # binary it installs, so the script piped into `sh` is pinned rather
        # than whatever currently sits on the moving default branch.
        env:
          GOLANGCI_LINT_VERSION: v2.11.4
        run: |
          # pipefail: a failed download must fail this step; without it `sh`
          # would read an empty script and exit 0.
          set -euo pipefail
          curl -sSfL "https://raw.githubusercontent.com/golangci/golangci-lint/${GOLANGCI_LINT_VERSION}/install.sh" \
            | sh -s -- -b "$(go env GOPATH)/bin" "${GOLANGCI_LINT_VERSION}"

      - name: Run golangci-lint
        # Lint every package; abort if the linters run longer than 5 minutes.
        run: golangci-lint run ./... --timeout 5m

      # govulncheck is installed at @latest, so every run uses the newest
      # released version of the scanner.
      - name: Install govulncheck
        run: |
          go install golang.org/x/vuln/cmd/govulncheck@latest

      # Scan every package in the module.
      - name: Run govulncheck
        run: |
          govulncheck ./...

      - name: Forbidden auth-type literal regression guard (G-1)
        # G-1 closed the JWT silent auth downgrade by removing "jwt" from the
        # accepted CERTCTL_AUTH_TYPE values. This step grep-fails the build
        # if "jwt" reappears in any of the *additive* auth-type surfaces:
        # the validAuthTypes / ValidAuthTypes() set, the OpenAPI enum, the
        # helm chart's allowed-types list, or the .env.example default.
        # Comment lines and the dedicated rejection branch in config.go
        # (`c.Auth.Type == "jwt"`) are intentionally exempt — those are the
        # G-1 fix itself, not a regression.
        #
        # Connector packages (internal/connector/) are exempt because the
        # Google OAuth2 service-account JWT and step-ca provisioner one-
        # time-token JWT are external-protocol uses, unrelated to certctl's
        # own auth shape. Test files (_test.go) are exempt so negative
        # tests can pass the literal.
        #
        # See docs/upgrade-to-v2-jwt-removal.md for the closure rationale,
        # or internal/config/config.go::ValidAuthTypes for the allowed set.
        #
        # How the pipeline works: the first grep emits candidates as
        # `file:line:text` (-n -H). The three `grep -v` stages then drop
        # lines that mention `_test.go`, lines whose text starts with `//`
        # or `#`, and lines containing "is no longer accepted". The trailing
        # `|| true` stops an empty result (non-zero grep exit) from aborting
        # the script under `set -e`; the verdict is the `-n "$BAD"` test.
        # The connector exemption is structural: internal/connector/ is
        # simply not in the scanned path list.
        # NOTE(review): `2>/dev/null` also hides "No such file" errors, so
        # a path in the list that is renamed or removed is skipped
        # silently — re-check the list whenever these files move.
        run: |
          set -e

          # Scoped patterns that indicate "jwt" being added back to an
          # allowed-set surface. Each catches a regression shape we've
          # actually seen in pre-G-1 code:
          #   - Go map/slice literal:  "jwt": true   or   "jwt",
          #   - Go switch case:        case "jwt"
          #   - YAML enum:             enum: [..., jwt, ...]   or   - jwt
          #   - .env conditional:      AUTH_TYPE.*"jwt"|=jwt$
          BAD=$(grep -rnEH \
              -e '"jwt"\s*:\s*true' \
              -e '"jwt"\s*,' \
              -e 'case\s+"jwt"' \
              -e 'enum:.*\bjwt\b' \
              -e '^\s*-\s*jwt\s*$' \
              -e 'AUTH_TYPE\s*=\s*jwt\s*$' \
              -e 'AUTH_TYPE\s*=\s*jwt\s*#' \
              -e 'auth\.type\s*=\s*jwt\s*$' \
              -e 'AuthType\("jwt"\)' \
              internal/config/ \
              internal/api/ \
              cmd/ \
              api/openapi.yaml \
              .env.example \
              deploy/.env.example \
              deploy/helm/certctl/values.yaml \
              deploy/helm/certctl/templates/ \
              2>/dev/null \
              | grep -v '_test.go' \
              | grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
              | grep -v 'is no longer accepted' \
              || true)
          if [ -n "$BAD" ]; then
            echo "G-1 regression: \"jwt\" reappeared in an allowed-set surface:"
            echo "$BAD"
            echo ""
            echo "Allowed surface for 'jwt' literals: comment lines, the"
            echo "dedicated rejection branch in internal/config/config.go,"
            echo "and connector packages (Google OAuth2, step-ca)."
            echo "See docs/upgrade-to-v2-jwt-removal.md and"
            echo "internal/config/config.go::ValidAuthTypes()."
            exit 1
          fi

      - name: Forbidden api_key_hash JSON-shape regression guard (G-2)
        # G-2 closed cat-s5-apikey_leak by tagging Agent.APIKeyHash
        # `json:"-"` and adding a defense-in-depth Agent.MarshalJSON that
        # zeroes the field on the marshal-time copy. This step grep-fails
        # the build if `api_key_hash` reappears in any of the *additive*
        # JSON-emitting surfaces: a Go struct json tag in internal/domain/,
        # an OpenAPI Agent schema property, a TypeScript field declaration
        # in web/src/, or an enum-list / discriminator in handler
        # production code.
        #
        # Repository, migration, seed, service, integration-test, and
        # unit-test files are exempt — those are server-internal use
        # sites (the DB column stays, the in-memory struct field stays,
        # the auth-lookup path stays). Comment lines are exempt so the
        # G-2 closure rationale can stay in the source.
        #
        # See coverage-gap-audit-2026-04-24-v5/unified-audit.md
        # cat-s5-apikey_leak for the closure rationale, or
        # internal/domain/connector.go::Agent::MarshalJSON for the
        # redaction enforcement.
        #
        # How the pipeline works: the first grep emits candidates as
        # `file:line:text` (-n -H); the `grep -v` stages drop lines that
        # mention `_test.go` and lines whose text starts with `//` or `#`.
        # The trailing `|| true` stops an empty result (non-zero grep exit)
        # from aborting the script under `set -e`. Most of the exemptions
        # listed above are structural: those directories are simply not in
        # the scanned path list.
        # NOTE(review): the only test-file filter is `_test.go`, so test
        # files under web/src/ (if any exist) are scanned like production
        # code — confirm that is intended.
        # NOTE(review): `2>/dev/null` also hides "No such file" errors, so
        # a scanned path that is renamed or removed is skipped silently.
        run: |
          set -e

          # Scoped patterns that indicate api_key_hash being added back
          # to a JSON-emitting surface. Each catches a regression shape
          # that pre-G-2 actually shipped or that a future refactor
          # could plausibly introduce:
          #   - Go struct tag:           `json:"api_key_hash"`
          #   - Frontend interface:      api_key_hash[?]: string
          #   - OpenAPI schema property: api_key_hash:   (column-aligned)
          #   - YAML enum / array:       - api_key_hash
          BAD=$(grep -rnEH \
              -e 'json:"api_key_hash[",]' \
              -e '^\s*api_key_hash\??\s*:' \
              -e '^\s*-\s*api_key_hash\s*$' \
              internal/domain/ \
              internal/api/ \
              cmd/ \
              api/openapi.yaml \
              web/src/ \
              2>/dev/null \
              | grep -v '_test.go' \
              | grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
              || true)
          if [ -n "$BAD" ]; then
            echo "G-2 regression: api_key_hash reappeared in a JSON-emitting surface:"
            echo "$BAD"
            echo ""
            echo "Allowed surface for api_key_hash literals: comment lines,"
            echo "the database column (migrations/), the in-memory struct"
            echo "field tagged \`json:\"-\"\`, and the repository / service"
            echo "use sites. See internal/domain/connector.go::Agent and"
            echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
            echo "cat-s5-apikey_leak for the closure rationale."
            exit 1
          fi

      - name: Forbidden plaintext HEALTHCHECK regression guard (U-2)
        # U-2 closed cat-u-healthcheck_protocol_mismatch by switching the
        # published image's HEALTHCHECK from `curl -f http://localhost:
        # 8443/health` (always failed against the HTTPS-only listener) to
        # `curl -fsk https://localhost:8443/health`. This step grep-fails
        # the build if a Dockerfile at the repository root (`Dockerfile`
        # or any `Dockerfile.*`, e.g. `Dockerfile.agent`) carries the
        # pre-U-2 plaintext shape — either explicitly (`http://localhost:
        # 8443/health` in a curl -f probe) or via the looser pattern of
        # any HEALTHCHECK line that targets `http://`.
        #
        # Scope and limits: only root-level files are scanned (the file
        # arguments are plain files, so `-r` does not descend anywhere),
        # and matching is line-by-line — a HEALTHCHECK whose URL sits on
        # a `\` continuation line is caught only by the second pattern.
        #
        # Comment lines and the docs/upgrade-to-tls.md:182 expected-to-
        # fail invariant ("plaintext is gone, expect Connection refused")
        # are intentionally exempt — we DO want the upgrade-doc string
        # `http://localhost:8443/health` to remain there, since it
        # documents what operators should test for to confirm plaintext
        # is dead. The guardrail is scoped to Dockerfile* only, so docs
        # are out of its reach.
        #
        # See coverage-gap-audit-2026-04-24-v5/unified-audit.md
        # cat-u-healthcheck_protocol_mismatch for the closure rationale,
        # or deploy/test/healthcheck_test.go for the binary-image
        # contract the runtime test pins.
        run: |
          set -e

          # Patterns that catch the actual regression shapes:
          #   - HEALTHCHECK directive carrying any http:// on the same
          #     line (even if the port differs, no plaintext probe
          #     should ship).
          #   - The exact pre-U-2 string for grep-friendliness.
          #
          # File list: `Dockerfile.*` already covers Dockerfile.agent, so
          # it is not named separately — listing it twice would print each
          # of its offending lines twice. `|| true` keeps an empty result
          # (non-zero grep exit) from aborting the script under `set -e`.
          BAD=$(grep -rnEH \
              -e 'HEALTHCHECK.*http://' \
              -e 'curl[^|&;]*-f[^|&;]*http://localhost:8443/health' \
              Dockerfile Dockerfile.* 2>/dev/null \
              | grep -vE '^\s*[^:]+:[0-9]+:\s*#' \
              || true)
          if [ -n "$BAD" ]; then
            echo "U-2 regression: plaintext HEALTHCHECK reappeared in a Dockerfile:"
            echo "$BAD"
            echo ""
            echo "Allowed: HTTPS HEALTHCHECK with -k (acceptable for"
            echo "localhost-to-localhost), or non-HTTP probe shapes"
            echo "(pgrep, /proc check). See Dockerfile / Dockerfile.agent"
            echo "for the post-U-2 reference shape and"
            echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
            echo "cat-u-healthcheck_protocol_mismatch for rationale."
            exit 1
          fi

      - name: Forbidden migration mount in compose initdb (U-3)
        # U-3 closed cat-u-seed_initdb_schema_drift (GitHub #10) by
        # eliminating the dual-source-of-truth between
        # `migrations/*.up.sql` mounted into postgres
        # `/docker-entrypoint-initdb.d/` and the same files re-applied at
        # runtime by `RunMigrations`. Pre-U-3 every new migration that
        # the seed depended on (000013 added `policy_rules.severity`,
        # 000017 renames `retry_interval_seconds`, etc.) had to be added
        # by hand to the compose mount list; missing the update crashed
        # initdb on first boot, postgres flagged unhealthy, and the
        # whole stack failed to start from a fresh clone. Post-U-3 the
        # server is the single source of truth — `RunMigrations` +
        # `RunSeed` apply everything at boot.
        #
        # This step grep-fails the build if any compose file under
        # `deploy/` re-introduces a `migrations/.*\.sql` mount into
        # `/docker-entrypoint-initdb.d`. Comments are exempt so the
        # post-fix rationale block in the compose files (which
        # documents WHY the mounts were removed) doesn't trip the guard.
        # The demo overlay's `seed_demo.sql` is the explicit exception:
        # it is tolerated only when it lives behind the
        # CERTCTL_DEMO_SEED env var (post-U-3 demo path) — bare initdb
        # mounts are NOT tolerated. The grep matches all compose
        # mount-list shapes (`-` indented, `volumes:` indented, both),
        # so any future drift surfaces here before the operator hits it
        # on a fresh clone.
        #
        # See coverage-gap-audit-2026-04-24-v5/unified-audit.md
        # cat-u-seed_initdb_schema_drift for the closure rationale, or
        # internal/repository/postgres/db.go::RunSeed for the runtime
        # contract.
        #
        # How the check is wired: only the three compose files named in
        # the grep below are scanned, so a new compose file under deploy/
        # must be added to that list to be covered. A line is flagged
        # when a `migrations/…​.sql:` or `seed…​.sql:` mount source is
        # followed on the same line by `docker-entrypoint-initdb`; the
        # second grep drops lines whose text starts with `#`. The
        # trailing `|| true` stops an empty result (non-zero grep exit)
        # from aborting the script under `set -e`.
        # NOTE(review): `2>/dev/null` also hides "No such file" errors, so
        # a listed compose file that is renamed or removed is skipped
        # silently.
        run: |
          set -e

          BAD=$(grep -rnEH \
              -e 'migrations/.*\.sql:.*docker-entrypoint-initdb' \
              -e 'seed.*\.sql:.*docker-entrypoint-initdb' \
              deploy/docker-compose.yml \
              deploy/docker-compose.test.yml \
              deploy/docker-compose.demo.yml \
              2>/dev/null \
              | grep -vE '^\s*[^:]+:[0-9]+:\s*#' \
              || true)
          if [ -n "$BAD" ]; then
            echo "U-3 regression: migration/seed mount into postgres initdb reappeared:"
            echo "$BAD"
            echo ""
            echo "The post-U-3 contract is: postgres comes up with an empty"
            echo "schema and the server applies migrations + seed at boot via"
            echo "internal/repository/postgres.RunMigrations + RunSeed. Demo"
            echo "data lives behind CERTCTL_DEMO_SEED=true (RunDemoSeed),"
            echo "not an initdb mount. See"
            echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
            echo "cat-u-seed_initdb_schema_drift for the closure rationale."
            exit 1
          fi

      - name: Race Detection
        # Run the listed package trees under the race detector. -count=1
        # disables test-result caching; the whole run is capped at 300s.
        run: |
          go test -race \
            ./internal/service/... \
            ./internal/api/handler/... \
            ./internal/api/middleware/... \
            ./internal/scheduler/... \
            ./internal/connector/... \
            ./internal/crypto/... \
            ./internal/domain/... \
            ./internal/validation/... \
            ./internal/tlsprobe/... \
            -count=1 -timeout 300s

      - name: Go Test with Coverage
        # Run the listed package trees uncached (-count=1) and write the
        # combined coverage profile to coverage.out, which the threshold
        # check and the artifact upload consume.
        run: |
          go test \
            ./internal/service/... \
            ./internal/api/handler/... \
            ./internal/api/middleware/... \
            ./internal/integration/... \
            ./internal/connector/issuer/... \
            ./internal/connector/target/... \
            ./internal/connector/notifier/... \
            ./internal/connector/discovery/... \
            ./internal/crypto/... \
            ./internal/mcp/... \
            ./internal/cli/... \
            ./internal/domain/... \
            ./internal/validation/... \
            ./internal/tlsprobe/... \
            -count=1 -cover -coverprofile=coverage.out

      - name: Check Coverage Thresholds
        # Enforces a minimum coverage floor per layer, computed from the
        # coverage.out profile in the workspace.
        #
        # Each layer's figure is the unweighted mean of the percentages on
        # the `go tool cover -func` report lines whose path contains the
        # layer's directory — a per-function average, not a
        # statement-weighted package percentage.
        #
        # Enforced floors (and the stated targets where they differ):
        #   service    55%  (target: 60%+)
        #   handler    60%
        #   domain     40%
        #   middleware 30%  (target: 50%+)
        #   crypto     85%
        run: |
          # Render the per-function report once and reuse it for every
          # lookup instead of re-running `go tool cover` per layer.
          FUNC_REPORT=$(go tool cover -func=coverage.out)

          echo "=== Coverage Report ==="
          printf '%s\n' "$FUNC_REPORT" | tail -1

          # layer_coverage PATH_FRAGMENT
          # Prints the mean (one decimal place) of the trailing percentage
          # on every report line containing PATH_FRAGMENT, or 0 when no
          # line matches.
          layer_coverage() {
            printf '%s\n' "$FUNC_REPORT" | awk -v fragment="$1" '
              index($0, fragment) {
                pct = $NF
                sub(/%/, "", pct)
                sum += pct
                n++
              }
              END { if (n > 0) printf "%.1f", sum / n; else print "0" }
            '
          }

          # require_min LABEL ACTUAL MINIMUM
          # Fails the step unless ACTUAL >= MINIMUM. The comparison is done
          # in awk and fails closed: an empty or non-numeric ACTUAL counts
          # as 0, and any awk failure counts as "below threshold". (A
          # `[ "$(... | bc -l)" -eq 1 ]` test would evaluate false, and so
          # pass the check, whenever bc is missing or errors.)
          require_min() {
            if ! awk -v actual="$2" -v minimum="$3" \
                'BEGIN { exit !(actual + 0 >= minimum + 0) }'; then
              echo "::error::$1 coverage $2% is below $3% threshold"
              exit 1
            fi
          }

          SERVICE_COV=$(layer_coverage 'internal/service')
          echo "Service layer coverage: ${SERVICE_COV}%"

          HANDLER_COV=$(layer_coverage 'internal/api/handler')
          echo "Handler layer coverage: ${HANDLER_COV}%"

          DOMAIN_COV=$(layer_coverage 'internal/domain')
          echo "Domain layer coverage: ${DOMAIN_COV}%"

          MIDDLEWARE_COV=$(layer_coverage 'internal/api/middleware')
          echo "Middleware layer coverage: ${MIDDLEWARE_COV}%"

          # M-8 rationale: encryption primitives are a security-critical gate.
          # v2 format, key-derivation, fallback, and fail-closed sentinel paths
          # all need exhaustive coverage to avoid silent regressions (CWE-916 / CWE-329).
          CRYPTO_COV=$(layer_coverage 'internal/crypto')
          echo "Crypto package coverage: ${CRYPTO_COV}%"

          # All figures are printed above before any floor is enforced, so
          # a failing run still shows the full picture.
          require_min "Service layer" "$SERVICE_COV" 55
          require_min "Handler layer" "$HANDLER_COV" 60
          require_min "Domain layer" "$DOMAIN_COV" 40
          require_min "Middleware layer" "$MIDDLEWARE_COV" 30
          require_min "Crypto package" "$CRYPTO_COV" 85
          echo "Coverage thresholds passed!"

      # Publish the raw coverage profile as the `go-coverage` artifact,
      # retained for 30 days.
      - name: Upload Coverage Report
        uses: actions/upload-artifact@v4
        with:
          path: coverage.out
          name: go-coverage
          retention-days: 30

  frontend-build:
    name: Frontend Build
    runs-on: ubuntu-latest
    # Least privilege: the job only checks out, type-checks, tests, and
    # builds the frontend, so its GITHUB_TOKEN is restricted to read access
    # instead of inheriting the repository-wide default.
    permissions:
      contents: read
    # Every `run` step in this job operates on the web/ package. Setting the
    # directory once here replaces the per-step `working-directory` keys;
    # `uses` steps are unaffected by this default.
    defaults:
      run:
        working-directory: web
    steps:
      - uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'

      # Clean install of the dependencies.
      - name: Install Dependencies
        run: npm ci

      # Type-check only; no JavaScript is emitted.
      - name: TypeScript Check
        run: npx tsc --noEmit

      # Single non-watch test run.
      - name: Run Frontend Tests
        run: npx vitest run

      - name: Build Frontend
        run: npx vite build

  helm-lint:
    name: Helm Chart Validation
    runs-on: ubuntu-latest
    # Least privilege: linting and rendering the chart only reads the
    # repository, so the job's GITHUB_TOKEN is restricted to read access
    # instead of inheriting the repository-wide default.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4

      # Install the pinned Helm release used by the lint/template steps.
      - name: Install Helm
        uses: azure/setup-helm@v4
        with:
          version: '3.13.0'

      # Since HTTPS-Everywhere (v2.0.47) the chart refuses to render unless
      # a TLS source is configured, so each lint/template call in this job
      # has to select exactly one provisioning mode. The guard lives in
      # deploy/helm/certctl/templates/_helpers.tpl (certctl.tls.required);
      # docs/tls.md describes the modes.
      - name: Lint Helm Chart
        run: helm lint deploy/helm/certctl/ --set server.tls.existingSecret=certctl-tls-ci

      - name: Template Helm Chart (existingSecret mode)
        # Render with a pre-existing TLS secret. Only the exit status
        # matters, so the rendered manifests are discarded.
        run: >-
          helm template certctl deploy/helm/certctl/
          --set server.tls.existingSecret=certctl-tls-ci
          > /dev/null

      - name: Template Helm Chart (cert-manager mode)
        # Render with cert-manager provisioning enabled and a named issuer.
        # The rendered manifests are discarded here as well.
        run: >-
          helm template certctl deploy/helm/certctl/
          --set server.tls.certManager.enabled=true
          --set server.tls.certManager.issuerRef.name=letsencrypt-prod
          > /dev/null

      - name: Template Helm Chart (guard fails without TLS)
        # Negative test: with no TLS source configured the chart must NOT
        # render. A successful render means the fail-loud guard in
        # certctl.tls.required has regressed.
        # NOTE(review): any render failure satisfies this check, not only
        # the TLS guard; it relies on the earlier template steps having
        # shown that the chart renders when a TLS source is set.
        run: |
          if ! helm template certctl deploy/helm/certctl/ > /dev/null 2>&1; then
            # Expected outcome: rendering was refused.
            exit 0
          fi
          echo "::error::Helm chart rendered without a TLS source — fail-loud guard regressed"
          exit 1