mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 22:41:31 +00:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| aa6fafdee9 | |||
| 86fffa305a | |||
| e17788355b | |||
| 87213128cc | |||
| 697fa792ea | |||
| 9c1d446e40 | |||
| 3192cd15c5 | |||
| af47d19ae2 | |||
| cfc234ec42 | |||
| a91197014f |
+25
-4
@@ -13,22 +13,43 @@ POSTGRES_PASSWORD=change-me-in-production
|
||||
# Certctl Server
|
||||
# All server vars use the CERTCTL_ prefix (see internal/config/config.go)
|
||||
# ==============================================================================
|
||||
CERTCTL_DATABASE_URL=postgres://certctl:certctl@postgres:5432/certctl?sslmode=disable
|
||||
# IMPORTANT: keep the password segment of CERTCTL_DATABASE_URL in sync with
|
||||
# POSTGRES_PASSWORD above. If you deploy via `deploy/docker-compose.yml`,
|
||||
# this value is *overridden* by the compose file's
|
||||
# `postgres://certctl:${POSTGRES_PASSWORD:-certctl}@postgres:5432/...`
|
||||
# interpolation — but if you run the binary directly with this .env loaded
|
||||
# (e.g. `set -a; source .env; ./certctl-server`), update *both* lines.
|
||||
# Background: editing POSTGRES_PASSWORD after the postgres data directory
|
||||
# has been initialized once does NOT rotate the password — initdb only
|
||||
# seeds pg_authid on first boot of an empty volume. See docs/quickstart.md
|
||||
# "Warning" callout and `internal/repository/postgres/db.go::wrapPingError`
|
||||
# for the SQLSTATE 28P01 diagnostic that fires when the two drift.
|
||||
CERTCTL_DATABASE_URL=postgres://certctl:change-me-in-production@postgres:5432/certctl?sslmode=disable
|
||||
CERTCTL_SERVER_HOST=0.0.0.0
|
||||
CERTCTL_SERVER_PORT=8443
|
||||
CERTCTL_LOG_LEVEL=info
|
||||
CERTCTL_LOG_FORMAT=json
|
||||
|
||||
# Auth type: "api-key", "jwt", or "none" (for demo/development)
|
||||
# Auth type: "api-key" (production) or "none" (demo/development).
|
||||
# For JWT/OIDC, run an authenticating gateway in front of certctl
|
||||
# (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium) and
|
||||
# set CERTCTL_AUTH_TYPE=none on the upstream — see
|
||||
# docs/architecture.md "Authenticating-gateway pattern". G-1 removed
|
||||
# the in-process "jwt" option (no JWT middleware shipped — silent auth
|
||||
# downgrade); see docs/upgrade-to-v2-jwt-removal.md if you previously
|
||||
# set CERTCTL_AUTH_TYPE=jwt.
|
||||
CERTCTL_AUTH_TYPE=none
|
||||
# Required when CERTCTL_AUTH_TYPE is "api-key" or "jwt"
|
||||
# Required when CERTCTL_AUTH_TYPE is "api-key".
|
||||
# Generate with: openssl rand -base64 32
|
||||
# CERTCTL_AUTH_SECRET=change-me-in-production
|
||||
|
||||
# ==============================================================================
|
||||
# Certctl Agent
|
||||
# ==============================================================================
|
||||
CERTCTL_SERVER_URL=http://localhost:8443
|
||||
# HTTPS-only as of v2.2 (TLS 1.3 pinned). Agents reject http:// URLs at
|
||||
# startup. Use the docker-compose self-signed bootstrap CA bundle from
|
||||
# `deploy/test/certs/ca.crt` or supply your own via CERTCTL_SERVER_CA_BUNDLE_PATH.
|
||||
CERTCTL_SERVER_URL=https://localhost:8443
|
||||
CERTCTL_API_KEY=change-me-in-production
|
||||
CERTCTL_AGENT_NAME=local-agent
|
||||
|
||||
|
||||
@@ -44,6 +44,175 @@ jobs:
|
||||
- name: Run govulncheck
|
||||
run: govulncheck ./...
|
||||
|
||||
- name: Forbidden auth-type literal regression guard (G-1)
|
||||
# G-1 closed the JWT silent auth downgrade by removing "jwt" from the
|
||||
# accepted CERTCTL_AUTH_TYPE values. This step grep-fails the build
|
||||
# if "jwt" reappears in any of the *additive* auth-type surfaces:
|
||||
# the validAuthTypes / ValidAuthTypes() set, the OpenAPI enum, the
|
||||
# helm chart's allowed-types list, or the .env.example default.
|
||||
# Comment lines and the dedicated rejection branch in config.go
|
||||
# (`c.Auth.Type == "jwt"`) are intentionally exempt — those are the
|
||||
# G-1 fix itself, not a regression.
|
||||
#
|
||||
# Connector packages (internal/connector/) are exempt because the
|
||||
# Google OAuth2 service-account JWT and step-ca provisioner one-
|
||||
# time-token JWT are external-protocol uses, unrelated to certctl's
|
||||
# own auth shape. Test files (_test.go) are exempt so negative
|
||||
# tests can pass the literal.
|
||||
#
|
||||
# See docs/upgrade-to-v2-jwt-removal.md for the closure rationale,
|
||||
# or internal/config/config.go::ValidAuthTypes for the allowed set.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Scoped patterns that indicate "jwt" being added back to an
|
||||
# allowed-set surface. Each catches a regression shape we've
|
||||
# actually seen in pre-G-1 code:
|
||||
# - Go map/slice literal: "jwt": true or "jwt",
|
||||
# - Go switch case: case "jwt"
|
||||
# - YAML enum: enum: [..., jwt, ...] or - jwt
|
||||
# - .env conditional: AUTH_TYPE.*"jwt"|=jwt$
|
||||
BAD=$(grep -rnEH \
|
||||
-e '"jwt"\s*:\s*true' \
|
||||
-e '"jwt"\s*,' \
|
||||
-e 'case\s+"jwt"' \
|
||||
-e 'enum:.*\bjwt\b' \
|
||||
-e '^\s*-\s*jwt\s*$' \
|
||||
-e 'AUTH_TYPE\s*=\s*jwt\s*$' \
|
||||
-e 'AUTH_TYPE\s*=\s*jwt\s*#' \
|
||||
-e 'auth\.type\s*=\s*jwt\s*$' \
|
||||
-e 'AuthType\("jwt"\)' \
|
||||
internal/config/ \
|
||||
internal/api/ \
|
||||
cmd/ \
|
||||
api/openapi.yaml \
|
||||
.env.example \
|
||||
deploy/.env.example \
|
||||
deploy/helm/certctl/values.yaml \
|
||||
deploy/helm/certctl/templates/ \
|
||||
2>/dev/null \
|
||||
| grep -v '_test.go' \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
|
||||
| grep -v 'is no longer accepted' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "G-1 regression: \"jwt\" reappeared in an allowed-set surface:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed surface for 'jwt' literals: comment lines, the"
|
||||
echo "dedicated rejection branch in internal/config/config.go,"
|
||||
echo "and connector packages (Google OAuth2, step-ca)."
|
||||
echo "See docs/upgrade-to-v2-jwt-removal.md and"
|
||||
echo "internal/config/config.go::ValidAuthTypes()."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Forbidden api_key_hash JSON-shape regression guard (G-2)
|
||||
# G-2 closed cat-s5-apikey_leak by tagging Agent.APIKeyHash
|
||||
# `json:"-"` and adding a defense-in-depth Agent.MarshalJSON that
|
||||
# zeroes the field on the marshal-time copy. This step grep-fails
|
||||
# the build if `api_key_hash` reappears in any of the *additive*
|
||||
# JSON-emitting surfaces: a Go struct json tag in internal/domain/,
|
||||
# an OpenAPI Agent schema property, a TypeScript field declaration
|
||||
# in web/src/, or an enum-list / discriminator in handler
|
||||
# production code.
|
||||
#
|
||||
# Repository, migration, seed, service, integration-test, and
|
||||
# unit-test files are exempt — those are server-internal use
|
||||
# sites (the DB column stays, the in-memory struct field stays,
|
||||
# the auth-lookup path stays). Comment lines are exempt so the
|
||||
# G-2 closure rationale can stay in the source.
|
||||
#
|
||||
# See coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
# cat-s5-apikey_leak for the closure rationale, or
|
||||
# internal/domain/connector.go::Agent::MarshalJSON for the
|
||||
# redaction enforcement.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Scoped patterns that indicate api_key_hash being added back
|
||||
# to a JSON-emitting surface. Each catches a regression shape
|
||||
# that pre-G-2 actually shipped or that a future refactor
|
||||
# could plausibly introduce:
|
||||
# - Go struct tag: `json:"api_key_hash"`
|
||||
# - Frontend interface: api_key_hash[?]: string
|
||||
# - OpenAPI schema property: api_key_hash: (column-aligned)
|
||||
# - YAML enum / array: - api_key_hash
|
||||
BAD=$(grep -rnEH \
|
||||
-e 'json:"api_key_hash[",]' \
|
||||
-e '^\s*api_key_hash\??\s*:' \
|
||||
-e '^\s*-\s*api_key_hash\s*$' \
|
||||
internal/domain/ \
|
||||
internal/api/ \
|
||||
cmd/ \
|
||||
api/openapi.yaml \
|
||||
web/src/ \
|
||||
2>/dev/null \
|
||||
| grep -v '_test.go' \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "G-2 regression: api_key_hash reappeared in a JSON-emitting surface:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed surface for api_key_hash literals: comment lines,"
|
||||
echo "the database column (migrations/), the in-memory struct"
|
||||
echo "field tagged \`json:\"-\"\`, and the repository / service"
|
||||
echo "use sites. See internal/domain/connector.go::Agent and"
|
||||
echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
|
||||
echo "cat-s5-apikey_leak for the closure rationale."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Forbidden plaintext HEALTHCHECK regression guard (U-2)
|
||||
# U-2 closed cat-u-healthcheck_protocol_mismatch by switching the
|
||||
# published image's HEALTHCHECK from `curl -f http://localhost:
|
||||
# 8443/health` (always failed against the HTTPS-only listener) to
|
||||
# `curl -fsk https://localhost:8443/health`. This step grep-fails
|
||||
# the build if any Dockerfile in the repo carries the pre-U-2
|
||||
# plaintext shape — either explicitly (`http://localhost:8443/
|
||||
# health` in a HEALTHCHECK) or via the looser pattern of any
|
||||
# HEALTHCHECK that targets `http://` against the certctl server
|
||||
# port.
|
||||
#
|
||||
# Comment lines and the docs/upgrade-to-tls.md:182 expected-to-
|
||||
# fail invariant ("plaintext is gone, expect Connection refused")
|
||||
# are intentionally exempt — we DO want the upgrade-doc string
|
||||
# `http://localhost:8443/health` to remain there, since it
|
||||
# documents what operators should test for to confirm plaintext
|
||||
# is dead. The guardrail is scoped to Dockerfile* only, so docs
|
||||
# are out of its reach.
|
||||
#
|
||||
# See coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
# cat-u-healthcheck_protocol_mismatch for the closure rationale,
|
||||
# or deploy/test/healthcheck_test.go for the binary-image
|
||||
# contract the runtime test pins.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Patterns that catch the actual regression shapes:
|
||||
# - HEALTHCHECK directive carrying any http:// (even if the
|
||||
# port differs, no plaintext probe should ship).
|
||||
# - The exact pre-U-2 string for grep-friendliness.
|
||||
BAD=$(grep -rnEH \
|
||||
-e 'HEALTHCHECK.*http://' \
|
||||
-e 'curl[^|&;]*-f[^|&;]*http://localhost:8443/health' \
|
||||
Dockerfile Dockerfile.agent Dockerfile.* 2>/dev/null \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*#' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "U-2 regression: plaintext HEALTHCHECK reappeared in a Dockerfile:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed: HTTPS HEALTHCHECK with -k (acceptable for"
|
||||
echo "localhost-to-localhost), or non-HTTP probe shapes"
|
||||
echo "(pgrep, /proc check). See Dockerfile / Dockerfile.agent"
|
||||
echo "for the post-U-2 reference shape and"
|
||||
echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
|
||||
echo "cat-u-healthcheck_protocol_mismatch for rationale."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Race Detection
|
||||
run: go test -race ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/scheduler/... ./internal/connector/... ./internal/crypto/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... -count=1 -timeout 300s
|
||||
|
||||
|
||||
@@ -2,6 +2,63 @@
|
||||
|
||||
All notable changes to certctl are documented in this file. Dates use ISO 8601. Versions follow [Semantic Versioning](https://semver.org/).
|
||||
|
||||
## [unreleased] — 2026-04-24
|
||||
|
||||
### G-1: JWT silent auth downgrade — closed end-to-end
|
||||
|
||||
> Pre-G-1 the config validator accepted `CERTCTL_AUTH_TYPE=jwt` and the startup log faithfully echoed `"authentication enabled" "type"="jwt"`. Reasonable people read that and concluded JWT was on. It wasn't. The auth-middleware wiring at `cmd/server/main.go` unconditionally routed every request through the api-key bearer middleware regardless of `cfg.Auth.Type`. So `CERTCTL_AUTH_TYPE=jwt` quietly compared incoming `Authorization: Bearer <something>` against whatever string the operator put in `CERTCTL_AUTH_SECRET` — real JWT clients got 401, and operators who treated `CERTCTL_AUTH_SECRET` as a *signing* secret (because they thought they were configuring JWT) had effectively handed an attacker an api-key. A security finding masquerading as a config option. We chose to remove the option rather than ship JWT middleware — the audit-recommended structural fix that closes the hazard. Operators who actually need JWT/OIDC front certctl with an authenticating gateway (oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia) and run the upstream certctl with `CERTCTL_AUTH_TYPE=none`. The same pattern works on docker-compose and Helm.
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
- **`CERTCTL_AUTH_TYPE=jwt` is no longer accepted.** Pre-G-1 the value was silently downgraded to api-key middleware. Post-G-1 the server fails at startup with a dedicated diagnostic naming the authenticating-gateway pattern. Operators with this in their env block must either switch to `api-key` (if they were de facto using api-key auth all along — same Bearer token continues to work) or switch to `none` and front certctl with an oauth2-proxy / Envoy / Traefik / Pomerium gateway. See [`docs/upgrade-to-v2-jwt-removal.md`](docs/upgrade-to-v2-jwt-removal.md).
|
||||
- **Helm chart `server.auth.type=jwt` now fails at `helm install` / `helm upgrade` template time.** New `certctl.validateAuthType` template helper runs on every template that depends on `.Values.server.auth.type` (`server-deployment.yaml`, `server-configmap.yaml`, `server-secret.yaml`) and fails the render with a pointer at the gateway-fronting pattern.
|
||||
- **OpenAPI spec `auth_type` enum no longer includes `jwt`.** API consumers checking `/api/v1/auth/info` against the spec will see a smaller enum.
|
||||
|
||||
### Removed
|
||||
|
||||
- Documented references to JWT in the certctl auth surface (config docblocks, middleware/health-handler comments, `.env.example`, `docs/architecture.md` middleware-stack bullet). Connector-level JWT references (Google OAuth2 service-account JWT in `internal/connector/discovery/gcpsm/`, `internal/connector/issuer/googlecas/`; step-ca's provisioner one-time-token JWT in `internal/connector/issuer/stepca/`) are unrelated and untouched — those are external-protocol uses, not certctl's own auth shape.
|
||||
|
||||
### Added
|
||||
|
||||
- **`config.AuthType` typed alias** with `AuthTypeAPIKey` / `AuthTypeNone` exported constants. Single source of truth for the allowed set across the validator, the runtime defense-in-depth switch in `main.go`, and the helm chart's `validateAuthType` helper.
|
||||
- **`config.ValidAuthTypes()`** helper returning the complete allowed set; pinned by a property test (`TestValidAuthTypesDoesNotContainJWT`) that fails the build if `"jwt"` is ever re-added to the slice.
|
||||
- **Defense-in-depth runtime guard** in `cmd/server/main.go` immediately after `config.Load()` — a `switch config.AuthType(cfg.Auth.Type)` that exits 1 if the validator was bypassed (test harness, alt config loader, env-var rebinding).
|
||||
- **`certctl.validateAuthType` Helm template helper** mirroring the existing `certctl.tls.required` pattern. Fails template render on any `server.auth.type` outside `{api-key, none}`.
|
||||
- **`docs/architecture.md` "Authenticating-gateway pattern (JWT, OIDC, mTLS)"** section explaining the design rationale for the narrow in-process auth surface and listing oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia / Caddy `forward_auth` / Apache `mod_auth_openidc` / nginx `auth_request` as the standard fronting options.
|
||||
- **`docs/upgrade-to-v2-jwt-removal.md`** migration guide. Same shape as `docs/upgrade-to-tls.md`. Walks through the dedicated startup error, both recovery paths (`api-key` vs gateway-fronting), a complete docker-compose oauth2-proxy walkthrough, Traefik ForwardAuth and Envoy `ext_authz` patterns, and rollback posture.
|
||||
- **`deploy/helm/certctl/README.md`** "JWT / OIDC via authenticating gateway" section with a Kubernetes-flavored oauth2-proxy + certctl walkthrough.
|
||||
- **CI regression guardrail** in `.github/workflows/ci.yml` (`Forbidden auth-type literal regression guard (G-1)`) — grep-fails the build if `"jwt"` appears as an auth-type literal in production code or spec. Connector packages exempt (legitimate external-protocol uses).
|
||||
- **Negative test coverage** in `internal/config/config_test.go`: `TestValidate_JWTAuth_RejectedDedicated` (two table rows pinning that the dedicated G-1 error fires regardless of whether `Secret` is set), `TestValidAuthTypesDoesNotContainJWT` (property-level guard), `TestValidAuthTypesIsExactly_APIKey_None` (allowed-set contract), `TestValidate_GenericInvalidAuthType` (pins that other invalid values still surface the generic invalid-auth-type error, so the dedicated G-1 path doesn't accidentally swallow non-jwt typos).
|
||||
|
||||
### Changed
|
||||
|
||||
- `internal/api/middleware/middleware.go::AuthConfig.Type` field comment now references the typed `config.AuthType` constants instead of an inline string enumeration.
|
||||
- `internal/api/handler/health.go::HealthHandler.AuthType` field comment same treatment.
|
||||
- `internal/api/handler/health_test.go` — the prior `TestAuthInfo_ReturnsAuthType_JWT` (which asserted the handler echoed `"jwt"`, baking the silent-downgrade lie into the regression suite) is removed; the pre-existing `TestAuthInfo_ReturnsAuthType_APIKey` continues to cover the api-key happy path.
|
||||
- Auth-disabled startup log in `main.go` now points operators at the authenticating-gateway pattern explicitly.
|
||||
|
||||
### U-2: Dockerfile HEALTHCHECK protocol mismatch — closed end-to-end
|
||||
|
||||
> Pre-U-2 the published `ghcr.io/shankar0123/certctl-server` image shipped with `HEALTHCHECK CMD curl -f http://localhost:8443/health`. The server has been HTTPS-only since the v2.2 HTTPS-Everywhere milestone (`cmd/server/main.go::ListenAndServeTLS`, no plaintext fallback, TLS 1.3 pinned), so the probe failed every interval and Docker marked the container `unhealthy` indefinitely. Operators inside docker-compose / Helm / the example stacks were unaffected — compose overrides the HEALTHCHECK with `--cacert + https://`, Helm uses explicit `httpGet` probes that ignore Docker's HEALTHCHECK, and every example compose file overrides with `curl -sfk https://localhost:8443/health`. But anyone running bare `docker run` / Docker Swarm / Nomad / ECS — exactly the "I just pulled the published image" path — saw permanent `unhealthy` status and (depending on orchestrator policy) a restart-loop. Recon for U-2 also surfaced two adjacent bugs from the same v2.2 milestone gap: the Helm chart's `readinessProbe.httpGet.path` pointed at `/readyz`, a route the server doesn't register (only `/health` and `/ready` are wired and bypass the auth middleware), so K8s readiness probes were getting 404/auth-rejection and pods stayed `NotReady`; and the agent image had no HEALTHCHECK at all (the compose override called `pgrep -f certctl-agent` against an image that didn't ship `procps` — latent always-fail). All three are closed in this commit.
|
||||
|
||||
### Fixed
|
||||
|
||||
- **`Dockerfile` HEALTHCHECK now speaks HTTPS.** Bare `docker run` / Swarm / Nomad / ECS users no longer see `unhealthy` forever. The probe uses `curl -fsk https://localhost:8443/health` — `-k` (insecure) is acceptable because the probe is localhost-to-localhost: the same process serving the cert is being probed; the probe never traverses a network. Compose / Helm / examples already perform full cert-chain validation and are unaffected.
|
||||
- **Helm `server.readinessProbe.httpGet.path` corrected from `/readyz` to `/ready`.** The `/readyz` path was never registered as a no-auth route (see `internal/api/router/router.go:81` and `cmd/server/main.go:920`), so K8s readiness probes received 401 (api-key auth rejection) or 404 (when auth was disabled). Pods previously failed to report Ready under most realistic Helm deployments. Liveness probe path (`/health`) was already correct and is unchanged.
|
||||
- **`docs/connectors.md` curl examples** (15 sites) updated from `http://localhost:8443/...` to `https://localhost:8443/...` with a one-time `--cacert "$CA"` extraction note matching the existing pattern in `docs/quickstart.md`. Pre-U-2 these examples silently failed against the HTTPS listener.
|
||||
|
||||
### Added
|
||||
|
||||
- **`Dockerfile.agent` HEALTHCHECK** — `pgrep -f certctl-agent` process-presence check (the agent has no HTTP listener; presence is the right primitive). Bare-`docker run` agents now report health-status the same way compose-managed ones do. Also adds `procps` to the runtime image so `pgrep` is actually available — pre-U-2 the docker-compose override at `deploy/docker-compose.yml:173` called `pgrep -f certctl-agent` against an image that lacked it (latent always-fail; container was reported unhealthy in compose too, just rarely noticed because nothing acted on the signal).
|
||||
- **`deploy/test/healthcheck_test.go`** (`//go:build integration`) — image-level integration tests. `TestPublishedServerImage_HealthcheckSpecUsesHTTPS` builds the server image, inspects `Config.Healthcheck.Test` via `docker inspect`, and asserts the array contains `https://localhost:8443/health` and `-k`, and does NOT contain `http://localhost:8443/health` (negative regression contract). `TestPublishedAgentImage_HealthcheckSpecExists` builds the agent image and asserts the HEALTHCHECK uses `pgrep` against `certctl-agent`. Both tests `t.Skip` cleanly when docker isn't available (sandbox / CI without docker-in-docker). A third runtime test (`TestPublishedServerImage_HealthcheckTransitionsToHealthy`) is a `t.Skip` placeholder until the harness wires a sidecar postgres for image-level smoke — documented honestly so the next refactor adopts it instead of rediscovering the gap.
|
||||
- **CI regression guardrail** in `.github/workflows/ci.yml` (`Forbidden plaintext HEALTHCHECK regression guard (U-2)`) — grep-fails the build if any `Dockerfile*` carries `HEALTHCHECK.*http://` or `curl -f http://localhost:8443/health`. Comments exempt; the `docs/upgrade-to-tls.md:182` post-cutover invariant string (which deliberately documents the expected-failure shape) is out of the guardrail's scope because the guardrail only scans Dockerfiles.
|
||||
|
||||
### Changed
|
||||
|
||||
- `Dockerfile` final-stage HEALTHCHECK lines now carry a long-form docblock explaining the `-k` design choice, the published-image vs compose vs Helm vs examples coverage matrix, and cross-references to the audit closure + the integration test.
|
||||
- `Dockerfile.agent` runtime stage adds `procps` to the apk install so the new HEALTHCHECK and the existing compose override both have a working `pgrep`.
|
||||
- `deploy/helm/certctl/values.yaml` server probes block now carries an explanatory comment naming the registered probe routes (`/health`, `/ready`) and the U-2 closure rationale for the `/readyz` → `/ready` correction.
|
||||
|
||||
## [2.2.0] — 2026-04-19
|
||||
|
||||
### HTTPS Everywhere — The Irony
|
||||
|
||||
+28
-1
@@ -76,7 +76,34 @@ USER certctl
|
||||
|
||||
EXPOSE 8443
|
||||
|
||||
# Image-level HEALTHCHECK for bare `docker run` / Docker Swarm / Nomad / ECS.
|
||||
#
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch): pre-U-2 this probe used
|
||||
# `curl -f http://localhost:8443/health`, which always failed against the
|
||||
# HTTPS-only listener (HTTPS-Everywhere milestone, v2.2 / tag v2.0.47 —
|
||||
# `cmd/server/main.go::ListenAndServeTLS`, no plaintext fallback, TLS 1.3
|
||||
# pinned). Operators outside docker-compose / Helm saw permanent
|
||||
# `unhealthy` status and a restart-loop the first time they pulled the
|
||||
# image. The compose stack overrides this HEALTHCHECK with `--cacert` to
|
||||
# the bootstrap CA bundle (deploy/docker-compose.yml:126); the Helm chart
|
||||
# uses explicit `httpGet` probes with `scheme: HTTPS` and ignores Docker's
|
||||
# HEALTHCHECK; every example compose file in `examples/*/docker-compose.yml`
|
||||
# overrides with `curl -sfk https://localhost:8443/health`. This image-
|
||||
# level probe is for the bare-`docker run` consumer ONLY.
|
||||
#
|
||||
# `-k` (insecure) is acceptable here because the probe is localhost-to-
|
||||
# localhost: the same process serving the cert is being probed; the probe
|
||||
# never traverses a network. Pinning a `--cacert` is not viable for the
|
||||
# published image because the bootstrap cert is per-deploy (generated into
|
||||
# the `certs` named volume on first up; operator-supplied via Helm's
|
||||
# `existingSecret` or cert-manager). Compose / Helm / examples already
|
||||
# perform full cert-chain validation and are unaffected.
|
||||
#
|
||||
# CI grep guardrail at .github/workflows/ci.yml ("Forbidden plaintext
|
||||
# HEALTHCHECK regression guard (U-2)") blocks reintroduction of the
|
||||
# `http://` shape. Image-level integration test in
|
||||
# deploy/test/healthcheck_test.go pins the contract end-to-end.
|
||||
HEALTHCHECK --interval=10s --timeout=5s --start-period=5s --retries=5 \
|
||||
CMD curl -f http://localhost:8443/health || exit 1
|
||||
CMD curl -fsk https://localhost:8443/health || exit 1
|
||||
|
||||
ENTRYPOINT ["/app/server"]
|
||||
|
||||
+23
-1
@@ -36,7 +36,14 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build \
|
||||
# Stage 2: Runtime
|
||||
FROM alpine:3.19
|
||||
|
||||
RUN apk add --no-cache ca-certificates curl
|
||||
# U-2: `procps` ships pgrep, which the HEALTHCHECK below uses to verify the
|
||||
# agent process is alive. Pre-U-2 the deploy/docker-compose.yml agent
|
||||
# HEALTHCHECK called `pgrep -f certctl-agent` against this image but
|
||||
# pgrep wasn't installed — the compose probe was a latent always-fail.
|
||||
# Adding procps here fixes both the new image-level HEALTHCHECK and the
|
||||
# pre-existing compose override. Adds ~250KB to the image; acceptable for
|
||||
# observability parity with the server image.
|
||||
RUN apk add --no-cache ca-certificates curl procps
|
||||
|
||||
RUN addgroup -g 1000 certctl && \
|
||||
adduser -D -u 1000 -G certctl certctl
|
||||
@@ -51,4 +58,19 @@ RUN mkdir -p /var/lib/certctl/keys && \
|
||||
|
||||
USER certctl
|
||||
|
||||
# Image-level HEALTHCHECK for bare `docker run` / Docker Swarm / Nomad / ECS.
|
||||
#
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch — adjacent fix): the agent
|
||||
# has no HTTP listener (it polls the server via outbound HTTPS), so a
|
||||
# process-presence check is the correct primitive. Pre-U-2 the agent image
|
||||
# shipped with no HEALTHCHECK at all, so bare-`docker run` operators got
|
||||
# zero health signal and orchestrators that key off Docker's HEALTHCHECK
|
||||
# (Swarm, Nomad, ECS) saw the container reported as `none`. The compose
|
||||
# override at deploy/docker-compose.yml:173 used the same `pgrep -f
|
||||
# certctl-agent` shape; we mirror it here so the published image has
|
||||
# parity with the compose stack and the override on docker-compose.yml
|
||||
# becomes redundant-but-correct rather than load-bearing.
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
||||
CMD pgrep -f certctl-agent > /dev/null || exit 1
|
||||
|
||||
ENTRYPOINT ["/app/agent"]
|
||||
|
||||
+20
-3
@@ -132,7 +132,14 @@ paths:
|
||||
properties:
|
||||
auth_type:
|
||||
type: string
|
||||
enum: [api-key, jwt, none]
|
||||
# G-1 (P1): "jwt" removed from this enum after the silent
|
||||
# auth downgrade was identified — no JWT middleware ships
|
||||
# with certctl. Operators who need JWT/OIDC front certctl
|
||||
# with an authenticating gateway (oauth2-proxy / Envoy /
|
||||
# Traefik / Pomerium) and set CERTCTL_AUTH_TYPE=none
|
||||
# upstream. See docs/architecture.md "Authenticating-
|
||||
# gateway pattern".
|
||||
enum: [api-key, none]
|
||||
required:
|
||||
type: boolean
|
||||
|
||||
@@ -3680,8 +3687,18 @@ components:
|
||||
registered_at:
|
||||
type: string
|
||||
format: date-time
|
||||
api_key_hash:
|
||||
type: string
|
||||
# G-2 (P1): the `api_key_hash` field was REMOVED from this
|
||||
# schema after cat-s5-apikey_leak audit closure. The DB column
|
||||
# still exists (migrations/000001_initial_schema.up.sql) and
|
||||
# the server still populates the in-memory struct for the
|
||||
# auth-lookup path (repository.AgentRepository::GetByAPIKey),
|
||||
# but the JSON wire shape no longer carries it — see
|
||||
# internal/domain/connector.go::Agent::APIKeyHash + MarshalJSON
|
||||
# for the redaction enforcement and docs/architecture.md ER
|
||||
# diagram for the database-vs-API distinction. Do NOT re-add
|
||||
# the field here without first removing the JSON-shape redaction
|
||||
# in the domain package; the CI guardrail at
|
||||
# .github/workflows/ci.yml will block re-introduction either way.
|
||||
os:
|
||||
type: string
|
||||
architecture:
|
||||
|
||||
+23
-3
@@ -39,6 +39,26 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Defense-in-depth runtime guard for the auth-type discriminator.
|
||||
//
|
||||
// G-1 (P1): config.Load() already runs Validate() which rejects "jwt"
|
||||
// and any value outside config.ValidAuthTypes() with a dedicated
|
||||
// diagnostic. This switch is belt-and-braces — if a future refactor
|
||||
// bypasses the validator (test harness, alt config loader, env-var
|
||||
// rebinding after Load) the server must not silently boot with an
|
||||
// unsupported auth shape. The error path uses fmt.Fprintf because
|
||||
// the slog logger is constructed from cfg below this point; we want
|
||||
// the failure to be visible regardless of log-level configuration.
|
||||
switch config.AuthType(cfg.Auth.Type) {
|
||||
case config.AuthTypeAPIKey, config.AuthTypeNone:
|
||||
// ok — fall through
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"unsupported auth type at runtime: %q (valid: %v) — config validation should have caught this; refusing to start\n",
|
||||
cfg.Auth.Type, config.ValidAuthTypes())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Set up structured logging
|
||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: cfg.GetLogLevel(),
|
||||
@@ -615,7 +635,7 @@ func main() {
|
||||
// compatibility CERTCTL_AUTH_SECRET is synthesized into legacy-key-N
|
||||
// entries with Admin=false.
|
||||
var namedKeys []middleware.NamedAPIKey
|
||||
if cfg.Auth.Type != "none" {
|
||||
if config.AuthType(cfg.Auth.Type) != config.AuthTypeNone {
|
||||
// Translate typed config.NamedAPIKey -> middleware.NamedAPIKey. The
|
||||
// two structs are field-compatible but live in different packages to
|
||||
// preserve the config→middleware dependency direction.
|
||||
@@ -704,8 +724,8 @@ func main() {
|
||||
logger.Info("rate limiting enabled", "rps", cfg.RateLimit.RPS, "burst", cfg.RateLimit.BurstSize)
|
||||
}
|
||||
|
||||
if cfg.Auth.Type == "none" {
|
||||
logger.Warn("authentication disabled (CERTCTL_AUTH_TYPE=none) — not suitable for production")
|
||||
if config.AuthType(cfg.Auth.Type) == config.AuthTypeNone {
|
||||
logger.Warn("authentication disabled (CERTCTL_AUTH_TYPE=none) — not suitable for production except behind an authenticating gateway (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium)")
|
||||
} else {
|
||||
logger.Info("authentication enabled", "type", cfg.Auth.Type)
|
||||
}
|
||||
|
||||
@@ -122,6 +122,8 @@ The `volumes` section mounts 10 migration files into PostgreSQL's init directory
|
||||
|
||||
**Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe.
|
||||
|
||||
**Stateful volume — first-boot password binding (U-1).** The same "first boot only" semantics that govern migration scripts also govern `POSTGRES_PASSWORD`. The official `postgres` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that pass is the only time `POSTGRES_PASSWORD` is written into `pg_authid`. On every subsequent boot, the postgres container ignores the env var and authenticates against whatever password was baked into the data directory on the original `up`. Editing `POSTGRES_PASSWORD` in `.env` after a successful first boot therefore only updates the **certctl-server** container's `CERTCTL_DATABASE_URL` — postgres still expects the previous password, and the server fails to ping with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). The certctl-server container surfaces this case explicitly: when SQLSTATE 28P01 fires at startup, the wrap text in `internal/repository/postgres/db.go::wrapPingError` points operators at the two remediation paths — destructive volume teardown via `docker compose -f deploy/docker-compose.yml down -v && up -d --build`, or non-destructive in-place rotation via `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` followed by a server restart with the matching `POSTGRES_PASSWORD`. Use the destructive path on the demo / first-time setup; use the non-destructive path on any environment that holds data you want to keep.
|
||||
|
||||
#### certctl Server
|
||||
|
||||
```yaml
|
||||
|
||||
@@ -246,8 +246,8 @@ helm install certctl certctl/ \
|
||||
|--------|---------|-------------|
|
||||
| `server.replicas` | 1 | Number of server replicas |
|
||||
| `server.port` | 8443 | Server port |
|
||||
| `server.auth.type` | api-key | Authentication type |
|
||||
| `server.auth.apiKey` | "" | API key (REQUIRED) |
|
||||
| `server.auth.type` | api-key | Authentication type — `api-key` or `none` (G-1: `jwt` removed; for JWT/OIDC use a fronting authenticating gateway, see `docs/architecture.md` and `docs/upgrade-to-v2-jwt-removal.md`) |
|
||||
| `server.auth.apiKey` | "" | API key (REQUIRED when `auth.type=api-key`) |
|
||||
| `server.logging.level` | info | Log level |
|
||||
| `server.logging.format` | json | Log format |
|
||||
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
# certctl Helm Chart
|
||||
|
||||
Production-ready Helm chart for deploying [certctl](https://github.com/shankar0123/certctl) on Kubernetes. Wires up the certctl server (Deployment), PostgreSQL (StatefulSet with PVC), and the agent (DaemonSet — one per node) on a private cluster, with health probes, security contexts, and optional Ingress.
|
||||
|
||||
## Quick install
|
||||
|
||||
```bash
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--create-namespace --namespace certctl \
|
||||
--set server.auth.apiKey="$(openssl rand -base64 32)" \
|
||||
--set postgresql.auth.password="$(openssl rand -base64 24)"
|
||||
```
|
||||
|
||||
This brings up:
|
||||
|
||||
- `<release>-server` Deployment (HTTPS-only on port 8443; TLS 1.3)
|
||||
- `<release>-postgres` StatefulSet (PostgreSQL 16-alpine, 1 replica, 10Gi PVC by default)
|
||||
- `<release>-agent` DaemonSet (polls server, generates ECDSA P-256 keys locally)
|
||||
- Service objects, optional Ingress, and ServiceAccount with RBAC
|
||||
|
||||
See [`values.yaml`](values.yaml) for the full configuration surface — issuer settings, target connectors, scheduler intervals, notifier credentials, and resource requests/limits all live there.
|
||||
|
||||
## Operational notes
|
||||
|
||||
### Postgres password rotation — read this before changing `postgresql.auth.password`
|
||||
|
||||
**The trap.** `postgresql.auth.password` is bound to `pg_authid` exactly once — when the StatefulSet's PVC is provisioned and `initdb` runs. The official `postgres:16-alpine` image only runs `initdb` when `/var/lib/postgresql/data` is empty, so on every subsequent rollout the `POSTGRES_PASSWORD` env var is read into the container but **ignored** by postgres itself. The certctl-server container also picks up the new value (via the database URL helper template), so the two halves diverge: server presents the new password, postgres still expects the old one.
|
||||
|
||||
**Symptom.** The certctl-server pod's startup log shows:
|
||||
|
||||
```
|
||||
failed to ping database: postgres rejected the configured credentials
|
||||
(SQLSTATE 28P01 — invalid_password). If you recently rotated POSTGRES_PASSWORD ...
|
||||
```
|
||||
|
||||
That diagnostic is emitted by `internal/repository/postgres/db.go::wrapPingError` — it points operators at the two remediation paths below.
|
||||
|
||||
**Remediation, non-destructive (preferred for any environment with real data):**
|
||||
|
||||
```bash
|
||||
# 1. Rotate the password in postgres directly
|
||||
kubectl -n certctl exec -it <release>-postgres-0 -- \
|
||||
psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new-password>';"
|
||||
|
||||
# 2. Update the secret / Helm values to the same value
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set postgresql.auth.password='<new-password>'
|
||||
|
||||
# 3. Bounce the certctl-server pod so it re-reads the secret
|
||||
kubectl -n certctl rollout restart deployment/<release>-server
|
||||
```
|
||||
|
||||
**Remediation, destructive (DESTROYS ALL CERTCTL DATA — only acceptable on dev/demo clusters):**
|
||||
|
||||
```bash
|
||||
helm uninstall <release> -n certctl
|
||||
kubectl -n certctl delete pvc -l \
|
||||
app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres
|
||||
helm install <release> deploy/helm/certctl/ \
|
||||
--namespace certctl \
|
||||
--set postgresql.auth.password='<new-password>'
|
||||
```
|
||||
|
||||
The PVC re-creates empty, `initdb` runs on first boot of the new postgres pod, and `pg_authid` is seeded with the new password.
|
||||
|
||||
**Why we don't fix this in the chart.** The env-vs-`pg_authid` divergence is intrinsic to how the upstream `postgres` image bootstraps — `initdb` is run-once-per-empty-data-dir, and there is no upstream-supported way to make subsequent boots re-seed `pg_authid` from `POSTGRES_PASSWORD`. The ergonomic answer is the runtime diagnostic plus this operational note.
|
||||
|
||||
**Cross-references.** Same root cause is documented for the docker-compose path in [`docs/quickstart.md`](../../../docs/quickstart.md) (Warning callout after the `cp .env.example .env` block) and in [`deploy/ENVIRONMENTS.md`](../../ENVIRONMENTS.md) (Stateful volume — first-boot password binding section). The runtime diagnostic itself lives in `internal/repository/postgres/db.go::wrapPingError` with regression coverage in `internal/repository/postgres/db_test.go`.
|
||||
|
||||
### Server API key rotation
|
||||
|
||||
Unlike the postgres password, `server.auth.apiKey` accepts a comma-separated list, so zero-downtime rotation is straightforward:
|
||||
|
||||
```bash
|
||||
# 1. Add the new key alongside the old
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.apiKey='new-key,old-key'
|
||||
|
||||
# 2. Roll your agents / clients over to the new key
|
||||
|
||||
# 3. Remove the old key
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.apiKey='new-key'
|
||||
```
|
||||
|
||||
### JWT / OIDC via authenticating gateway
|
||||
|
||||
certctl's in-process auth surface is intentionally narrow: `server.auth.type=api-key` for production deployments and `server.auth.type=none` for development. There is no in-process JWT, OIDC, mTLS, or SAML middleware. (`server.auth.type=jwt` was accepted pre-G-1 but silently routed every request through the api-key bearer middleware — silent auth downgrade. The chart now fails at `helm install`/`helm upgrade` template time via the `certctl.validateAuthType` helper if you set it. See [`../../../docs/upgrade-to-v2-jwt-removal.md`](../../../docs/upgrade-to-v2-jwt-removal.md) if you previously had this in your values.)
|
||||
|
||||
For deployments that need JWT/OIDC, the canonical Kubernetes-flavored shape is to put oauth2-proxy in front of the certctl Service, attach an authenticating Ingress middleware, and run certctl with `server.auth.type=none`:
|
||||
|
||||
```bash
|
||||
# 1. Install oauth2-proxy (or any OIDC-terminating sidecar) in the same namespace
|
||||
helm install oauth2-proxy oauth2-proxy/oauth2-proxy \
|
||||
--namespace certctl \
|
||||
--set config.clientID="$OIDC_CLIENT_ID" \
|
||||
--set config.clientSecret="$OIDC_CLIENT_SECRET" \
|
||||
--set config.cookieSecret="$(openssl rand -base64 32)" \
|
||||
--set config.configFile='|
|
||||
provider = "oidc"
|
||||
oidc_issuer_url = "https://your-issuer/"
|
||||
upstreams = ["http://<release>-server.certctl.svc.cluster.local:8443"]
|
||||
pass_authorization_header = true
|
||||
set_authorization_header = true
|
||||
email_domains = ["*"]
|
||||
'
|
||||
|
||||
# 2. Install certctl with type=none (gateway terminates auth)
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--namespace certctl \
|
||||
--set server.auth.type=none \
|
||||
--set postgresql.auth.password="$(openssl rand -base64 24)"
|
||||
|
||||
# 3. Attach an Ingress that routes through oauth2-proxy
|
||||
# (Traefik ForwardAuth, nginx auth_request, Envoy ext_authz, etc.)
|
||||
```
|
||||
|
||||
Same root pattern works with Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, or any service-mesh `ext_authz`. See [`../../../docs/architecture.md`](../../../docs/architecture.md) "Authenticating-gateway pattern" for the full design rationale and [`../../../docs/upgrade-to-v2-jwt-removal.md`](../../../docs/upgrade-to-v2-jwt-removal.md) for the migration walkthrough.
|
||||
|
||||
### TLS certificate sourcing
|
||||
|
||||
By default the chart provisions a self-signed cert via the same init-container pattern as the docker-compose deploy. For production, supply an operator-managed Secret (cert-manager, internal CA, etc.) — see [`docs/tls.md`](../../../docs/tls.md) for the full provisioning matrix and [`docs/upgrade-to-tls.md`](../../../docs/upgrade-to-tls.md) for upgrade-from-HTTP procedures.
|
||||
|
||||
## Disabling embedded postgres
|
||||
|
||||
If you have an existing PostgreSQL cluster, disable the embedded one and point at it directly:
|
||||
|
||||
```bash
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--set postgresql.enabled=false \
|
||||
--set server.databaseUrl='postgres://certctl:<pw>@my-pg-host:5432/certctl?sslmode=require'
|
||||
```
|
||||
|
||||
The volume-trap section above does **not** apply to this configuration — your postgres operator (or cloud DB) handles password rotation, and you control `pg_authid` directly.
|
||||
|
||||
## Uninstall
|
||||
|
||||
```bash
|
||||
helm uninstall <release> -n certctl
|
||||
# Optional — also delete the postgres PVC (DESTROYS DATA):
|
||||
kubectl -n certctl delete pvc -l \
|
||||
app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres
|
||||
```
|
||||
|
||||
By default `helm uninstall` retains the StatefulSet's PVCs, so reinstalling with the same release name preserves the database. If you've changed `postgresql.auth.password` in your values between uninstall and reinstall, you'll hit the trap on the reinstall — apply the non-destructive remediation above, or also delete the PVC.
|
||||
@@ -169,3 +169,26 @@ per affected resource. No-op when configured correctly.
|
||||
{{- fail "\n\nserver.tls.certManager.enabled=true but server.tls.certManager.issuerRef.name is empty.\n\nSet:\n --set server.tls.certManager.issuerRef.name=<your-issuer-or-clusterissuer>\n\nSee docs/tls.md.\n" -}}
|
||||
{{- end -}}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Auth-type validation gate.
|
||||
|
||||
G-1 (P1): pre-G-1 the chart accepted server.auth.type=jwt and the
|
||||
certctl-server container silently routed every request through the
|
||||
api-key bearer middleware (no JWT impl ships with certctl). Post-G-1
|
||||
the chart fails at template-time with a pointer at the authenticating-
|
||||
gateway pattern. The valid set must stay in sync with
|
||||
internal/config.ValidAuthTypes() in the Go binary; if you add a value
|
||||
there you must add it here too (and update the property test in
|
||||
internal/config/config_test.go that pins both surfaces).
|
||||
|
||||
Any template that consumes .Values.server.auth.type should call
|
||||
`{{ include "certctl.validateAuthType" . }}` at the top so this guard
|
||||
runs once per affected resource. No-op when configured correctly.
|
||||
*/}}
|
||||
{{- define "certctl.validateAuthType" -}}
|
||||
{{- $valid := list "api-key" "none" -}}
|
||||
{{- if not (has .Values.server.auth.type $valid) -}}
|
||||
{{- fail (printf "\n\nserver.auth.type=%q is not supported (valid: %v).\n\nFor JWT/OIDC, run an authenticating gateway in front of certctl\n(oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium) and\nset server.auth.type=none here so the gateway terminates federated\nidentity. See docs/architecture.md \"Authenticating-gateway pattern\"\nand docs/upgrade-to-v2-jwt-removal.md for the migration walkthrough.\n\nG-1 audit closure: pre-G-1 the chart accepted type=jwt and the binary\nsilently downgraded to api-key middleware. The chart now fails at\ntemplate time so misconfigured deployments cannot ship.\n" .Values.server.auth.type $valid) -}}
|
||||
{{- end -}}
|
||||
{{- end }}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
{{- include "certctl.tls.required" . }}
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
|
||||
@@ -48,7 +48,14 @@ server:
|
||||
drop:
|
||||
- ALL
|
||||
|
||||
# Liveness and readiness probes (HTTPS-only as of v2.2)
|
||||
# Liveness and readiness probes (HTTPS-only as of v2.2).
|
||||
#
|
||||
# The two paths exposed for probes are `/health` and `/ready` —
|
||||
# registered in internal/api/router/router.go:76-85 and bypassing the
|
||||
# auth middleware via the no-auth list at cmd/server/main.go:920.
|
||||
# Both serve the same JSON shape today (`{"status":"healthy"}` /
|
||||
# `{"status":"ready"}`) but exist as separate routes so liveness and
|
||||
# readiness can diverge in the future without renaming.
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
@@ -59,9 +66,18 @@ server:
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch — adjacent fix): pre-U-2
|
||||
# the readiness probe pointed at `/readyz`, the conventional kube-flavor
|
||||
# name. The certctl server doesn't register `/readyz` (only `/health`
|
||||
# and `/ready`) — see cmd/server/main.go:920 and
|
||||
# internal/api/router/router.go:81. K8s readiness probes therefore
|
||||
# received a 404 (or, with auth enabled, a 401 from the api-key middleware
|
||||
# because `/readyz` was NOT in the no-auth bypass set), pods stayed
|
||||
# `NotReady` indefinitely, and Helm rollouts stalled. Post-U-2 the path
|
||||
# matches a registered route.
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
path: /ready
|
||||
port: https
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 5
|
||||
@@ -112,10 +128,23 @@ server:
|
||||
port: 8443
|
||||
annotations: {}
|
||||
|
||||
# Authentication configuration
|
||||
# Authentication configuration.
|
||||
# Valid types: "api-key" (production) or "none" (demo only — disables
|
||||
# authentication on the API and logs a loud Warn at server startup).
|
||||
# For JWT/OIDC, run an authenticating gateway in front of certctl
|
||||
# (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium)
|
||||
# and set type=none here so the gateway terminates federated identity.
|
||||
# See docs/architecture.md "Authenticating-gateway pattern".
|
||||
#
|
||||
# G-1 (P1): pre-G-1 the chart accepted server.auth.type=jwt and the
|
||||
# certctl-server container silently routed every request through the
|
||||
# api-key bearer middleware — silent auth downgrade. Post-G-1 the
|
||||
# chart's `certctl.validateAuthType` template helper rejects any value
|
||||
# outside {api-key, none} at template time. See
|
||||
# docs/upgrade-to-v2-jwt-removal.md if you previously set type=jwt.
|
||||
auth:
|
||||
type: api-key # Options: api-key, none (for demo only)
|
||||
apiKey: "" # REQUIRED in production - set via --set or values override
|
||||
type: api-key
|
||||
apiKey: "" # REQUIRED when type=api-key (set via --set or values override).
|
||||
|
||||
# Logging configuration
|
||||
logging:
|
||||
@@ -260,7 +289,30 @@ postgresql:
|
||||
auth:
|
||||
database: certctl
|
||||
username: certctl
|
||||
password: "" # REQUIRED - set via --set or values override
|
||||
# REQUIRED — set via `--set postgresql.auth.password=<value>` or values override.
|
||||
#
|
||||
# WARNING (U-1): rotating this value after first deploy does NOT change the
|
||||
# database password. The `postgres:16-alpine` image runs `initdb` only when
|
||||
# /var/lib/postgresql/data is empty, so POSTGRES_PASSWORD is written into
|
||||
# pg_authid exactly once — on the first boot of the StatefulSet's PVC.
|
||||
# Subsequent rollouts pick up the new env value in the postgres container
|
||||
# but the certctl-server container's CERTCTL_DATABASE_URL also picks up
|
||||
# the new value, while pg_authid still expects the old one — leading to
|
||||
# `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01).
|
||||
#
|
||||
# The certctl-server emits guidance via internal/repository/postgres/db.go::
|
||||
# wrapPingError when it sees SQLSTATE 28P01 at startup. To resolve in a
|
||||
# Helm deployment:
|
||||
# - Non-destructive (preferred for environments with data):
|
||||
# kubectl exec -it <release>-postgres-0 -- \
|
||||
# psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"
|
||||
# then update the secret/values to match and let the certctl-server
|
||||
# pod restart against the matching credential.
|
||||
# - Destructive (DESTROYS DATA — only acceptable on dev/demo PVCs):
|
||||
# helm uninstall <release> && \
|
||||
# kubectl delete pvc -l app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres && \
|
||||
# helm install <release> ... # PVC re-creates empty, initdb seeds new password
|
||||
password: ""
|
||||
|
||||
# Storage configuration
|
||||
storage:
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
//go:build integration
|
||||
|
||||
// Package integration_test — image-level HEALTHCHECK contract.
|
||||
//
|
||||
// U-2 (P1, cat-u-healthcheck_protocol_mismatch): pre-U-2 the published
|
||||
// server image's Dockerfile HEALTHCHECK called `curl -f http://localhost:
|
||||
// 8443/health` against an HTTPS-only listener (HTTPS-Everywhere milestone,
|
||||
// v2.2 / tag v2.0.47). Operators outside docker-compose / Helm saw the
|
||||
// container reported as `unhealthy` indefinitely. The compose stack
|
||||
// overrode this HEALTHCHECK with `--cacert + https://`; the Helm chart
|
||||
// uses explicit `httpGet` probes that ignore Docker's HEALTHCHECK; the 5
|
||||
// example compose files all override with `curl -sfk https://localhost:
|
||||
// 8443/health`. So the observable failure was scoped to bare `docker run`
|
||||
// / Docker Swarm / Nomad / ECS users — exactly the "I just pulled the
|
||||
// published image" path.
|
||||
//
|
||||
// This file's tests pin the contract at the binary-image level. The
|
||||
// matching CI grep guardrail in .github/workflows/ci.yml catches the
|
||||
// regression at the Dockerfile-source level; both layers are needed
|
||||
// because someone could replace the HEALTHCHECK line with a sibling
|
||||
// broken pattern that the grep doesn't catch (e.g., a TCP-only check
|
||||
// against the HTTPS port).
|
||||
//
|
||||
// Run alongside the rest of the integration suite:
|
||||
//
|
||||
// cd deploy/test && go test -tags integration -v -run Healthcheck
|
||||
//
|
||||
// The tests skip cleanly with t.Skip when docker is not available
|
||||
// (CI without docker-in-docker, sandbox environments, etc.) so they
|
||||
// don't block local development on machines without docker.
|
||||
package integration_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dockerAvailable returns true when `docker version` returns 0.
|
||||
// We cache it across tests in this file so the skip message prints once.
|
||||
func dockerAvailable(t *testing.T) bool {
|
||||
t.Helper()
|
||||
cmd := exec.Command("docker", "version", "--format", "{{.Server.Version}}")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
t.Logf("docker not available: %v\noutput: %s", err, string(out))
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// dockerCmd runs `docker <args...>` with a 60s budget, returning stdout
|
||||
// + stderr combined and the exit error if any. Used for short-lived
|
||||
// probes (inspect, build, run -d).
|
||||
func dockerCmd(t *testing.T, timeout time.Duration, args ...string) (string, error) {
|
||||
t.Helper()
|
||||
cmd := exec.Command("docker", args...)
|
||||
done := make(chan struct{})
|
||||
var out []byte
|
||||
var err error
|
||||
go func() {
|
||||
out, err = cmd.CombinedOutput()
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
return string(out), err
|
||||
case <-time.After(timeout):
|
||||
_ = cmd.Process.Kill()
|
||||
t.Fatalf("docker %v timed out after %v", args, timeout)
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedServerImage_HealthcheckSpecUsesHTTPS performs the Dockerfile-
|
||||
// source-level shipped-shape pin: the inspected image's Healthcheck.Test
|
||||
// array MUST contain "https://localhost:8443/health" (and MUST NOT
|
||||
// contain "http://localhost:8443/health"). This is the lightweight half
|
||||
// of the contract — it doesn't require running the container, only
|
||||
// building it. It catches the audit-flagged bug directly.
|
||||
func TestPublishedServerImage_HealthcheckSpecUsesHTTPS(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping image-level HEALTHCHECK test")
|
||||
}
|
||||
|
||||
const imgTag = "certctl-u2-healthcheck-spec-test"
|
||||
t.Cleanup(func() {
|
||||
_, _ = dockerCmd(t, 30*time.Second, "rmi", "-f", imgTag)
|
||||
})
|
||||
|
||||
// Build the server image. Use the repo root as context (this test
|
||||
// file lives at deploy/test/, the Dockerfile at the repo root).
|
||||
buildOut, err := dockerCmd(t, 5*time.Minute,
|
||||
"build", "-f", "../../Dockerfile", "-t", imgTag, "../..")
|
||||
if err != nil {
|
||||
t.Fatalf("docker build failed: %v\noutput:\n%s", err, buildOut)
|
||||
}
|
||||
|
||||
// Inspect the shipped HEALTHCHECK metadata.
|
||||
inspectOut, err := dockerCmd(t, 30*time.Second,
|
||||
"inspect", "--format", "{{json .Config.Healthcheck}}", imgTag)
|
||||
if err != nil {
|
||||
t.Fatalf("docker inspect failed: %v\noutput:\n%s", err, inspectOut)
|
||||
}
|
||||
|
||||
var hc struct {
|
||||
Test []string
|
||||
Interval int64
|
||||
Timeout int64
|
||||
}
|
||||
if err := json.Unmarshal([]byte(strings.TrimSpace(inspectOut)), &hc); err != nil {
|
||||
t.Fatalf("could not parse Healthcheck JSON %q: %v", inspectOut, err)
|
||||
}
|
||||
|
||||
joined := strings.Join(hc.Test, " ")
|
||||
|
||||
// Positive contract.
|
||||
if !strings.Contains(joined, "https://localhost:8443/health") {
|
||||
t.Errorf("Healthcheck.Test does not target https://localhost:8443/health\nfull: %v", hc.Test)
|
||||
}
|
||||
|
||||
// Negative contract — pre-U-2 regression shape MUST be absent.
|
||||
if strings.Contains(joined, "http://localhost:8443/health") {
|
||||
t.Errorf("Healthcheck.Test still contains the pre-U-2 plaintext shape: %v", hc.Test)
|
||||
}
|
||||
|
||||
// `-k` (or `--insecure`) must be present because the bootstrap cert
|
||||
// is per-deploy and the published image can't pin a CA bundle —
|
||||
// see the U-2 closure docblock on Dockerfile and the audit doc.
|
||||
if !strings.Contains(joined, "-k") && !strings.Contains(joined, "--insecure") {
|
||||
t.Errorf("Healthcheck.Test omits -k / --insecure flag (required for self-signed bootstrap probe): %v", hc.Test)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedAgentImage_HealthcheckSpecExists pins the U-2 adjacent
|
||||
// fix that added a HEALTHCHECK to the agent image. Pre-U-2 the agent
|
||||
// image had no HEALTHCHECK declaration, so bare-`docker run` agents got
|
||||
// `none` health status from Docker. Post-U-2 the agent uses pgrep to
|
||||
// verify the process is alive (mirroring the docker-compose pattern at
|
||||
// deploy/docker-compose.yml:173, which also became reliable post-U-2
|
||||
// because procps is now installed in the runtime image).
|
||||
func TestPublishedAgentImage_HealthcheckSpecExists(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping image-level HEALTHCHECK test")
|
||||
}
|
||||
|
||||
const imgTag = "certctl-u2-agent-healthcheck-spec-test"
|
||||
t.Cleanup(func() {
|
||||
_, _ = dockerCmd(t, 30*time.Second, "rmi", "-f", imgTag)
|
||||
})
|
||||
|
||||
buildOut, err := dockerCmd(t, 5*time.Minute,
|
||||
"build", "-f", "../../Dockerfile.agent", "-t", imgTag, "../..")
|
||||
if err != nil {
|
||||
t.Fatalf("docker build failed: %v\noutput:\n%s", err, buildOut)
|
||||
}
|
||||
|
||||
inspectOut, err := dockerCmd(t, 30*time.Second,
|
||||
"inspect", "--format", "{{json .Config.Healthcheck}}", imgTag)
|
||||
if err != nil {
|
||||
t.Fatalf("docker inspect failed: %v\noutput:\n%s", err, inspectOut)
|
||||
}
|
||||
|
||||
trimmed := strings.TrimSpace(inspectOut)
|
||||
if trimmed == "null" || trimmed == "" {
|
||||
t.Fatalf("agent image has no HEALTHCHECK (got %q) — U-2 adjacent fix regressed", inspectOut)
|
||||
}
|
||||
|
||||
var hc struct {
|
||||
Test []string
|
||||
}
|
||||
if err := json.Unmarshal([]byte(trimmed), &hc); err != nil {
|
||||
t.Fatalf("could not parse Healthcheck JSON %q: %v", inspectOut, err)
|
||||
}
|
||||
|
||||
joined := strings.Join(hc.Test, " ")
|
||||
if !strings.Contains(joined, "pgrep") {
|
||||
t.Errorf("agent Healthcheck.Test does not use pgrep (lost the process-presence shape): %v", hc.Test)
|
||||
}
|
||||
if !strings.Contains(joined, "certctl-agent") {
|
||||
t.Errorf("agent Healthcheck.Test does not target the certctl-agent process name: %v", hc.Test)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedServerImage_HealthcheckTransitionsToHealthy is the
|
||||
// runtime-level contract: the built image, when started, must transition
|
||||
// to `healthy` within the start-period + 30s observability budget. This
|
||||
// is the heavy test — it requires the server to actually start, which
|
||||
// in turn requires either a reachable database OR a startup that fails
|
||||
// gracefully enough to keep the HEALTHCHECK probe target alive.
|
||||
//
|
||||
// The container is started with CERTCTL_DATABASE_URL pointing at an
|
||||
// unreachable host so the server fails its postgres bring-up — but
|
||||
// importantly, fails AFTER the TLS listener has come up, because the
|
||||
// HEALTHCHECK probe target is the TLS listener. We don't actually need
|
||||
// the database to validate the HEALTHCHECK shape.
|
||||
//
|
||||
// IMPORTANT: this test is the runtime contract. If you're working on the
|
||||
// server's startup ordering and the listener now comes up AFTER the
|
||||
// database, this test must adapt — start a sidecar postgres via
|
||||
// testcontainers-go (see internal/integration/lifecycle_test.go for the
|
||||
// pattern) and connect the certctl-server container to it.
|
||||
func TestPublishedServerImage_HealthcheckTransitionsToHealthy(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping runtime HEALTHCHECK test")
|
||||
}
|
||||
if testing.Short() {
|
||||
t.Skip("runtime HEALTHCHECK test takes ~45s; skipping under -short")
|
||||
}
|
||||
t.Skip("runtime probe contract not yet wired to a sidecar postgres; " +
|
||||
"image-spec contract above (TestPublishedServerImage_HealthcheckSpecUsesHTTPS) " +
|
||||
"covers the audit-flagged regression. Re-enable once the integration " +
|
||||
"harness provisions postgres for image-level smoke.")
|
||||
}
|
||||
+12
-1
@@ -348,6 +348,11 @@ erDiagram
|
||||
}
|
||||
```
|
||||
|
||||
The ER diagram above documents **database shape**, not REST-API wire shape. Several columns are intentionally server-internal and never serialized to clients:
|
||||
|
||||
- `agents.api_key_hash` — SHA-256 of the agent's plaintext API key, populated by `service.RegisterAgent` (`hashAPIKey(apiKey)` at `internal/service/agent.go`) and consumed by `repository.AgentRepository::GetByAPIKey` for the auth-lookup. **Not** exposed via the REST API, **not** echoed via CLI / MCP / agent registration response, **never** logged. Enforced by `internal/domain/connector.go::Agent.MarshalJSON` (G-2 audit closure, `cat-s5-apikey_leak`); the OpenAPI Agent schema explicitly excludes the field, the frontend `Agent` interface omits it, and a CI grep guardrail at `.github/workflows/ci.yml` blocks reintroduction.
|
||||
- `issuers.config` / `deployment_targets.config` — plaintext jsonb shadow of the AES-GCM-encrypted on-disk blob; the encrypted form lives on `EncryptedConfig []byte` (Go-only field tagged `json:"-"`).
|
||||
|
||||
Migrations are idempotent (`IF NOT EXISTS` on all CREATE statements, `ON CONFLICT (id) DO NOTHING` on all seed data) so they're safe to run multiple times — important for Docker Compose where both initdb and the server may run the same SQL.
|
||||
|
||||
## Data Flow: Certificate Lifecycle
|
||||
@@ -891,9 +896,15 @@ The HTTP middleware stack processes requests in the following order (see `cmd/se
|
||||
4. **BodyLimit** - request body size cap via `http.MaxBytesReader`
|
||||
5. **RateLimiter** - token bucket rate limiting (optional, when enabled)
|
||||
6. **CORS** - cross-origin request handling (deny-by-default)
|
||||
7. **Auth** - API key or JWT validation
|
||||
7. **Auth** - API key validation (or none in development; JWT/OIDC via authenticating gateway, see below — not in-process)
|
||||
8. **AuditLog** - records every API call to the audit trail (requires auth context for actor)
|
||||
|
||||
### Authenticating-gateway pattern (JWT, OIDC, mTLS)
|
||||
|
||||
certctl's in-process authentication surface is intentionally narrow: `api-key` for production deployments and `none` for development. There is no in-process JWT, OIDC, mTLS, or SAML middleware. (`CERTCTL_AUTH_TYPE=jwt` was accepted pre-G-1 but silently routed through the api-key bearer middleware — a security finding masquerading as a config option, removed at the v2.x boundary; see [`upgrade-to-v2-jwt-removal.md`](upgrade-to-v2-jwt-removal.md) if you previously set it.)
|
||||
|
||||
For deployments that need JWT/OIDC/mTLS, the standard pattern is to put an authenticating gateway in front of certctl and configure `CERTCTL_AUTH_TYPE=none` on the upstream certctl process. The gateway terminates the federated identity protocol, validates tokens / certificates / SAML assertions, and proxies the authenticated request to certctl as a same-origin call on a private network. This separation gives operators the full breadth of the modern identity ecosystem (oauth2-proxy, Envoy `ext_authz`, Traefik `ForwardAuth`, Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, nginx `auth_request`) without certctl itself having to track signing-key rotation, claim mapping, audience validation, and the rest of the JWT/OIDC surface area. Operators wanting per-request actor attribution past the gateway boundary forward the gateway-resolved identity (e.g., `X-Auth-Request-User` from oauth2-proxy) and run a small authorization layer at the gateway that enforces the bearer-key contract certctl actually uses.
|
||||
|
||||
### Concurrency Safety
|
||||
|
||||
The background scheduler uses `sync/atomic.Bool` idempotency guards on every loop (8 always-on plus up to 4 optional) — if a tick fires while the previous iteration is still running, it skips. A `sync.WaitGroup` tracks all in-flight goroutines. `WaitForCompletion(timeout)` blocks during shutdown until all work finishes or the timeout expires, preventing state corruption from mid-flight database operations during process exit.
|
||||
|
||||
+32
-15
@@ -1141,13 +1141,30 @@ API Endpoints:
|
||||
- **`GET /api/v1/digest/preview`** — Render digest HTML for preview (no email sent)
|
||||
- **`POST /api/v1/digest/send`** — Trigger digest send immediately (outside of schedule)
|
||||
|
||||
> **Note (HTTPS-only as of v2.2):** The `curl` examples in this section
|
||||
> and below all target the HTTPS-only control plane. Extract the
|
||||
> docker-compose self-signed bootstrap CA bundle once and reuse it on
|
||||
> every call:
|
||||
>
|
||||
> ```bash
|
||||
> export CA=/tmp/certctl-ca.crt
|
||||
> docker compose -f deploy/docker-compose.yml exec -T certctl-server \
|
||||
> cat /etc/certctl/tls/ca.crt > "$CA"
|
||||
> ```
|
||||
>
|
||||
> Then pass `--cacert "$CA"` (or `-k` for one-off smoke tests, never in
|
||||
> production). The same pattern is documented in
|
||||
> [`quickstart.md`](quickstart.md). Pre-U-2 these examples used `http://`
|
||||
> and silently failed against the HTTPS listener; post-U-2 they speak
|
||||
> HTTPS with the operator-managed CA bundle.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
# Preview digest
|
||||
curl http://localhost:8443/api/v1/digest/preview | jq '.html'
|
||||
curl --cacert "$CA" https://localhost:8443/api/v1/digest/preview | jq '.html'
|
||||
|
||||
# Send digest immediately
|
||||
curl -X POST http://localhost:8443/api/v1/digest/send
|
||||
curl --cacert "$CA" -X POST https://localhost:8443/api/v1/digest/send
|
||||
```
|
||||
|
||||
Each notifier is enabled by its configuration env var:
|
||||
@@ -1294,24 +1311,24 @@ The agent scans these directories on startup and every 6 hours, looking for cert
|
||||
|
||||
```bash
|
||||
# List discovered certificates (filter by agent, status)
|
||||
curl -s "http://localhost:8443/api/v1/discovered-certificates?agent_id=agent-nginx-01&status=new" | jq .
|
||||
curl --cacert "$CA" -s "https://localhost:8443/api/v1/discovered-certificates?agent_id=agent-nginx-01&status=new" | jq .
|
||||
|
||||
# Get discovery detail
|
||||
curl -s http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID | jq .
|
||||
|
||||
# Claim a discovered cert (link to managed certificate)
|
||||
curl -s -X POST http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/claim \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/claim \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"managed_certificate_id": "mc-api-prod"}' | jq .
|
||||
|
||||
# Dismiss a discovery
|
||||
curl -s -X POST http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/dismiss | jq .
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/dismiss | jq .
|
||||
|
||||
# View discovery scan history
|
||||
curl -s http://localhost:8443/api/v1/discovery-scans | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovery-scans | jq .
|
||||
|
||||
# Summary counts (new, claimed, dismissed)
|
||||
curl -s http://localhost:8443/api/v1/discovery-summary | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovery-summary | jq .
|
||||
```
|
||||
|
||||
### Use Cases
|
||||
@@ -1340,7 +1357,7 @@ Network scan targets can be managed from the **Network Scans** dashboard page (c
|
||||
|
||||
```bash
|
||||
# Create a scan target for your internal network (or use the dashboard's "+ New Target" button)
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "Production Web Servers",
|
||||
@@ -1365,26 +1382,26 @@ curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
|
||||
```bash
|
||||
# List all scan targets
|
||||
curl -s http://localhost:8443/api/v1/network-scan-targets | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/network-scan-targets | jq .
|
||||
|
||||
# Create a scan target
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "DMZ", "cidrs": ["172.16.0.0/24"], "ports": [443]}' | jq .
|
||||
|
||||
# Get a specific target (includes last_scan_at, last_scan_certs_found)
|
||||
curl -s http://localhost:8443/api/v1/network-scan-targets/nst-dmz | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/network-scan-targets/nst-dmz | jq .
|
||||
|
||||
# Trigger an immediate scan (doesn't wait for scheduler)
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets/nst-dmz/scan | jq .
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets/nst-dmz/scan | jq .
|
||||
|
||||
# Update scan configuration
|
||||
curl -s -X PUT http://localhost:8443/api/v1/network-scan-targets/nst-dmz \
|
||||
curl --cacert "$CA" -s -X PUT https://localhost:8443/api/v1/network-scan-targets/nst-dmz \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"ports": [443, 8443, 9443], "timeout_ms": 3000}' | jq .
|
||||
|
||||
# Delete a scan target
|
||||
curl -s -X DELETE http://localhost:8443/api/v1/network-scan-targets/nst-dmz
|
||||
curl --cacert "$CA" -s -X DELETE https://localhost:8443/api/v1/network-scan-targets/nst-dmz
|
||||
```
|
||||
|
||||
### Scheduler Integration
|
||||
|
||||
@@ -60,6 +60,8 @@ cp deploy/.env.example deploy/.env
|
||||
docker compose -f deploy/docker-compose.yml up -d --build
|
||||
```
|
||||
|
||||
> **Warning:** Edit `POSTGRES_PASSWORD` *before* the very first `docker compose up`. Postgres seeds the password into its data directory only on first boot of an empty volume — after that, the password is baked into `pg_authid` and the env var is ignored. If you boot once with the default and later change `POSTGRES_PASSWORD` in `.env`, the certctl-server container picks up the new value but postgres still authenticates against the old one, and the server logs `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). Two ways out: tear down the volume with `docker compose -f deploy/docker-compose.yml down -v` (this **deletes all data**) and bring up fresh, or rotate non-destructively with `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` and then restart certctl-server with the matching `POSTGRES_PASSWORD`.
|
||||
|
||||
### Docker Compose Environments
|
||||
|
||||
The `deploy/` directory contains four compose files for different use cases:
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
# Upgrading past G-1 — `CERTCTL_AUTH_TYPE=jwt` removal
|
||||
|
||||
If your certctl deployment currently sets `CERTCTL_AUTH_TYPE=jwt` (or `server.auth.type=jwt` in Helm), the next certctl upgrade will fail-fast at startup with a dedicated diagnostic. This guide explains why, what to switch to, and how to keep JWT/OIDC at your edge.
|
||||
|
||||
For everyone else — operators running `api-key` or `none` — this upgrade is a no-op. Skip to [`upgrade-to-tls.md`](upgrade-to-tls.md) for the v2.2 HTTPS-everywhere migration if you haven't done that one yet.
|
||||
|
||||
## Why we removed it
|
||||
|
||||
Pre-G-1, the config validator at `internal/config/config.go` accepted three values for `CERTCTL_AUTH_TYPE`: `api-key`, `jwt`, and `none`. The startup log line at `cmd/server/main.go` faithfully echoed `"authentication enabled" "type"="jwt"` when an operator picked `jwt`. Reasonable people read that and concluded JWT auth was on.
|
||||
|
||||
It wasn't. Grep `internal/ cmd/` for `NewJWT`, `JWTMiddleware`, or `jwt.Parse` — pre-G-1, there were zero matches in production code. The auth-middleware wiring at `cmd/server/main.go:653` unconditionally called `middleware.NewAuthWithNamedKeys(namedKeys)` regardless of `cfg.Auth.Type`. So `CERTCTL_AUTH_TYPE=jwt` just routed every request through the api-key bearer middleware, comparing the incoming `Authorization: Bearer <something>` against whatever string the operator put in `CERTCTL_AUTH_SECRET`. Real JWT clients got 401 (the api-key middleware saw the JWT string as a literal token and compared bytes). Operators who treated `CERTCTL_AUTH_SECRET` as a JWT signing secret (and therefore handled it less carefully than an api-key) handed an attacker an api-key. Silent auth downgrade — a security finding masquerading as a config option.
|
||||
|
||||
We chose to remove the option rather than implement JWT middleware. Implementing real JWT/OIDC requires jwks vs static-secret rotation, claim mapping (which claim is the actor / the admin flag?), expiry enforcement, audience and issuer validation, key rollover semantics, and regression coverage at the same depth as the existing api-key path. That's a feature, not a fix. The audit-recommended structural fix — and the one that actually closes the hazard — is to fail loudly instead of silently downgrading.
|
||||
|
||||
## What changes at startup
|
||||
|
||||
Post-G-1, a binary started with `CERTCTL_AUTH_TYPE=jwt` exits non-zero before opening the listener:
|
||||
|
||||
```
|
||||
Failed to load configuration: CERTCTL_AUTH_TYPE=jwt is no longer accepted
|
||||
(G-1 silent auth downgrade): no JWT middleware ships with certctl. To use
|
||||
JWT/OIDC, run an authenticating gateway (oauth2-proxy / Envoy ext_authz /
|
||||
Traefik ForwardAuth / Pomerium) in front of certctl and set
|
||||
CERTCTL_AUTH_TYPE=none on the upstream. See docs/architecture.md
|
||||
"Authenticating-gateway pattern" and docs/upgrade-to-v2-jwt-removal.md
|
||||
for the migration walkthrough
|
||||
```
|
||||
|
||||
Helm operators get the same shape at `helm install` / `helm upgrade` template time: `server.auth.type=jwt` is rejected by the chart's `certctl.validateAuthType` template helper before any Kubernetes object is rendered.
|
||||
|
||||
The CI-side regression guard at `.github/workflows/ci.yml` blocks any future PR that re-introduces `"jwt"` as an auth-type literal in production code or spec.
|
||||
|
||||
## Recovery — pick one
|
||||
|
||||
### Option A — switch to `api-key` (you weren't actually using JWT)
|
||||
|
||||
If your `CERTCTL_AUTH_SECRET` was a single high-entropy token and your clients sent it as `Authorization: Bearer <token>`, you were already using api-key auth — you just had `CERTCTL_AUTH_TYPE` set to the wrong string. Flip it:
|
||||
|
||||
```
|
||||
# .env (docker-compose)
|
||||
CERTCTL_AUTH_TYPE=api-key
|
||||
CERTCTL_AUTH_SECRET=<your-existing-token>
|
||||
```
|
||||
|
||||
```
|
||||
# Helm
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.type=api-key \
|
||||
--set server.auth.apiKey=<your-existing-token>
|
||||
```
|
||||
|
||||
No client changes needed — the same Bearer token continues to work. The startup log will now read `"authentication enabled" "type"="api-key"`, which matches what was actually happening pre-G-1.
|
||||
|
||||
### Option B — front certctl with an authenticating gateway
|
||||
|
||||
If you genuinely need JWT, OIDC, mTLS, or SAML, run an authenticating gateway in front of certctl and let the gateway terminate the federated identity protocol. Configure certctl for `CERTCTL_AUTH_TYPE=none`:
|
||||
|
||||
```
|
||||
CERTCTL_AUTH_TYPE=none
|
||||
```
|
||||
|
||||
Then put an oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia (etc.) in the network path between operators and certctl. The gateway validates the identity and proxies the authenticated request to certctl as a same-origin call on a private network.
|
||||
|
||||
### Concrete walkthrough — oauth2-proxy + certctl on docker-compose
|
||||
|
||||
This is the simplest production-grade JWT/OIDC shape. It assumes you have an OIDC provider (Okta, Auth0, Google Workspace, Keycloak, Dex) and a registered client_id / client_secret.
|
||||
|
||||
```yaml
|
||||
# deploy/docker-compose.gateway.yml — overlay on the base compose file
|
||||
services:
|
||||
oauth2-proxy:
|
||||
image: quay.io/oauth2-proxy/oauth2-proxy:latest
|
||||
command:
|
||||
- --provider=oidc
|
||||
- --oidc-issuer-url=https://<your-issuer>/
|
||||
- --client-id=${OIDC_CLIENT_ID}
|
||||
- --client-secret=${OIDC_CLIENT_SECRET}
|
||||
- --cookie-secret=${OAUTH2_PROXY_COOKIE_SECRET} # openssl rand -base64 32
|
||||
- --upstream=http://certctl-server:8443 # internal-network only; certctl listens on 8443
|
||||
- --http-address=0.0.0.0:4180
|
||||
- --email-domain=*
|
||||
- --pass-access-token=true
|
||||
- --pass-authorization-header=true
|
||||
- --set-authorization-header=true # forwards a bearer token upstream
|
||||
- --skip-provider-button=true
|
||||
- --reverse-proxy=true
|
||||
ports:
|
||||
- "443:4180"
|
||||
depends_on:
|
||||
- certctl-server
|
||||
networks:
|
||||
- certctl-network
|
||||
|
||||
certctl-server:
|
||||
environment:
|
||||
CERTCTL_AUTH_TYPE: none # gateway terminates auth — see docs/upgrade-to-v2-jwt-removal.md
|
||||
# ... rest of the certctl env block unchanged
|
||||
```
|
||||
|
||||
Operators hit `https://<your-host>/`, get redirected through the OIDC provider, land back at oauth2-proxy with a session cookie, and oauth2-proxy proxies their request to certctl on the internal Docker network. certctl itself is HTTPS-only on `:8443` (TLS 1.3, see [`tls.md`](tls.md)) but operator browsers never see that hop directly. Bind certctl-server's `:8443` to the internal Docker network only — do NOT publish it to the host. The audit trail will record the actor as the gateway-forwarded identity if you also configure a small bearer-token-mapping shim at the gateway (most production deployments do this with a per-user api-key issued by the gateway after OIDC validation).
|
||||
|
||||
### Traefik ForwardAuth pattern (Kubernetes)
|
||||
|
||||
Same shape, kubernetes-flavored:
|
||||
|
||||
```yaml
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: oidc-forward-auth
|
||||
spec:
|
||||
forwardAuth:
|
||||
address: http://oauth2-proxy.auth.svc.cluster.local:4180
|
||||
trustForwardHeader: true
|
||||
authResponseHeaders:
|
||||
- X-Auth-Request-User
|
||||
- X-Auth-Request-Email
|
||||
- Authorization
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: certctl
|
||||
spec:
|
||||
routes:
|
||||
- match: Host(`certctl.example.com`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: oidc-forward-auth
|
||||
services:
|
||||
- name: certctl-server
|
||||
port: 8443
|
||||
```
|
||||
|
||||
The certctl Helm release runs with `server.auth.type=none`. The Traefik IngressRoute attaches `oidc-forward-auth` as a middleware so every request is OIDC-validated by oauth2-proxy before reaching certctl.
|
||||
|
||||
### Envoy `ext_authz` pattern
|
||||
|
||||
For service-mesh deployments (Istio, Consul, plain Envoy), the `ext_authz` filter calls out to an external authorization service per-request. Same outcome: certctl runs `CERTCTL_AUTH_TYPE=none` and Envoy + your authz service handle JWT/OIDC/mTLS at the mesh edge. See the [Envoy ext_authz docs](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_authz_filter) for the configuration surface.
|
||||
|
||||
## Rollback
|
||||
|
||||
Pre-G-1 binaries silently accepted `CERTCTL_AUTH_TYPE=jwt` and routed through the api-key middleware. Downgrading the binary is the only mechanical rollback path, and it puts you back into the silent-downgrade state — which is exactly what the G-1 audit finding is about. We don't recommend it. If something is forcing your hand, capture the operational issue you're hitting and open a GitHub issue against the certctl repo with the SHAs involved; the Authenticating-gateway pattern was specifically designed to cover the use cases that historically led operators to set `CERTCTL_AUTH_TYPE=jwt`.
|
||||
|
||||
There is no on-disk state that changes with this upgrade — no migrations to roll back, no encrypted config to re-encode, no certificates to re-issue. The change is entirely in the config-validation surface and the helm-chart template guard.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- [`architecture.md`](architecture.md) — "Authenticating-gateway pattern (JWT, OIDC, mTLS)" section.
|
||||
- [`tls.md`](tls.md) — TLS provisioning patterns. The gateway proxying to certctl-server still needs to trust certctl's TLS cert; same patterns apply.
|
||||
- [`../deploy/helm/certctl/README.md`](../deploy/helm/certctl/README.md) — Helm-chart-flavored guidance.
|
||||
- `internal/config/config.go::ValidAuthTypes` — the single source of truth for what's accepted post-G-1.
|
||||
- `internal/repository/postgres/db.go::wrapPingError` — unrelated; pattern for runtime diagnostic of operator misconfiguration.
|
||||
- `coverage-gap-audit-2026-04-24-v5/unified-audit.md` — the audit finding (`cat-g-jwt_silent_auth_downgrade`).
|
||||
@@ -0,0 +1,55 @@
|
||||
# Deployment Examples
|
||||
|
||||
Five turnkey docker-compose scenarios that show certctl deployed against real CA backends and target shapes. Each subdirectory is self-contained — pick the one closest to your stack and have it running in minutes.
|
||||
|
||||
| Example | Stack | What it shows |
|
||||
|---------|-------|---------------|
|
||||
| [`acme-nginx/`](acme-nginx/acme-nginx.md) | Let's Encrypt + NGINX (HTTP-01) | The default public-CA path: ACME-issued certs deployed to NGINX. |
|
||||
| [`acme-wildcard-dns01/`](acme-wildcard-dns01/acme-wildcard-dns01.md) | Let's Encrypt wildcard (DNS-01) | Wildcard certificates via DNS-01 with pluggable DNS hooks. |
|
||||
| [`private-ca-traefik/`](private-ca-traefik/private-ca-traefik.md) | Local CA + Traefik | Internal-only certs from a private CA, deployed to Traefik. |
|
||||
| [`step-ca-haproxy/`](step-ca-haproxy/step-ca-haproxy.md) | Smallstep step-ca + HAProxy | Self-hosted CA with HAProxy as the deployment target. |
|
||||
| [`multi-issuer/`](multi-issuer/multi-issuer.md) | Let's Encrypt + Local CA | Public + private certs side-by-side from a single dashboard. |
|
||||
|
||||
## Common operational notes
|
||||
|
||||
These notes apply to **every** example. They're called out here so the per-example walkthroughs stay focused on the issuer/target wiring instead of repeating ops boilerplate.
|
||||
|
||||
### Postgres password rotation — first-boot binding trap (U-1)
|
||||
|
||||
Every example file uses `${DB_PASSWORD:-certctl-dev-password}` as the postgres password env var, with the data directory persisted via a named volume. The `postgres:16-alpine` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that's the only time `POSTGRES_PASSWORD` is written into `pg_authid`. If you boot once with the default and then change `DB_PASSWORD` (in your shell, in a `.env` file, or in a wrapper script), the certctl-server container picks up the new value but the postgres container continues to authenticate against the old one. The server fails its startup `db.Ping()` with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01).
|
||||
|
||||
The certctl-server emits guidance pointing at the fix when this fires (see `internal/repository/postgres/db.go::wrapPingError`). The two remediation paths:
|
||||
|
||||
- **Destructive — wipes all certctl data, only acceptable on demo/test setups:**
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down -v
|
||||
docker compose -f examples/<example>/docker-compose.yml up -d --build
|
||||
```
|
||||
- **Non-destructive — preserves data, rotates `pg_authid` in place:**
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml exec postgres \
|
||||
psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"
|
||||
# Then redeploy with DB_PASSWORD set to <new> in your shell or .env
|
||||
```
|
||||
|
||||
The cleanest practice for a fresh demo: set `DB_PASSWORD` once in your shell **before** the very first `docker compose up`, and don't change it during the demo's lifetime. If you must rotate, use the non-destructive path.
|
||||
|
||||
Same root cause and remediation pattern is documented for the canonical quickstart in [`../docs/quickstart.md`](../docs/quickstart.md), the production compose surface in [`../deploy/ENVIRONMENTS.md`](../deploy/ENVIRONMENTS.md), and the Helm chart in [`../deploy/helm/certctl/README.md`](../deploy/helm/certctl/README.md).
|
||||
|
||||
### TLS for the certctl control plane
|
||||
|
||||
Every example boots certctl with HTTPS-only on port 8443 (TLS 1.3 pinned, no plaintext listener as of v2.2). The shipped `certctl-tls-init` init container generates a self-signed ECDSA-P256 cert on first boot — fine for the example demos, **never** acceptable for a public deployment. For production, swap the init container for cert-manager, an operator-supplied Secret, or your internal CA — see [`../docs/tls.md`](../docs/tls.md) for the full pattern matrix.
|
||||
|
||||
### Tearing down
|
||||
|
||||
To stop services but **keep** the postgres volume (so you can pick up where you left off):
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down
|
||||
```
|
||||
|
||||
To stop services **and** wipe all data (clean slate for the next run):
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down -v
|
||||
```
|
||||
|
||||
Note that `down -v` is the only canonical way to recover from the postgres-password trap when the non-destructive `ALTER ROLE` route is unavailable (e.g., you've forgotten the original password).
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl's core use case: **automatically manage TLS certificates for NGINX using Let's Encrypt (ACME HTTP-01 challenges).**
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## What This Does
|
||||
|
||||
- Deploys certctl server (control plane) with PostgreSQL
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
**What this does:** Issues wildcard certificates (e.g., `*.example.com`) from Let's Encrypt using DNS-01 challenge validation.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
This example is ideal for:
|
||||
- Issuing wildcard certificates (`*.example.com`)
|
||||
- Services behind NAT, firewalls, or non-public networks
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl managing **both public and internal certificates from a single dashboard**. Public-facing services use Let's Encrypt (ACME), while internal services use a private Local CA — all visible and managed in one place.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## The Use Case
|
||||
|
||||
You have:
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Private CA + Traefik Example
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
This example demonstrates certctl managing certificates for **internal services without public CA dependency**. Ideal for enterprise environments where:
|
||||
|
||||
- All services are internal (VPN, private networks)
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl managing certificates issued by **Smallstep step-ca** and deploying them to **HAProxy**.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## Scenario
|
||||
|
||||
You're a Smallstep user running step-ca as your internal PKI. You have HAProxy load balancers that need certificates. This setup:
|
||||
|
||||
@@ -893,3 +893,161 @@ func TestAgentReportJobStatus_ServiceError(t *testing.T) {
|
||||
func stringPtr(s string) *string {
|
||||
return &s
|
||||
}
|
||||
|
||||
// G-2 (P1): cat-s5-apikey_leak audit closure tests. Pre-G-2,
|
||||
// Agent.APIKeyHash was tagged `json:"api_key_hash"` and shipped on
|
||||
// every wire surface that returned domain.Agent. Post-G-2 the tag is
|
||||
// "-" and Agent.MarshalJSON enforces redaction via a marshal-time copy
|
||||
// (see internal/domain/connector_test.go for the type-level pin). These
|
||||
// four tests are the wire-shape contract — they capture the actual HTTP
|
||||
// response body via httptest and assert the credential-derivative hash
|
||||
// is absent.
|
||||
//
|
||||
// One sentinel value (g2HandlerLeakSentinel) flows through every fixture
|
||||
// so a single grep over a failing test's output identifies the leak
|
||||
// surface immediately.
|
||||
const g2HandlerLeakSentinel = "sha256:LEAKED-CREDENTIAL-DERIVATIVE-HANDLER-SENTINEL"
|
||||
|
||||
func TestListAgents_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
ListAgentsFn: func(page, perPage int) ([]domain.Agent, int64, error) {
|
||||
return []domain.Agent{
|
||||
{ID: "a-1", Name: "agent-one", Hostname: "host-1",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel + "-1"},
|
||||
{ID: "a-2", Name: "agent-two", Hostname: "host-2",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel + "-2"},
|
||||
}, 2, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents?page=1&per_page=50", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.ListAgents(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("ListAgents status = %d, want 200", w.Code)
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("ListAgents response leaked \"api_key_hash\" key (G-2 regressed):\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("ListAgents response leaked sentinel %q:\n%s", g2HandlerLeakSentinel, body)
|
||||
}
|
||||
// Sanity: the non-leaked fields ARE present (handler did serve real data).
|
||||
for _, want := range []string{"a-1", "a-2", "agent-one", "agent-two"} {
|
||||
if !bytes.Contains([]byte(body), []byte(want)) {
|
||||
t.Errorf("ListAgents response missing expected field %q (handler may not be serving data):\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAgent_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
GetAgentFn: func(id string) (*domain.Agent, error) {
|
||||
return &domain.Agent{
|
||||
ID: id, Name: "single-agent", Hostname: "single.host",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel,
|
||||
}, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents/a-prod-001", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.GetAgent(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("GetAgent status = %d, want 200, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("GetAgent response leaked \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("GetAgent response leaked sentinel:\n%s", body)
|
||||
}
|
||||
if !bytes.Contains([]byte(body), []byte("single-agent")) {
|
||||
t.Errorf("GetAgent response missing the agent name (handler may not be serving data):\n%s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterAgent_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
// Registration is the most likely path for a freshly-hashed key to
|
||||
// leak: the service mints a new APIKeyHash inside RegisterAgent
|
||||
// (service/agent.go:405) and the handler returns the agent struct
|
||||
// verbatim. Pin that the redaction holds even on a "freshly created"
|
||||
// agent payload.
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
RegisterAgentFn: func(in domain.Agent) (*domain.Agent, error) {
|
||||
return &domain.Agent{
|
||||
ID: "agent-new", Name: in.Name, Hostname: in.Hostname,
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel,
|
||||
}, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
body := bytes.NewBufferString(`{"name":"freshly-registered","hostname":"new.host"}`)
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/agents", body)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.RegisterAgent(w, req)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Fatalf("RegisterAgent status = %d, want 201, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
respBody := w.Body.String()
|
||||
if bytes.Contains([]byte(respBody), []byte("api_key_hash")) {
|
||||
t.Errorf("RegisterAgent response leaked \"api_key_hash\" key:\n%s", respBody)
|
||||
}
|
||||
if bytes.Contains([]byte(respBody), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("RegisterAgent response leaked sentinel:\n%s", respBody)
|
||||
}
|
||||
if !bytes.Contains([]byte(respBody), []byte("agent-new")) {
|
||||
t.Errorf("RegisterAgent response missing the new agent ID (handler may not be serving data):\n%s", respBody)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRetiredAgents_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
// I-004 surface — separate handler from ListAgents; same leak risk.
|
||||
now := time.Now()
|
||||
retiredAt := now.Add(-1 * time.Hour)
|
||||
reason := "test cascade"
|
||||
mock := &MockAgentService{
|
||||
ListRetiredAgentsFn: func(page, perPage int) ([]domain.Agent, int64, error) {
|
||||
return []domain.Agent{
|
||||
{ID: "ret-1", Name: "retired-one", Hostname: "host-r1",
|
||||
Status: domain.AgentStatusOffline, RegisteredAt: now,
|
||||
RetiredAt: &retiredAt, RetiredReason: &reason,
|
||||
APIKeyHash: g2HandlerLeakSentinel},
|
||||
}, 1, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents/retired?page=1&per_page=50", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.ListRetiredAgents(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("ListRetiredAgents status = %d, want 200, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("ListRetiredAgents response leaked \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("ListRetiredAgents response leaked sentinel:\n%s", body)
|
||||
}
|
||||
if !bytes.Contains([]byte(body), []byte("ret-1")) {
|
||||
t.Errorf("ListRetiredAgents response missing the retired agent ID:\n%s", body)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,8 +7,14 @@ import (
|
||||
)
|
||||
|
||||
// HealthHandler handles health and readiness check endpoints.
|
||||
//
|
||||
// G-1 (P1): AuthType is one of "api-key" or "none" — see
|
||||
// internal/config.AuthType / config.ValidAuthTypes() for the typed
|
||||
// constants and the rationale for dropping "jwt" (no JWT middleware
|
||||
// ships with certctl; operators who need JWT/OIDC front certctl with
|
||||
// an authenticating gateway and set AuthType="none" on the upstream).
|
||||
type HealthHandler struct {
|
||||
AuthType string // "api-key", "jwt", "none"
|
||||
AuthType string // "api-key" or "none" (see config.AuthType constants)
|
||||
}
|
||||
|
||||
// NewHealthHandler creates a new HealthHandler.
|
||||
|
||||
@@ -162,30 +162,14 @@ func TestAuthInfo_ReturnsAuthType_None(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthInfo_ReturnsAuthType_JWT(t *testing.T) {
|
||||
handler := NewHealthHandler("jwt")
|
||||
|
||||
req, err := http.NewRequest(http.MethodGet, "/api/v1/auth/info", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("NewRequest failed: %v", err)
|
||||
}
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
handler.AuthInfo(w, req)
|
||||
|
||||
var result map[string]interface{}
|
||||
if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
if result["auth_type"] != "jwt" {
|
||||
t.Errorf("auth_type = %q, want jwt", result["auth_type"])
|
||||
}
|
||||
|
||||
if required, ok := result["required"].(bool); !ok || !required {
|
||||
t.Errorf("required = %v, want true", result["required"])
|
||||
}
|
||||
}
|
||||
// G-1 (P1): the prior `TestAuthInfo_ReturnsAuthType_JWT` asserted the
|
||||
// handler echoed "jwt" — using the silent-auth-downgrade value as a
|
||||
// test fixture, which baked the lie into the regression suite. The
|
||||
// test is removed because "jwt" is now rejected at config-load time
|
||||
// (see internal/config/config_test.go::TestValidate_JWTAuth_RejectedDedicated)
|
||||
// and never reaches this handler. The pre-existing
|
||||
// `TestAuthInfo_ReturnsAuthType_APIKey` above (line ~107) covers the
|
||||
// api-key happy path; nothing else needs replacing here.
|
||||
|
||||
func TestAuthCheck_ReturnsOK(t *testing.T) {
|
||||
handler := NewHealthHandler("api-key")
|
||||
|
||||
@@ -117,8 +117,14 @@ func HashAPIKey(key string) string {
|
||||
}
|
||||
|
||||
// AuthConfig holds configuration for the Auth middleware.
|
||||
//
|
||||
// G-1 (P1): valid Type values are "api-key" or "none" only. "jwt" was
|
||||
// removed because no JWT middleware ships with certctl (silent auth
|
||||
// downgrade pre-G-1). The single source of truth for the allowed set
|
||||
// lives at internal/config.AuthType / config.ValidAuthTypes() — prefer
|
||||
// those constants over string literals when comparing.
|
||||
type AuthConfig struct {
|
||||
Type string // "api-key", "jwt", "none"
|
||||
Type string // "api-key" or "none" (see config.AuthType constants)
|
||||
Secret string // The raw API key or comma-separated list of valid API keys
|
||||
}
|
||||
|
||||
|
||||
+80
-12
@@ -802,13 +802,59 @@ type NamedAPIKey struct {
|
||||
Admin bool
|
||||
}
|
||||
|
||||
// AuthType is the discriminator for the API auth middleware shape. The
|
||||
// string alias preserves env-var roundtrip (the value flows through getEnv
|
||||
// as a plain string) while giving us a typed surface for switches and
|
||||
// validation. Use the named constants below rather than string literals
|
||||
// so future enum additions/removals are caught at compile time.
|
||||
//
|
||||
// G-1 (P1): the pre-G-1 validAuthTypes map literal accepted "jwt" with no
|
||||
// JWT middleware behind it (silent auth downgrade — the configured type
|
||||
// was logged as "jwt" but every request routed through the api-key bearer
|
||||
// middleware regardless). Operators who set CERTCTL_AUTH_TYPE=jwt thought
|
||||
// they had JWT auth; they didn't. The typed alias + ValidAuthTypes()
|
||||
// helper make the allowed set the single source of truth across config
|
||||
// validation, the runtime defense-in-depth switch in main.go, and the
|
||||
// helm-chart template guard (`certctl.validateAuthType`).
|
||||
type AuthType string
|
||||
|
||||
const (
|
||||
// AuthTypeAPIKey routes requests through the api-key bearer middleware.
|
||||
// CERTCTL_AUTH_SECRET (or CERTCTL_API_KEYS_NAMED) is required.
|
||||
AuthTypeAPIKey AuthType = "api-key"
|
||||
|
||||
// AuthTypeNone disables authentication entirely. Development only —
|
||||
// the server logs a loud Warn at startup. Operators who need
|
||||
// JWT/OIDC/mTLS run an authenticating gateway (oauth2-proxy / Envoy
|
||||
// ext_authz / Traefik ForwardAuth / Pomerium) in front of certctl
|
||||
// and set this value on the upstream certctl process. See
|
||||
// docs/architecture.md "Authenticating-gateway pattern".
|
||||
AuthTypeNone AuthType = "none"
|
||||
)
|
||||
|
||||
// ValidAuthTypes returns the allowed CERTCTL_AUTH_TYPE values. The set is
|
||||
// intentionally narrow — JWT was accepted pre-G-1 with no middleware
|
||||
// implementation behind it. Single source of truth referenced by the
|
||||
// validator below, the runtime guard in cmd/server/main.go, the helm
|
||||
// chart template (`certctl.validateAuthType`), and the property test in
|
||||
// config_test.go that pins "jwt" out of the slice forever.
|
||||
func ValidAuthTypes() []AuthType {
|
||||
return []AuthType{AuthTypeAPIKey, AuthTypeNone}
|
||||
}
|
||||
|
||||
// AuthConfig contains authentication configuration.
|
||||
type AuthConfig struct {
|
||||
// Type sets the authentication mechanism for the REST API.
|
||||
// Valid values: "api-key" (default, production), "jwt", "none" (development only).
|
||||
// When "api-key", clients must provide Authorization: Bearer <key> header.
|
||||
// "none" requires explicit opt-in via CERTCTL_AUTH_TYPE env var with warning logged.
|
||||
// Valid values: "api-key" (default, production) and "none" (development
|
||||
// only — disables authentication on the API and logs a loud Warn at
|
||||
// startup). For JWT/OIDC, run an authenticating gateway (oauth2-proxy /
|
||||
// Envoy / Traefik ForwardAuth / Pomerium) in front of certctl and set
|
||||
// CERTCTL_AUTH_TYPE=none on the upstream — see docs/architecture.md
|
||||
// "Authenticating-gateway pattern" and docs/upgrade-to-v2-jwt-removal.md.
|
||||
// Setting: CERTCTL_AUTH_TYPE environment variable. Default: "api-key".
|
||||
// Use the AuthType constants (AuthTypeAPIKey / AuthTypeNone) for typed
|
||||
// comparisons; the field stays `string` to preserve env-var roundtrip
|
||||
// shape used by getEnv() and downstream Helm/compose interpolation.
|
||||
Type string
|
||||
|
||||
// Secret is the legacy authentication secret (comma-separated API keys).
|
||||
@@ -1148,18 +1194,40 @@ func (c *Config) Validate() error {
|
||||
return fmt.Errorf("invalid log format: %s", c.Log.Format)
|
||||
}
|
||||
|
||||
// Validate auth type
|
||||
validAuthTypes := map[string]bool{
|
||||
"api-key": true,
|
||||
"jwt": true,
|
||||
"none": true,
|
||||
// Validate auth type.
|
||||
//
|
||||
// G-1 (P1): the pre-G-1 set was {"api-key", "jwt", "none"} with "jwt"
|
||||
// accepted but no JWT middleware shipped — silent auth downgrade.
|
||||
// Post-G-1 we route a literal "jwt" value through a dedicated
|
||||
// rejection that gives operators actionable guidance (the
|
||||
// authenticating-gateway pattern) instead of the generic
|
||||
// "invalid auth type". Then we cross-check against ValidAuthTypes()
|
||||
// so any value outside {api-key, none} surfaces uniformly.
|
||||
if c.Auth.Type == "jwt" {
|
||||
return fmt.Errorf(
|
||||
"CERTCTL_AUTH_TYPE=jwt is no longer accepted (G-1 silent auth " +
|
||||
"downgrade): no JWT middleware ships with certctl. To use " +
|
||||
"JWT/OIDC, run an authenticating gateway (oauth2-proxy / " +
|
||||
"Envoy ext_authz / Traefik ForwardAuth / Pomerium) in " +
|
||||
"front of certctl and set CERTCTL_AUTH_TYPE=none on the " +
|
||||
"upstream. See docs/architecture.md \"Authenticating-" +
|
||||
"gateway pattern\" and docs/upgrade-to-v2-jwt-removal.md " +
|
||||
"for the migration walkthrough")
|
||||
}
|
||||
if !validAuthTypes[c.Auth.Type] {
|
||||
return fmt.Errorf("invalid auth type: %s", c.Auth.Type)
|
||||
authTypeValid := false
|
||||
for _, t := range ValidAuthTypes() {
|
||||
if AuthType(c.Auth.Type) == t {
|
||||
authTypeValid = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !authTypeValid {
|
||||
return fmt.Errorf("invalid auth type: %s (valid: %v)", c.Auth.Type, ValidAuthTypes())
|
||||
}
|
||||
|
||||
// If using JWT or API-key, secret is required
|
||||
if (c.Auth.Type == "jwt" || c.Auth.Type == "api-key") && c.Auth.Secret == "" {
|
||||
// If using API-key, secret is required. (Secret was previously also
|
||||
// required for "jwt"; removed with the jwt rejection above.)
|
||||
if c.Auth.Type == string(AuthTypeAPIKey) && c.Auth.Secret == "" {
|
||||
return fmt.Errorf("auth secret is required for auth type %s", c.Auth.Type)
|
||||
}
|
||||
|
||||
|
||||
+149
-15
@@ -458,6 +458,8 @@ func TestValidate_InvalidAuthType(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -477,6 +479,8 @@ func TestValidate_APIKeyAuth_MissingSecret(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -484,25 +488,133 @@ func TestValidate_APIKeyAuth_MissingSecret(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidate_JWTAuth_MissingSecret(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: "jwt", Secret: ""},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
},
|
||||
// TestValidate_JWTAuth_RejectedDedicated locks down the G-1 fix: pre-G-1
|
||||
// `CERTCTL_AUTH_TYPE=jwt` was accepted by the validator (the bare error
|
||||
// path was the empty-secret one previously). Post-G-1 the literal "jwt"
|
||||
// value is rejected with a dedicated diagnostic regardless of whether
|
||||
// Secret is set, because there is no JWT middleware in the binary —
|
||||
// operators who need JWT/OIDC must front certctl with an authenticating
|
||||
// gateway.
|
||||
//
|
||||
// Two table rows pin the contract: missing-secret cannot paper over the
|
||||
// rejection (the dedicated error fires first, before the secret check),
|
||||
// and a populated secret also cannot paper over it. Both paths must
|
||||
// hit the dedicated G-1 diagnostic, not the generic "invalid auth
|
||||
// type" or "auth secret is required".
|
||||
func TestValidate_JWTAuth_RejectedDedicated(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
secret string
|
||||
}{
|
||||
{"jwt rejected (no secret)", ""},
|
||||
{"jwt rejected (with secret — operator can't paper over)", "anything"},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Error("Validate() should return error when jwt auth has empty secret")
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: "jwt", Secret: tc.secret},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
if err == nil {
|
||||
t.Fatal("Validate() returned nil; expected dedicated G-1 rejection")
|
||||
}
|
||||
const wantSubstr = "CERTCTL_AUTH_TYPE=jwt is no longer accepted"
|
||||
if !strings.Contains(err.Error(), wantSubstr) {
|
||||
t.Errorf("Validate() = %v\nwant substring %q (the dedicated G-1 diagnostic)", err, wantSubstr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidAuthTypesDoesNotContainJWT is a property-level guard against
|
||||
// a future PR silently re-introducing "jwt" into the allowed set. If
|
||||
// someone adds JWT back to ValidAuthTypes(), this test fails immediately
|
||||
// with a pointer at the audit finding. The matching CI grep guardrail
|
||||
// in .github/workflows/ci.yml provides a secondary check at build time.
|
||||
func TestValidAuthTypesDoesNotContainJWT(t *testing.T) {
|
||||
t.Parallel()
|
||||
for _, at := range ValidAuthTypes() {
|
||||
if at == "jwt" {
|
||||
t.Fatalf("jwt is in ValidAuthTypes — silent auth downgrade regressed (G-1)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidAuthTypesIsExactly_APIKey_None pins the current allowed set.
|
||||
// If a future change adds a new auth type, this test must be updated
|
||||
// alongside the validator and the helm-chart `validateAuthType` helper —
|
||||
// keeping all three surfaces in sync.
|
||||
func TestValidAuthTypesIsExactly_APIKey_None(t *testing.T) {
|
||||
t.Parallel()
|
||||
got := ValidAuthTypes()
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("ValidAuthTypes() returned %d entries, want 2: %v", len(got), got)
|
||||
}
|
||||
want := map[AuthType]bool{AuthTypeAPIKey: true, AuthTypeNone: true}
|
||||
for _, at := range got {
|
||||
if !want[at] {
|
||||
t.Errorf("unexpected auth type in ValidAuthTypes: %q", at)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidate_GenericInvalidAuthType ensures that values outside the
|
||||
// allowed set (other than the special-cased "jwt") still surface the
|
||||
// generic "invalid auth type" error. Pins that the dedicated G-1
|
||||
// rejection didn't accidentally swallow non-jwt typos.
|
||||
func TestValidate_GenericInvalidAuthType(t *testing.T) {
|
||||
t.Parallel()
|
||||
for _, badType := range []string{"", "garbage", "oidc", "mtls", "API-KEY"} {
|
||||
t.Run("type="+badType, func(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: badType, Secret: "x"},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
if err == nil {
|
||||
t.Fatalf("Validate(type=%q) returned nil; expected invalid-auth-type rejection", badType)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "invalid auth type") {
|
||||
t.Errorf("Validate(type=%q) = %v; want \"invalid auth type\" error", badType, err)
|
||||
}
|
||||
if strings.Contains(err.Error(), "G-1 silent auth") {
|
||||
t.Errorf("Validate(type=%q) = %v; should not hit the dedicated G-1 path for non-jwt values", badType, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// G-1 (P1): no need to add `TestValidate_NoneAuth_AcceptsEmptySecret` or
|
||||
// `TestValidate_APIKeyAuth_RequiresSecret` here — the pre-existing tests
|
||||
// `TestValidate_AuthTypeNone` (above) and `TestValidate_APIKeyAuth_MissingSecret`
|
||||
// (above) already cover those paths. Documented for the next reader: the
|
||||
// G-1 fix flipped jwt off but did not disturb either the
|
||||
// none-bypasses-secret or the api-key-requires-secret behavior.
|
||||
|
||||
func TestValidate_InvalidKeygenMode(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
@@ -515,6 +627,8 @@ func TestValidate_InvalidKeygenMode(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -544,6 +658,8 @@ func TestValidate_InvalidPort(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -574,6 +690,8 @@ func TestValidate_TLSCertPathEmpty(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -605,6 +723,8 @@ func TestValidate_TLSKeyPathEmpty(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -637,6 +757,8 @@ func TestValidate_TLSCertFileMissing(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -668,6 +790,8 @@ func TestValidate_TLSKeyFileMissing(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -701,6 +825,8 @@ func TestValidate_TLSMismatchedPair(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -724,6 +850,8 @@ func TestValidate_EmptyDatabaseURL(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -743,6 +871,8 @@ func TestValidate_InvalidLogLevel(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -762,6 +892,8 @@ func TestValidate_InvalidLogFormat(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -840,6 +972,8 @@ func TestValidate_DatabaseMaxConnectionsZero(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
|
||||
@@ -46,7 +46,26 @@ type Agent struct {
|
||||
Status AgentStatus `json:"status"`
|
||||
LastHeartbeatAt *time.Time `json:"last_heartbeat_at,omitempty"`
|
||||
RegisteredAt time.Time `json:"registered_at"`
|
||||
APIKeyHash string `json:"api_key_hash"`
|
||||
// APIKeyHash is the SHA-256 of the agent's plaintext API key,
|
||||
// populated by service.RegisterAgent (`hashAPIKey(apiKey)`) and
|
||||
// consumed by repository.AgentRepository::GetByAPIKey at auth time.
|
||||
// It is server-internal: never serialized to clients, never echoed
|
||||
// via CLI / MCP / agent registration response, never logged.
|
||||
//
|
||||
// G-2 (P1): pre-G-2 the field was tagged `json:"api_key_hash"` and
|
||||
// shipped on every /api/v1/agents response (cat-s5-apikey_leak). Even
|
||||
// SHA-256 should not be shipped to clients — it gives an offline
|
||||
// brute-force target if API-key entropy is low (certctl doesn't enforce
|
||||
// a minimum on operator-supplied keys), and there is no business reason
|
||||
// for any client to ever receive it. Post-G-2 the JSON tag is "-" and
|
||||
// Agent.MarshalJSON below zeroes the field on a copy before delegating
|
||||
// to the default marshal — defense in depth so a future tag-revert by
|
||||
// refactor cannot reopen the leak. The DB column, repo SELECT/INSERT/
|
||||
// UPDATE paths, and service-side hashing are unchanged. See
|
||||
// docs/architecture.md ER diagram (which documents DB shape, not API
|
||||
// shape) and coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
// cat-s5-apikey_leak for the full closure rationale.
|
||||
APIKeyHash string `json:"-"`
|
||||
OS string `json:"os"`
|
||||
Architecture string `json:"architecture"`
|
||||
IPAddress string `json:"ip_address"`
|
||||
@@ -60,6 +79,31 @@ type Agent struct {
|
||||
RetiredReason *string `json:"retired_reason,omitempty"`
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler. It explicitly zeros APIKeyHash
|
||||
// before serialization to defense-in-depth the `json:"-"` tag above.
|
||||
//
|
||||
// G-2 (P1): pre-G-2 the field was tagged `json:"api_key_hash"` and
|
||||
// shipped on every /api/v1/agents response (cat-s5-apikey_leak). Post-G-2
|
||||
// the tag is "-" and this method enforces redaction even if the tag is
|
||||
// reverted by a future refactor — the receiver is by-value so the
|
||||
// APIKeyHash = "" assignment mutates only the marshal-time copy, never
|
||||
// the caller's original. The type-alias trick (`type alias Agent`)
|
||||
// breaks the recursive MarshalJSON call that would otherwise stack-
|
||||
// overflow. Both *Agent and Agent receivers route through here because
|
||||
// the json package looks the method up via reflect.Value, and a value
|
||||
// receiver satisfies both kinds of pointer.
|
||||
//
|
||||
// Auditor's note for the next reader: do NOT remove this method even if
|
||||
// the json:"-" tag stays. The CI guardrail at .github/workflows/ci.yml
|
||||
// also blocks reintroduction at the tag site, but this method is the
|
||||
// last line of defense for serialization paths that bypass struct tags
|
||||
// (e.g., a future MarshalJSON on a parent struct that embeds Agent).
|
||||
func (a Agent) MarshalJSON() ([]byte, error) {
|
||||
type alias Agent // breaks recursion: alias has no MarshalJSON method
|
||||
a.APIKeyHash = ""
|
||||
return json.Marshal(alias(a))
|
||||
}
|
||||
|
||||
// IsRetired returns true when this agent has been soft-retired.
|
||||
// I-004: callers that iterate active agents (stats dashboard, stale-offline
|
||||
// sweeper, handler-facing list) must skip retired rows by default.
|
||||
|
||||
@@ -1,10 +1,22 @@
|
||||
package domain
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// jsonMarshalDirect / jsonUnmarshalDirect are thin aliases for the
|
||||
// stdlib encoding/json calls. They exist only to make the G-2 redaction
|
||||
// tests below grep-friendly: any future test author searching for "how
|
||||
// is APIKeyHash redaction tested" will land on these and the call sites,
|
||||
// rather than having to grep through dozens of unrelated json.Marshal
|
||||
// usages.
|
||||
func jsonMarshalDirect(v interface{}) ([]byte, error) { return json.Marshal(v) }
|
||||
func jsonUnmarshalDirect(data []byte, v interface{}) error { return json.Unmarshal(data, v) }
|
||||
func containsSubstr(haystack, needle string) bool { return strings.Contains(haystack, needle) }
|
||||
|
||||
// TestAgent_IsRetired covers the I-004 soft-retirement predicate that gates
|
||||
// which callers hide an agent row from active listings.
|
||||
func TestAgent_IsRetired(t *testing.T) {
|
||||
@@ -53,3 +65,172 @@ func TestAgentDependencyCounts_HasDependencies(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// G-2 (P1): the cat-s5-apikey_leak audit closure tests. Pre-G-2,
|
||||
// Agent.APIKeyHash was tagged `json:"api_key_hash"` and shipped on
|
||||
// every /api/v1/agents response — credential-derivative leak that gave
|
||||
// offline brute-force targets to every authenticated client. Post-G-2
|
||||
// the tag is "-" AND Agent.MarshalJSON zeroes the field on a marshal-
|
||||
// time copy. These tests pin both layers of the defense:
|
||||
//
|
||||
// 1. A populated APIKeyHash is never present in the marshaled JSON.
|
||||
// 2. The redaction holds on *Agent, on slice elements, and on a
|
||||
// sentinel literal-value check (so even a future field that
|
||||
// happens to contain the same hash string would not appear).
|
||||
// 3. The marshal-time copy does not mutate the caller's original —
|
||||
// receiver is by-value, but pin it explicitly so a future refactor
|
||||
// that switches to pointer-receiver gets caught.
|
||||
// 4. Round-trip preserves every other field (hash dropped on encode,
|
||||
// cannot reappear on decode because the wire never carries it).
|
||||
|
||||
const g2LeakSentinel = "sha256:LEAKED-CREDENTIAL-DERIVATIVE-SENTINEL"
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsAPIKeyHash is the marshal-boundary
|
||||
// contract test: a single Agent value with a populated APIKeyHash must
|
||||
// not emit the field name nor the sentinel value.
|
||||
func TestAgent_MarshalJSON_RedactsAPIKeyHash(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := Agent{
|
||||
ID: "agent-test",
|
||||
Name: "test-agent",
|
||||
Hostname: "host.example",
|
||||
Status: AgentStatusOnline,
|
||||
RegisteredAt: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
APIKeyHash: g2LeakSentinel,
|
||||
OS: "linux",
|
||||
Architecture: "amd64",
|
||||
IPAddress: "10.0.0.1",
|
||||
Version: "v2.0.49",
|
||||
}
|
||||
out, err := jsonMarshalDirect(a)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
body := string(out)
|
||||
if containsSubstr(body, "api_key_hash") {
|
||||
t.Errorf("marshaled body contains \"api_key_hash\" key — G-2 leak regressed:\n%s", body)
|
||||
}
|
||||
if containsSubstr(body, "APIKeyHash") {
|
||||
t.Errorf("marshaled body contains \"APIKeyHash\" — type-alias redaction broke:\n%s", body)
|
||||
}
|
||||
if containsSubstr(body, g2LeakSentinel) {
|
||||
t.Errorf("marshaled body contains the leak sentinel %q — value redaction broke:\n%s", g2LeakSentinel, body)
|
||||
}
|
||||
// Sanity: every OTHER non-zero field IS present (this guards against
|
||||
// the type-alias trick accidentally dropping siblings).
|
||||
for _, want := range []string{"agent-test", "test-agent", "host.example", "Online", "linux", "amd64", "10.0.0.1", "v2.0.49"} {
|
||||
if !containsSubstr(body, want) {
|
||||
t.Errorf("marshaled body missing expected field value %q:\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsViaPointer covers the *Agent path that
|
||||
// handlers hit when calling JSON(w, http.StatusOK, agent) with a *Agent
|
||||
// from svc.GetAgent. A value-receiver MarshalJSON is reachable from
|
||||
// pointer values via reflect; this test pins that contract.
|
||||
func TestAgent_MarshalJSON_RedactsViaPointer(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := &Agent{ID: "agent-x", APIKeyHash: g2LeakSentinel}
|
||||
out, err := jsonMarshalDirect(a)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal *Agent returned error: %v", err)
|
||||
}
|
||||
if containsSubstr(string(out), g2LeakSentinel) {
|
||||
t.Errorf("*Agent marshal leaked sentinel:\n%s", string(out))
|
||||
}
|
||||
if containsSubstr(string(out), "api_key_hash") {
|
||||
t.Errorf("*Agent marshal contains \"api_key_hash\" key:\n%s", string(out))
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsInSlice covers the []domain.Agent path
|
||||
// the ListAgents handler emits via PagedResponse{Data: agents}. Each
|
||||
// element must be redacted independently.
|
||||
func TestAgent_MarshalJSON_RedactsInSlice(t *testing.T) {
|
||||
t.Parallel()
|
||||
agents := []Agent{
|
||||
{ID: "agent-1", APIKeyHash: g2LeakSentinel + "-1"},
|
||||
{ID: "agent-2", APIKeyHash: g2LeakSentinel + "-2"},
|
||||
{ID: "agent-3", APIKeyHash: g2LeakSentinel + "-3"},
|
||||
}
|
||||
out, err := jsonMarshalDirect(agents)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal []Agent returned error: %v", err)
|
||||
}
|
||||
body := string(out)
|
||||
if containsSubstr(body, "api_key_hash") {
|
||||
t.Errorf("[]Agent marshal contains \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
for i := 1; i <= 3; i++ {
|
||||
sentinel := g2LeakSentinel + "-" + string(rune('0'+i))
|
||||
if containsSubstr(body, sentinel) {
|
||||
t.Errorf("[]Agent marshal leaked sentinel %q:\n%s", sentinel, body)
|
||||
}
|
||||
}
|
||||
// Every agent ID is present — the redaction didn't accidentally
|
||||
// strip the entire element.
|
||||
for _, id := range []string{"agent-1", "agent-2", "agent-3"} {
|
||||
if !containsSubstr(body, id) {
|
||||
t.Errorf("[]Agent marshal missing element ID %q:\n%s", id, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_DoesNotMutateReceiver pins the by-value-receiver
|
||||
// contract: marshaling must not zero APIKeyHash on the caller's struct,
|
||||
// only on the marshal-time copy. This guards against a future refactor
|
||||
// that switches to pointer receiver and breaks every code path that
|
||||
// marshals an Agent and then re-uses it (e.g., audit-event payload
|
||||
// construction immediately after returning the agent in a handler).
|
||||
func TestAgent_MarshalJSON_DoesNotMutateReceiver(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := Agent{ID: "agent-keep", APIKeyHash: g2LeakSentinel}
|
||||
if _, err := jsonMarshalDirect(a); err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
if a.APIKeyHash != g2LeakSentinel {
|
||||
t.Errorf("MarshalJSON mutated caller's APIKeyHash: got %q want %q", a.APIKeyHash, g2LeakSentinel)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RoundTrip pins the wire-shape contract: an
|
||||
// Agent marshaled to JSON and unmarshaled back into a fresh Agent has
|
||||
// every field preserved EXCEPT APIKeyHash, which the wire never carries.
|
||||
// This double-confirms the redaction is a one-way guarantee at the
|
||||
// serialization boundary, not an accidental on-decode behavior.
|
||||
func TestAgent_MarshalJSON_RoundTrip(t *testing.T) {
|
||||
t.Parallel()
|
||||
now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)
|
||||
hb := now.Add(-5 * time.Minute)
|
||||
original := Agent{
|
||||
ID: "agent-rt",
|
||||
Name: "rt",
|
||||
Hostname: "rt.host",
|
||||
Status: AgentStatusOnline,
|
||||
LastHeartbeatAt: &hb,
|
||||
RegisteredAt: now,
|
||||
APIKeyHash: g2LeakSentinel,
|
||||
OS: "linux",
|
||||
Architecture: "arm64",
|
||||
IPAddress: "10.0.0.99",
|
||||
Version: "v2.0.49",
|
||||
}
|
||||
out, err := jsonMarshalDirect(original)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
var got Agent
|
||||
if err := jsonUnmarshalDirect(out, &got); err != nil {
|
||||
t.Fatalf("Unmarshal returned error: %v", err)
|
||||
}
|
||||
if got.APIKeyHash != "" {
|
||||
t.Errorf("APIKeyHash survived round-trip: got %q want empty (the wire must not carry it)", got.APIKeyHash)
|
||||
}
|
||||
if got.ID != original.ID || got.Name != original.Name || got.Hostname != original.Hostname {
|
||||
t.Errorf("identity fields lost in round-trip: got %+v want %+v", got, original)
|
||||
}
|
||||
if got.OS != original.OS || got.Architecture != original.Architecture || got.IPAddress != original.IPAddress {
|
||||
t.Errorf("metadata fields lost in round-trip: got %+v want %+v", got, original)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,14 +2,26 @@ package postgres
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 —
|
||||
// invalid_password — emitted by PostgreSQL when the client presents
|
||||
// credentials that don't match pg_authid. Defined locally because the
|
||||
// lib/pq package does not export named constants for SQLSTATE codes (it
|
||||
// only exposes the typed string alias pq.ErrorCode and a name-lookup map
|
||||
// at runtime). Pinned as a string constant rather than a pq.ErrorCode
|
||||
// literal so the contract is grep-able from operator-facing log lines.
|
||||
//
|
||||
// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html
|
||||
const pgErrInvalidPassword = "28P01"
|
||||
|
||||
// NewDB opens a PostgreSQL database connection and sets up connection pooling.
|
||||
func NewDB(connStr string) (*sql.DB, error) {
|
||||
db, err := sql.Open("postgres", connStr)
|
||||
@@ -23,12 +35,65 @@ func NewDB(connStr string) (*sql.DB, error) {
|
||||
|
||||
// Ping to verify connection
|
||||
if err := db.Ping(); err != nil {
|
||||
return nil, fmt.Errorf("failed to ping database: %w", err)
|
||||
return nil, wrapPingError(err)
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// wrapPingError converts a db.Ping() failure into an operator-friendly
|
||||
// diagnostic. The default wrap is the original opaque
|
||||
// `"failed to ping database: <inner>"` shape. The exception is SQLSTATE 28P01
|
||||
// (invalid_password): when postgres rejects the server's credentials we emit
|
||||
// extended guidance that names the most common operator misstep — editing
|
||||
// POSTGRES_PASSWORD in `.env` after the postgres named volume has already
|
||||
// been initialized — and lists both the destructive (`docker compose down -v`)
|
||||
// and non-destructive (`ALTER ROLE`) remediations.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding. The postgres
|
||||
// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty;
|
||||
// on subsequent boots the password baked into pg_authid on first boot wins
|
||||
// over whatever the env var carries, so the env-vs-pg_authid divergence is
|
||||
// intrinsic to how the postgres image bootstraps and cannot be fixed by us
|
||||
// upstream of pg_authid. The ergonomic answer is to surface a clear
|
||||
// diagnostic at the failure site so operators don't waste an hour on
|
||||
// "is my password right" before discovering the volume needs to be torn
|
||||
// down (or the role's password rotated in-place).
|
||||
//
|
||||
// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using
|
||||
// errors.As(err, &*pq.Error) on the returned value continue to work; this
|
||||
// matches the audit's "no substring matching on err.Error()" requirement
|
||||
// from the M-1 sentinel-error migration.
|
||||
//
|
||||
// Returns nil when err is nil so callers can defensively pipe through this
|
||||
// helper without an extra branch.
|
||||
func wrapPingError(err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var pqErr *pq.Error
|
||||
if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword {
|
||||
return fmt.Errorf(
|
||||
"failed to ping database: postgres rejected the configured credentials "+
|
||||
"(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+
|
||||
"on a docker-compose deploy, the postgres container's data volume still "+
|
||||
"holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+
|
||||
"only on first boot of a fresh data dir, so editing the env var after that "+
|
||||
"point updates only the certctl-server container. Reset destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml down -v && "+
|
||||
"docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+
|
||||
"all data in the postgres volume), or non-destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml exec postgres "+
|
||||
"psql -U certctl -c \"ALTER ROLE certctl PASSWORD '<new-password>';\"` "+
|
||||
"and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w",
|
||||
pgErrInvalidPassword, err)
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to ping database: %w", err)
|
||||
}
|
||||
|
||||
// RunMigrations reads and executes SQL migration files from a directory.
|
||||
func RunMigrations(db *sql.DB, migrationsPath string) error {
|
||||
// Check if migrations directory exists
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
// Internal-package tests for db.go — covers the diagnostic dispatch in
|
||||
// wrapPingError. Lives in `package postgres` (not `postgres_test`) so it can
|
||||
// call the unexported helper directly without exposing it on the API surface.
|
||||
//
|
||||
// Sibling integration tests in this directory live in `package postgres_test`
|
||||
// (testcontainers-driven, schema-per-test). They exercise the live-DB
|
||||
// happy path; this file owns the unit-level diagnostic dispatch and runs in
|
||||
// `-short` mode without spinning up postgres.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding by pinning the
|
||||
// post-fix wrap-text contract for `db.Ping()` failures. Pre-U-1 every Ping
|
||||
// error was wrapped with the same opaque `"failed to ping database: %w"`,
|
||||
// so an operator who edited POSTGRES_PASSWORD after first-boot saw only
|
||||
// `pq: password authentication failed for user "certctl"` in the server
|
||||
// log with no pointer to the actual cause (postgres data dir retains the
|
||||
// initial password from first-boot initdb; subsequent boots ignore the env
|
||||
// var). Post-U-1 the SQLSTATE-28P01 path emits a multi-line diagnostic
|
||||
// pointing at the down -v / ALTER ROLE remediation; non-auth failures
|
||||
// retain the original wrap shape so verbose noise does not bleed into
|
||||
// transient connection-refused / timeout paths.
|
||||
package postgres
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// TestWrapPingError_AuthFailureGuidance asserts the diagnostic wrap fires on
|
||||
// SQLSTATE 28P01 (invalid_password) and contains all three contract elements:
|
||||
// the SQLSTATE code (so operators can grep), the down-v destructive
|
||||
// remediation, and the ALTER ROLE non-destructive remediation. Also asserts
|
||||
// the wrap chain still satisfies errors.As(err, &*pq.Error) so callers that
|
||||
// programmatically inspect the underlying postgres error code keep working.
|
||||
func TestWrapPingError_AuthFailureGuidance(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("28P01"),
|
||||
Message: `password authentication failed for user "certctl"`,
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Contract elements — the operator-facing string is what we ship.
|
||||
wantSubstrings := []string{
|
||||
"SQLSTATE 28P01", // operators grep on this
|
||||
"POSTGRES_PASSWORD", // names the variable that traps
|
||||
"first boot", // the mechanism in plain language
|
||||
"down -v", // destructive remediation
|
||||
"ALTER ROLE", // non-destructive remediation
|
||||
}
|
||||
for _, s := range wantSubstrings {
|
||||
if !strings.Contains(got, s) {
|
||||
t.Errorf("wrap text missing %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain must still expose the underlying *pq.Error for callers
|
||||
// that want to inspect Code / Detail / Constraint fields. Pre-fix
|
||||
// callers used errors.As(err, &pqErr) on the unwrapped Ping result;
|
||||
// the new wrap is fmt.Errorf("...%w", err) so errors.As must walk it.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "28P01" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "28P01")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonAuthErrorPreservesOriginalWrap guards against the
|
||||
// guidance text bleeding into unrelated failure modes. SQLSTATE 08006
|
||||
// (connection_failure) is the canonical non-auth case — server unreachable,
|
||||
// TLS handshake failure, network drop. The wrap should be the original
|
||||
// shape so transient-error log noise does not include the (now lengthy)
|
||||
// volume-state remediation paragraph.
|
||||
func TestWrapPingError_NonAuthErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("08006"),
|
||||
Message: "connection refused",
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Original-wrap shape: prefix only, no guidance text.
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
|
||||
// Negative assertions: guidance text MUST NOT appear on non-auth paths.
|
||||
mustNotContain := []string{
|
||||
"SQLSTATE 08006", // we only call out 28P01 specifically
|
||||
"POSTGRES_PASSWORD",
|
||||
"down -v",
|
||||
"ALTER ROLE",
|
||||
}
|
||||
for _, s := range mustNotContain {
|
||||
if strings.Contains(got, s) {
|
||||
t.Errorf("non-auth wrap leaked guidance substring %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain still walks for errors.As — same contract as auth path.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from non-auth wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "08006" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "08006")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonPqErrorPreservesOriginalWrap guards the network-level
|
||||
// case: a pre-handshake failure (TCP refused, DNS, TLS) returns a
|
||||
// non-*pq.Error from db.Ping(). errors.As must return false, the helper
|
||||
// must fall through to the generic wrap, and the chain must remain walkable.
|
||||
func TestWrapPingError_NonPqErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := errors.New("dial tcp 127.0.0.1:5432: connect: connection refused")
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
if strings.Contains(got, "SQLSTATE") || strings.Contains(got, "POSTGRES_PASSWORD") {
|
||||
t.Errorf("network-level wrap leaked SQLSTATE/postgres guidance\ngot: %s", got)
|
||||
}
|
||||
if !errors.Is(wrapped, original) {
|
||||
t.Errorf("errors.Is did not walk to original sentinel: %v", wrapped)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NilReturnsNil — defensive contract: if Ping returned nil
|
||||
// (no failure), the helper must not synthesize a fake error. This isn't on
|
||||
// the documented call path (NewDB only invokes wrapPingError inside the
|
||||
// `if err != nil` branch), but pinning it prevents a future refactor from
|
||||
// regressing the contract silently.
|
||||
func TestWrapPingError_NilReturnsNil(t *testing.T) {
|
||||
t.Parallel()
|
||||
if got := wrapPingError(nil); got != nil {
|
||||
t.Errorf("wrapPingError(nil) = %v, want nil", got)
|
||||
}
|
||||
}
|
||||
@@ -51,6 +51,15 @@ export interface CertificateVersion {
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
// G-2 (P1): `api_key_hash` is intentionally absent from this interface.
|
||||
// The server-side struct (internal/domain/connector.go::Agent) carries
|
||||
// the field for the auth-lookup path but redacts it via a custom
|
||||
// MarshalJSON so it never reaches the JSON wire. Adding `api_key_hash`
|
||||
// here would not magically populate it on the wire — and would mislead
|
||||
// future contributors into thinking the field is part of the public
|
||||
// API contract. See docs/architecture.md ER-diagram note and
|
||||
// coverage-gap-audit-2026-04-24-v5/unified-audit.md cat-s5-apikey_leak
|
||||
// for the closure rationale.
|
||||
export interface Agent {
|
||||
id: string;
|
||||
name: string;
|
||||
|
||||
Reference in New Issue
Block a user