mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-08 03:01:32 +00:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| aa6fafdee9 | |||
| 86fffa305a | |||
| e17788355b | |||
| 87213128cc | |||
| 697fa792ea | |||
| 9c1d446e40 | |||
| 3192cd15c5 | |||
| af47d19ae2 | |||
| cfc234ec42 | |||
| a91197014f |
+25
-4
@@ -13,22 +13,43 @@ POSTGRES_PASSWORD=change-me-in-production
|
||||
# Certctl Server
|
||||
# All server vars use the CERTCTL_ prefix (see internal/config/config.go)
|
||||
# ==============================================================================
|
||||
CERTCTL_DATABASE_URL=postgres://certctl:certctl@postgres:5432/certctl?sslmode=disable
|
||||
# IMPORTANT: keep the password segment of CERTCTL_DATABASE_URL in sync with
|
||||
# POSTGRES_PASSWORD above. If you deploy via `deploy/docker-compose.yml`,
|
||||
# this value is *overridden* by the compose file's
|
||||
# `postgres://certctl:${POSTGRES_PASSWORD:-certctl}@postgres:5432/...`
|
||||
# interpolation — but if you run the binary directly with this .env loaded
|
||||
# (e.g. `set -a; source .env; ./certctl-server`), update *both* lines.
|
||||
# Background: editing POSTGRES_PASSWORD after the postgres data directory
|
||||
# has been initialized once does NOT rotate the password — initdb only
|
||||
# seeds pg_authid on first boot of an empty volume. See docs/quickstart.md
|
||||
# "Warning" callout and `internal/repository/postgres/db.go::wrapPingError`
|
||||
# for the SQLSTATE 28P01 diagnostic that fires when the two drift.
|
||||
CERTCTL_DATABASE_URL=postgres://certctl:change-me-in-production@postgres:5432/certctl?sslmode=disable
|
||||
CERTCTL_SERVER_HOST=0.0.0.0
|
||||
CERTCTL_SERVER_PORT=8443
|
||||
CERTCTL_LOG_LEVEL=info
|
||||
CERTCTL_LOG_FORMAT=json
|
||||
|
||||
# Auth type: "api-key", "jwt", or "none" (for demo/development)
|
||||
# Auth type: "api-key" (production) or "none" (demo/development).
|
||||
# For JWT/OIDC, run an authenticating gateway in front of certctl
|
||||
# (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium) and
|
||||
# set CERTCTL_AUTH_TYPE=none on the upstream — see
|
||||
# docs/architecture.md "Authenticating-gateway pattern". G-1 removed
|
||||
# the in-process "jwt" option (no JWT middleware shipped — silent auth
|
||||
# downgrade); see docs/upgrade-to-v2-jwt-removal.md if you previously
|
||||
# set CERTCTL_AUTH_TYPE=jwt.
|
||||
CERTCTL_AUTH_TYPE=none
|
||||
# Required when CERTCTL_AUTH_TYPE is "api-key" or "jwt"
|
||||
# Required when CERTCTL_AUTH_TYPE is "api-key".
|
||||
# Generate with: openssl rand -base64 32
|
||||
# CERTCTL_AUTH_SECRET=change-me-in-production
|
||||
|
||||
# ==============================================================================
|
||||
# Certctl Agent
|
||||
# ==============================================================================
|
||||
CERTCTL_SERVER_URL=http://localhost:8443
|
||||
# HTTPS-only as of v2.2 (TLS 1.3 pinned). Agents reject http:// URLs at
|
||||
# startup. Use the docker-compose self-signed bootstrap CA bundle from
|
||||
# `deploy/test/certs/ca.crt` or supply your own via CERTCTL_SERVER_CA_BUNDLE_PATH.
|
||||
CERTCTL_SERVER_URL=https://localhost:8443
|
||||
CERTCTL_API_KEY=change-me-in-production
|
||||
CERTCTL_AGENT_NAME=local-agent
|
||||
|
||||
|
||||
@@ -44,6 +44,175 @@ jobs:
|
||||
- name: Run govulncheck
|
||||
run: govulncheck ./...
|
||||
|
||||
- name: Forbidden auth-type literal regression guard (G-1)
|
||||
# G-1 closed the JWT silent auth downgrade by removing "jwt" from the
|
||||
# accepted CERTCTL_AUTH_TYPE values. This step grep-fails the build
|
||||
# if "jwt" reappears in any of the *additive* auth-type surfaces:
|
||||
# the validAuthTypes / ValidAuthTypes() set, the OpenAPI enum, the
|
||||
# helm chart's allowed-types list, or the .env.example default.
|
||||
# Comment lines and the dedicated rejection branch in config.go
|
||||
# (`c.Auth.Type == "jwt"`) are intentionally exempt — those are the
|
||||
# G-1 fix itself, not a regression.
|
||||
#
|
||||
# Connector packages (internal/connector/) are exempt because the
|
||||
# Google OAuth2 service-account JWT and step-ca provisioner one-
|
||||
# time-token JWT are external-protocol uses, unrelated to certctl's
|
||||
# own auth shape. Test files (_test.go) are exempt so negative
|
||||
# tests can pass the literal.
|
||||
#
|
||||
# See docs/upgrade-to-v2-jwt-removal.md for the closure rationale,
|
||||
# or internal/config/config.go::ValidAuthTypes for the allowed set.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Scoped patterns that indicate "jwt" being added back to an
|
||||
# allowed-set surface. Each catches a regression shape we've
|
||||
# actually seen in pre-G-1 code:
|
||||
# - Go map/slice literal: "jwt": true or "jwt",
|
||||
# - Go switch case: case "jwt"
|
||||
# - YAML enum: enum: [..., jwt, ...] or - jwt
|
||||
# - .env conditional: AUTH_TYPE.*"jwt"|=jwt$
|
||||
BAD=$(grep -rnEH \
|
||||
-e '"jwt"\s*:\s*true' \
|
||||
-e '"jwt"\s*,' \
|
||||
-e 'case\s+"jwt"' \
|
||||
-e 'enum:.*\bjwt\b' \
|
||||
-e '^\s*-\s*jwt\s*$' \
|
||||
-e 'AUTH_TYPE\s*=\s*jwt\s*$' \
|
||||
-e 'AUTH_TYPE\s*=\s*jwt\s*#' \
|
||||
-e 'auth\.type\s*=\s*jwt\s*$' \
|
||||
-e 'AuthType\("jwt"\)' \
|
||||
internal/config/ \
|
||||
internal/api/ \
|
||||
cmd/ \
|
||||
api/openapi.yaml \
|
||||
.env.example \
|
||||
deploy/.env.example \
|
||||
deploy/helm/certctl/values.yaml \
|
||||
deploy/helm/certctl/templates/ \
|
||||
2>/dev/null \
|
||||
| grep -v '_test.go' \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
|
||||
| grep -v 'is no longer accepted' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "G-1 regression: \"jwt\" reappeared in an allowed-set surface:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed surface for 'jwt' literals: comment lines, the"
|
||||
echo "dedicated rejection branch in internal/config/config.go,"
|
||||
echo "and connector packages (Google OAuth2, step-ca)."
|
||||
echo "See docs/upgrade-to-v2-jwt-removal.md and"
|
||||
echo "internal/config/config.go::ValidAuthTypes()."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Forbidden api_key_hash JSON-shape regression guard (G-2)
|
||||
# G-2 closed cat-s5-apikey_leak by tagging Agent.APIKeyHash
|
||||
# `json:"-"` and adding a defense-in-depth Agent.MarshalJSON that
|
||||
# zeroes the field on the marshal-time copy. This step grep-fails
|
||||
# the build if `api_key_hash` reappears in any of the *additive*
|
||||
# JSON-emitting surfaces: a Go struct json tag in internal/domain/,
|
||||
# an OpenAPI Agent schema property, a TypeScript field declaration
|
||||
# in web/src/, or an enum-list / discriminator in handler
|
||||
# production code.
|
||||
#
|
||||
# Repository, migration, seed, service, integration-test, and
|
||||
# unit-test files are exempt — those are server-internal use
|
||||
# sites (the DB column stays, the in-memory struct field stays,
|
||||
# the auth-lookup path stays). Comment lines are exempt so the
|
||||
# G-2 closure rationale can stay in the source.
|
||||
#
|
||||
# See coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
# cat-s5-apikey_leak for the closure rationale, or
|
||||
# internal/domain/connector.go::Agent::MarshalJSON for the
|
||||
# redaction enforcement.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Scoped patterns that indicate api_key_hash being added back
|
||||
# to a JSON-emitting surface. Each catches a regression shape
|
||||
# that pre-G-2 actually shipped or that a future refactor
|
||||
# could plausibly introduce:
|
||||
# - Go struct tag: `json:"api_key_hash"`
|
||||
# - Frontend interface: api_key_hash[?]: string
|
||||
# - OpenAPI schema property: api_key_hash: (column-aligned)
|
||||
# - YAML enum / array: - api_key_hash
|
||||
BAD=$(grep -rnEH \
|
||||
-e 'json:"api_key_hash[",]' \
|
||||
-e '^\s*api_key_hash\??\s*:' \
|
||||
-e '^\s*-\s*api_key_hash\s*$' \
|
||||
internal/domain/ \
|
||||
internal/api/ \
|
||||
cmd/ \
|
||||
api/openapi.yaml \
|
||||
web/src/ \
|
||||
2>/dev/null \
|
||||
| grep -v '_test.go' \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*(//|#)' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "G-2 regression: api_key_hash reappeared in a JSON-emitting surface:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed surface for api_key_hash literals: comment lines,"
|
||||
echo "the database column (migrations/), the in-memory struct"
|
||||
echo "field tagged \`json:\"-\"\`, and the repository / service"
|
||||
echo "use sites. See internal/domain/connector.go::Agent and"
|
||||
echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
|
||||
echo "cat-s5-apikey_leak for the closure rationale."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Forbidden plaintext HEALTHCHECK regression guard (U-2)
|
||||
# U-2 closed cat-u-healthcheck_protocol_mismatch by switching the
|
||||
# published image's HEALTHCHECK from `curl -f http://localhost:
|
||||
# 8443/health` (always failed against the HTTPS-only listener) to
|
||||
# `curl -fsk https://localhost:8443/health`. This step grep-fails
|
||||
# the build if any Dockerfile in the repo carries the pre-U-2
|
||||
# plaintext shape — either explicitly (`http://localhost:8443/
|
||||
# health` in a HEALTHCHECK) or via the looser pattern of any
|
||||
# HEALTHCHECK that targets `http://` against the certctl server
|
||||
# port.
|
||||
#
|
||||
# Comment lines and the docs/upgrade-to-tls.md:182 expected-to-
|
||||
# fail invariant ("plaintext is gone, expect Connection refused")
|
||||
# are intentionally exempt — we DO want the upgrade-doc string
|
||||
# `http://localhost:8443/health` to remain there, since it
|
||||
# documents what operators should test for to confirm plaintext
|
||||
# is dead. The guardrail is scoped to Dockerfile* only, so docs
|
||||
# are out of its reach.
|
||||
#
|
||||
# See coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
# cat-u-healthcheck_protocol_mismatch for the closure rationale,
|
||||
# or deploy/test/healthcheck_test.go for the binary-image
|
||||
# contract the runtime test pins.
|
||||
run: |
|
||||
set -e
|
||||
|
||||
# Patterns that catch the actual regression shapes:
|
||||
# - HEALTHCHECK directive carrying any http:// (even if the
|
||||
# port differs, no plaintext probe should ship).
|
||||
# - The exact pre-U-2 string for grep-friendliness.
|
||||
BAD=$(grep -rnEH \
|
||||
-e 'HEALTHCHECK.*http://' \
|
||||
-e 'curl[^|&;]*-f[^|&;]*http://localhost:8443/health' \
|
||||
Dockerfile Dockerfile.agent Dockerfile.* 2>/dev/null \
|
||||
| grep -vE '^\s*[^:]+:[0-9]+:\s*#' \
|
||||
|| true)
|
||||
if [ -n "$BAD" ]; then
|
||||
echo "U-2 regression: plaintext HEALTHCHECK reappeared in a Dockerfile:"
|
||||
echo "$BAD"
|
||||
echo ""
|
||||
echo "Allowed: HTTPS HEALTHCHECK with -k (acceptable for"
|
||||
echo "localhost-to-localhost), or non-HTTP probe shapes"
|
||||
echo "(pgrep, /proc check). See Dockerfile / Dockerfile.agent"
|
||||
echo "for the post-U-2 reference shape and"
|
||||
echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md"
|
||||
echo "cat-u-healthcheck_protocol_mismatch for rationale."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Race Detection
|
||||
run: go test -race ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/scheduler/... ./internal/connector/... ./internal/crypto/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... -count=1 -timeout 300s
|
||||
|
||||
|
||||
@@ -2,6 +2,63 @@
|
||||
|
||||
All notable changes to certctl are documented in this file. Dates use ISO 8601. Versions follow [Semantic Versioning](https://semver.org/).
|
||||
|
||||
## [unreleased] — 2026-04-24
|
||||
|
||||
### G-1: JWT silent auth downgrade — closed end-to-end
|
||||
|
||||
> Pre-G-1 the config validator accepted `CERTCTL_AUTH_TYPE=jwt` and the startup log faithfully echoed `"authentication enabled" "type"="jwt"`. Reasonable people read that and concluded JWT was on. It wasn't. The auth-middleware wiring at `cmd/server/main.go` unconditionally routed every request through the api-key bearer middleware regardless of `cfg.Auth.Type`. So `CERTCTL_AUTH_TYPE=jwt` quietly compared incoming `Authorization: Bearer <something>` against whatever string the operator put in `CERTCTL_AUTH_SECRET` — real JWT clients got 401, and operators who treated `CERTCTL_AUTH_SECRET` as a *signing* secret (because they thought they were configuring JWT) had effectively handed an attacker an api-key. A security finding masquerading as a config option. We chose to remove the option rather than ship JWT middleware — the audit-recommended structural fix that closes the hazard. Operators who actually need JWT/OIDC front certctl with an authenticating gateway (oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia) and run the upstream certctl with `CERTCTL_AUTH_TYPE=none`. The same pattern works on docker-compose and Helm.
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
- **`CERTCTL_AUTH_TYPE=jwt` is no longer accepted.** Pre-G-1 the value was silently downgraded to api-key middleware. Post-G-1 the server fails at startup with a dedicated diagnostic naming the authenticating-gateway pattern. Operators with this in their env block must either switch to `api-key` (if they were de facto using api-key auth all along — same Bearer token continues to work) or switch to `none` and front certctl with an oauth2-proxy / Envoy / Traefik / Pomerium gateway. See [`docs/upgrade-to-v2-jwt-removal.md`](docs/upgrade-to-v2-jwt-removal.md).
|
||||
- **Helm chart `server.auth.type=jwt` now fails at `helm install` / `helm upgrade` template time.** New `certctl.validateAuthType` template helper runs on every template that depends on `.Values.server.auth.type` (`server-deployment.yaml`, `server-configmap.yaml`, `server-secret.yaml`) and fails the render with a pointer at the gateway-fronting pattern.
|
||||
- **OpenAPI spec `auth_type` enum no longer includes `jwt`.** API consumers checking `/api/v1/auth/info` against the spec will see a smaller enum.
|
||||
|
||||
### Removed
|
||||
|
||||
- Documented references to JWT in the certctl auth surface (config docblocks, middleware/health-handler comments, `.env.example`, `docs/architecture.md` middleware-stack bullet). Connector-level JWT references (Google OAuth2 service-account JWT in `internal/connector/discovery/gcpsm/`, `internal/connector/issuer/googlecas/`; step-ca's provisioner one-time-token JWT in `internal/connector/issuer/stepca/`) are unrelated and untouched — those are external-protocol uses, not certctl's own auth shape.
|
||||
|
||||
### Added
|
||||
|
||||
- **`config.AuthType` typed alias** with `AuthTypeAPIKey` / `AuthTypeNone` exported constants. Single source of truth for the allowed set across the validator, the runtime defense-in-depth switch in `main.go`, and the helm chart's `validateAuthType` helper.
|
||||
- **`config.ValidAuthTypes()`** helper returning the complete allowed set; pinned by a property test (`TestValidAuthTypesDoesNotContainJWT`) that fails the build if `"jwt"` is ever re-added to the slice.
|
||||
- **Defense-in-depth runtime guard** in `cmd/server/main.go` immediately after `config.Load()` — a `switch config.AuthType(cfg.Auth.Type)` that exits 1 if the validator was bypassed (test harness, alt config loader, env-var rebinding).
|
||||
- **`certctl.validateAuthType` Helm template helper** mirroring the existing `certctl.tls.required` pattern. Fails template render on any `server.auth.type` outside `{api-key, none}`.
|
||||
- **`docs/architecture.md` "Authenticating-gateway pattern (JWT, OIDC, mTLS)"** section explaining the design rationale for the narrow in-process auth surface and listing oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia / Caddy `forward_auth` / Apache `mod_auth_openidc` / nginx `auth_request` as the standard fronting options.
|
||||
- **`docs/upgrade-to-v2-jwt-removal.md`** migration guide. Same shape as `docs/upgrade-to-tls.md`. Walks through the dedicated startup error, both recovery paths (`api-key` vs gateway-fronting), a complete docker-compose oauth2-proxy walkthrough, Traefik ForwardAuth and Envoy `ext_authz` patterns, and rollback posture.
|
||||
- **`deploy/helm/certctl/README.md`** "JWT / OIDC via authenticating gateway" section with a Kubernetes-flavored oauth2-proxy + certctl walkthrough.
|
||||
- **CI regression guardrail** in `.github/workflows/ci.yml` (`Forbidden auth-type literal regression guard (G-1)`) — grep-fails the build if `"jwt"` appears as an auth-type literal in production code or spec. Connector packages exempt (legitimate external-protocol uses).
|
||||
- **Negative test coverage** in `internal/config/config_test.go`: `TestValidate_JWTAuth_RejectedDedicated` (two table rows pinning that the dedicated G-1 error fires regardless of whether `Secret` is set), `TestValidAuthTypesDoesNotContainJWT` (property-level guard), `TestValidAuthTypesIsExactly_APIKey_None` (allowed-set contract), `TestValidate_GenericInvalidAuthType` (pins that other invalid values still surface the generic invalid-auth-type error, so the dedicated G-1 path doesn't accidentally swallow non-jwt typos).
|
||||
|
||||
### Changed
|
||||
|
||||
- `internal/api/middleware/middleware.go::AuthConfig.Type` field comment now references the typed `config.AuthType` constants instead of an inline string enumeration.
|
||||
- `internal/api/handler/health.go::HealthHandler.AuthType` field comment same treatment.
|
||||
- `internal/api/handler/health_test.go` — the prior `TestAuthInfo_ReturnsAuthType_JWT` (which asserted the handler echoed `"jwt"`, baking the silent-downgrade lie into the regression suite) is removed; the pre-existing `TestAuthInfo_ReturnsAuthType_APIKey` continues to cover the api-key happy path.
|
||||
- Auth-disabled startup log in `main.go` now points operators at the authenticating-gateway pattern explicitly.
|
||||
|
||||
### U-2: Dockerfile HEALTHCHECK protocol mismatch — closed end-to-end
|
||||
|
||||
> Pre-U-2 the published `ghcr.io/shankar0123/certctl-server` image shipped with `HEALTHCHECK CMD curl -f http://localhost:8443/health`. The server has been HTTPS-only since the v2.2 HTTPS-Everywhere milestone (`cmd/server/main.go::ListenAndServeTLS`, no plaintext fallback, TLS 1.3 pinned), so the probe failed every interval and Docker marked the container `unhealthy` indefinitely. Operators inside docker-compose / Helm / the example stacks were unaffected — compose overrides the HEALTHCHECK with `--cacert + https://`, Helm uses explicit `httpGet` probes that ignore Docker's HEALTHCHECK, and every example compose file overrides with `curl -sfk https://localhost:8443/health`. But anyone running bare `docker run` / Docker Swarm / Nomad / ECS — exactly the "I just pulled the published image" path — saw permanent `unhealthy` status and (depending on orchestrator policy) a restart-loop. Recon for U-2 also surfaced two adjacent bugs from the same v2.2 milestone gap: the Helm chart's `readinessProbe.httpGet.path` pointed at `/readyz`, a route the server doesn't register (only `/health` and `/ready` are wired and bypass the auth middleware), so K8s readiness probes were getting 404/auth-rejection and pods stayed `NotReady`; and the agent image had no HEALTHCHECK at all (the compose override called `pgrep -f certctl-agent` against an image that didn't ship `procps` — latent always-fail). All three are closed in this commit.
|
||||
|
||||
### Fixed
|
||||
|
||||
- **`Dockerfile` HEALTHCHECK now speaks HTTPS.** Bare `docker run` / Swarm / Nomad / ECS users no longer see `unhealthy` forever. The probe uses `curl -fsk https://localhost:8443/health` — `-k` (insecure) is acceptable because the probe is localhost-to-localhost: the same process serving the cert is being probed; the probe never traverses a network. Compose / Helm / examples already perform full cert-chain validation and are unaffected.
|
||||
- **Helm `server.readinessProbe.httpGet.path` corrected from `/readyz` to `/ready`.** The `/readyz` path was never registered as a no-auth route (see `internal/api/router/router.go:81` and `cmd/server/main.go:920`), so K8s readiness probes received 401 (api-key auth rejection) or 404 (when auth was disabled). Pods previously failed to report Ready under most realistic Helm deployments. Liveness probe path (`/health`) was already correct and is unchanged.
|
||||
- **`docs/connectors.md` curl examples** (15 sites) updated from `http://localhost:8443/...` to `https://localhost:8443/...` with a one-time `--cacert "$CA"` extraction note matching the existing pattern in `docs/quickstart.md`. Pre-U-2 these examples silently failed against the HTTPS listener.
|
||||
|
||||
### Added
|
||||
|
||||
- **`Dockerfile.agent` HEALTHCHECK** — `pgrep -f certctl-agent` process-presence check (the agent has no HTTP listener; presence is the right primitive). Bare-`docker run` agents now report health-status the same way compose-managed ones do. Also adds `procps` to the runtime image so `pgrep` is actually available — pre-U-2 the docker-compose override at `deploy/docker-compose.yml:173` called `pgrep -f certctl-agent` against an image that lacked it (latent always-fail; container was reported unhealthy in compose too, just rarely noticed because nothing acted on the signal).
|
||||
- **`deploy/test/healthcheck_test.go`** (`//go:build integration`) — image-level integration tests. `TestPublishedServerImage_HealthcheckSpecUsesHTTPS` builds the server image, inspects `Config.Healthcheck.Test` via `docker inspect`, and asserts the array contains `https://localhost:8443/health` and `-k`, and does NOT contain `http://localhost:8443/health` (negative regression contract). `TestPublishedAgentImage_HealthcheckSpecExists` builds the agent image and asserts the HEALTHCHECK uses `pgrep` against `certctl-agent`. Both tests `t.Skip` cleanly when docker isn't available (sandbox / CI without docker-in-docker). A third runtime test (`TestPublishedServerImage_HealthcheckTransitionsToHealthy`) is a `t.Skip` placeholder until the harness wires a sidecar postgres for image-level smoke — documented honestly so the next refactor adopts it instead of rediscovering the gap.
|
||||
- **CI regression guardrail** in `.github/workflows/ci.yml` (`Forbidden plaintext HEALTHCHECK regression guard (U-2)`) — grep-fails the build if any `Dockerfile*` carries `HEALTHCHECK.*http://` or `curl -f http://localhost:8443/health`. Comments exempt; the `docs/upgrade-to-tls.md:182` post-cutover invariant string (which deliberately documents the expected-failure shape) is out of the guardrail's scope because the guardrail only scans Dockerfiles.
|
||||
|
||||
### Changed
|
||||
|
||||
- `Dockerfile` final-stage HEALTHCHECK lines now carry a long-form docblock explaining the `-k` design choice, the published-image vs compose vs Helm vs examples coverage matrix, and cross-references to the audit closure + the integration test.
|
||||
- `Dockerfile.agent` runtime stage adds `procps` to the apk install so the new HEALTHCHECK and the existing compose override both have a working `pgrep`.
|
||||
- `deploy/helm/certctl/values.yaml` server probes block now carries an explanatory comment naming the registered probe routes (`/health`, `/ready`) and the U-2 closure rationale for the `/readyz` → `/ready` correction.
|
||||
|
||||
## [2.2.0] — 2026-04-19
|
||||
|
||||
### HTTPS Everywhere — The Irony
|
||||
|
||||
+28
-1
@@ -76,7 +76,34 @@ USER certctl
|
||||
|
||||
EXPOSE 8443
|
||||
|
||||
# Image-level HEALTHCHECK for bare `docker run` / Docker Swarm / Nomad / ECS.
|
||||
#
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch): pre-U-2 this probe used
|
||||
# `curl -f http://localhost:8443/health`, which always failed against the
|
||||
# HTTPS-only listener (HTTPS-Everywhere milestone, v2.2 / tag v2.0.47 —
|
||||
# `cmd/server/main.go::ListenAndServeTLS`, no plaintext fallback, TLS 1.3
|
||||
# pinned). Operators outside docker-compose / Helm saw permanent
|
||||
# `unhealthy` status and a restart-loop the first time they pulled the
|
||||
# image. The compose stack overrides this HEALTHCHECK with `--cacert` to
|
||||
# the bootstrap CA bundle (deploy/docker-compose.yml:126); the Helm chart
|
||||
# uses explicit `httpGet` probes with `scheme: HTTPS` and ignores Docker's
|
||||
# HEALTHCHECK; every example compose file in `examples/*/docker-compose.yml`
|
||||
# overrides with `curl -sfk https://localhost:8443/health`. This image-
|
||||
# level probe is for the bare-`docker run` consumer ONLY.
|
||||
#
|
||||
# `-k` (insecure) is acceptable here because the probe is localhost-to-
|
||||
# localhost: the same process serving the cert is being probed; the probe
|
||||
# never traverses a network. Pinning a `--cacert` is not viable for the
|
||||
# published image because the bootstrap cert is per-deploy (generated into
|
||||
# the `certs` named volume on first up; operator-supplied via Helm's
|
||||
# `existingSecret` or cert-manager). Compose / Helm / examples already
|
||||
# perform full cert-chain validation and are unaffected.
|
||||
#
|
||||
# CI grep guardrail at .github/workflows/ci.yml ("Forbidden plaintext
|
||||
# HEALTHCHECK regression guard (U-2)") blocks reintroduction of the
|
||||
# `http://` shape. Image-level integration test in
|
||||
# deploy/test/healthcheck_test.go pins the contract end-to-end.
|
||||
HEALTHCHECK --interval=10s --timeout=5s --start-period=5s --retries=5 \
|
||||
CMD curl -f http://localhost:8443/health || exit 1
|
||||
CMD curl -fsk https://localhost:8443/health || exit 1
|
||||
|
||||
ENTRYPOINT ["/app/server"]
|
||||
|
||||
+23
-1
@@ -36,7 +36,14 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build \
|
||||
# Stage 2: Runtime
|
||||
FROM alpine:3.19
|
||||
|
||||
RUN apk add --no-cache ca-certificates curl
|
||||
# U-2: `procps` ships pgrep, which the HEALTHCHECK below uses to verify the
|
||||
# agent process is alive. Pre-U-2 the deploy/docker-compose.yml agent
|
||||
# HEALTHCHECK called `pgrep -f certctl-agent` against this image but
|
||||
# pgrep wasn't installed — the compose probe was a latent always-fail.
|
||||
# Adding procps here fixes both the new image-level HEALTHCHECK and the
|
||||
# pre-existing compose override. Adds ~250KB to the image; acceptable for
|
||||
# observability parity with the server image.
|
||||
RUN apk add --no-cache ca-certificates curl procps
|
||||
|
||||
RUN addgroup -g 1000 certctl && \
|
||||
adduser -D -u 1000 -G certctl certctl
|
||||
@@ -51,4 +58,19 @@ RUN mkdir -p /var/lib/certctl/keys && \
|
||||
|
||||
USER certctl
|
||||
|
||||
# Image-level HEALTHCHECK for bare `docker run` / Docker Swarm / Nomad / ECS.
|
||||
#
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch — adjacent fix): the agent
|
||||
# has no HTTP listener (it polls the server via outbound HTTPS), so a
|
||||
# process-presence check is the correct primitive. Pre-U-2 the agent image
|
||||
# shipped with no HEALTHCHECK at all, so bare-`docker run` operators got
|
||||
# zero health signal and orchestrators that key off Docker's HEALTHCHECK
|
||||
# (Swarm, Nomad, ECS) saw the container reported as `none`. The compose
|
||||
# override at deploy/docker-compose.yml:173 used the same `pgrep -f
|
||||
# certctl-agent` shape; we mirror it here so the published image has
|
||||
# parity with the compose stack and the override on docker-compose.yml
|
||||
# becomes redundant-but-correct rather than load-bearing.
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
||||
CMD pgrep -f certctl-agent > /dev/null || exit 1
|
||||
|
||||
ENTRYPOINT ["/app/agent"]
|
||||
|
||||
+20
-3
@@ -132,7 +132,14 @@ paths:
|
||||
properties:
|
||||
auth_type:
|
||||
type: string
|
||||
enum: [api-key, jwt, none]
|
||||
# G-1 (P1): "jwt" removed from this enum after the silent
|
||||
# auth downgrade was identified — no JWT middleware ships
|
||||
# with certctl. Operators who need JWT/OIDC front certctl
|
||||
# with an authenticating gateway (oauth2-proxy / Envoy /
|
||||
# Traefik / Pomerium) and set CERTCTL_AUTH_TYPE=none
|
||||
# upstream. See docs/architecture.md "Authenticating-
|
||||
# gateway pattern".
|
||||
enum: [api-key, none]
|
||||
required:
|
||||
type: boolean
|
||||
|
||||
@@ -3680,8 +3687,18 @@ components:
|
||||
registered_at:
|
||||
type: string
|
||||
format: date-time
|
||||
api_key_hash:
|
||||
type: string
|
||||
# G-2 (P1): the `api_key_hash` field was REMOVED from this
|
||||
# schema after cat-s5-apikey_leak audit closure. The DB column
|
||||
# still exists (migrations/000001_initial_schema.up.sql) and
|
||||
# the server still populates the in-memory struct for the
|
||||
# auth-lookup path (repository.AgentRepository::GetByAPIKey),
|
||||
# but the JSON wire shape no longer carries it — see
|
||||
# internal/domain/connector.go::Agent::APIKeyHash + MarshalJSON
|
||||
# for the redaction enforcement and docs/architecture.md ER
|
||||
# diagram for the database-vs-API distinction. Do NOT re-add
|
||||
# the field here without first removing the JSON-shape redaction
|
||||
# in the domain package; the CI guardrail at
|
||||
# .github/workflows/ci.yml will block re-introduction either way.
|
||||
os:
|
||||
type: string
|
||||
architecture:
|
||||
|
||||
+23
-3
@@ -39,6 +39,26 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Defense-in-depth runtime guard for the auth-type discriminator.
|
||||
//
|
||||
// G-1 (P1): config.Load() already runs Validate() which rejects "jwt"
|
||||
// and any value outside config.ValidAuthTypes() with a dedicated
|
||||
// diagnostic. This switch is belt-and-braces — if a future refactor
|
||||
// bypasses the validator (test harness, alt config loader, env-var
|
||||
// rebinding after Load) the server must not silently boot with an
|
||||
// unsupported auth shape. The error path uses fmt.Fprintf because
|
||||
// the slog logger is constructed from cfg below this point; we want
|
||||
// the failure to be visible regardless of log-level configuration.
|
||||
switch config.AuthType(cfg.Auth.Type) {
|
||||
case config.AuthTypeAPIKey, config.AuthTypeNone:
|
||||
// ok — fall through
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"unsupported auth type at runtime: %q (valid: %v) — config validation should have caught this; refusing to start\n",
|
||||
cfg.Auth.Type, config.ValidAuthTypes())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Set up structured logging
|
||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: cfg.GetLogLevel(),
|
||||
@@ -615,7 +635,7 @@ func main() {
|
||||
// compatibility CERTCTL_AUTH_SECRET is synthesized into legacy-key-N
|
||||
// entries with Admin=false.
|
||||
var namedKeys []middleware.NamedAPIKey
|
||||
if cfg.Auth.Type != "none" {
|
||||
if config.AuthType(cfg.Auth.Type) != config.AuthTypeNone {
|
||||
// Translate typed config.NamedAPIKey -> middleware.NamedAPIKey. The
|
||||
// two structs are field-compatible but live in different packages to
|
||||
// preserve the config→middleware dependency direction.
|
||||
@@ -704,8 +724,8 @@ func main() {
|
||||
logger.Info("rate limiting enabled", "rps", cfg.RateLimit.RPS, "burst", cfg.RateLimit.BurstSize)
|
||||
}
|
||||
|
||||
if cfg.Auth.Type == "none" {
|
||||
logger.Warn("authentication disabled (CERTCTL_AUTH_TYPE=none) — not suitable for production")
|
||||
if config.AuthType(cfg.Auth.Type) == config.AuthTypeNone {
|
||||
logger.Warn("authentication disabled (CERTCTL_AUTH_TYPE=none) — not suitable for production except behind an authenticating gateway (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium)")
|
||||
} else {
|
||||
logger.Info("authentication enabled", "type", cfg.Auth.Type)
|
||||
}
|
||||
|
||||
@@ -122,6 +122,8 @@ The `volumes` section mounts 10 migration files into PostgreSQL's init directory
|
||||
|
||||
**Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe.
|
||||
|
||||
**Stateful volume — first-boot password binding (U-1).** The same "first boot only" semantics that govern migration scripts also govern `POSTGRES_PASSWORD`. The official `postgres` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that pass is the only time `POSTGRES_PASSWORD` is written into `pg_authid`. On every subsequent boot, the postgres container ignores the env var and authenticates against whatever password was baked into the data directory on the original `up`. Editing `POSTGRES_PASSWORD` in `.env` after a successful first boot therefore only updates the **certctl-server** container's `CERTCTL_DATABASE_URL` — postgres still expects the previous password, and the server fails to ping with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). The certctl-server container surfaces this case explicitly: when SQLSTATE 28P01 fires at startup, the wrap text in `internal/repository/postgres/db.go::wrapPingError` points operators at the two remediation paths — destructive volume teardown via `docker compose -f deploy/docker-compose.yml down -v && up -d --build`, or non-destructive in-place rotation via `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` followed by a server restart with the matching `POSTGRES_PASSWORD`. Use the destructive path on the demo / first-time setup; use the non-destructive path on any environment that holds data you want to keep.
|
||||
|
||||
#### certctl Server
|
||||
|
||||
```yaml
|
||||
|
||||
@@ -246,8 +246,8 @@ helm install certctl certctl/ \
|
||||
|--------|---------|-------------|
|
||||
| `server.replicas` | 1 | Number of server replicas |
|
||||
| `server.port` | 8443 | Server port |
|
||||
| `server.auth.type` | api-key | Authentication type |
|
||||
| `server.auth.apiKey` | "" | API key (REQUIRED) |
|
||||
| `server.auth.type` | api-key | Authentication type — `api-key` or `none` (G-1: `jwt` removed; for JWT/OIDC use a fronting authenticating gateway, see `docs/architecture.md` and `docs/upgrade-to-v2-jwt-removal.md`) |
|
||||
| `server.auth.apiKey` | "" | API key (REQUIRED when `auth.type=api-key`) |
|
||||
| `server.logging.level` | info | Log level |
|
||||
| `server.logging.format` | json | Log format |
|
||||
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
# certctl Helm Chart
|
||||
|
||||
Production-ready Helm chart for deploying [certctl](https://github.com/shankar0123/certctl) on Kubernetes. Wires up the certctl server (Deployment), PostgreSQL (StatefulSet with PVC), and the agent (DaemonSet — one per node) on a private cluster, with health probes, security contexts, and optional Ingress.
|
||||
|
||||
## Quick install
|
||||
|
||||
```bash
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--create-namespace --namespace certctl \
|
||||
--set server.auth.apiKey="$(openssl rand -base64 32)" \
|
||||
--set postgresql.auth.password="$(openssl rand -base64 24)"
|
||||
```
|
||||
|
||||
This brings up:
|
||||
|
||||
- `<release>-server` Deployment (HTTPS-only on port 8443; TLS 1.3)
|
||||
- `<release>-postgres` StatefulSet (PostgreSQL 16-alpine, 1 replica, 10Gi PVC by default)
|
||||
- `<release>-agent` DaemonSet (polls server, generates ECDSA P-256 keys locally)
|
||||
- Service objects, optional Ingress, and ServiceAccount with RBAC
|
||||
|
||||
See [`values.yaml`](values.yaml) for the full configuration surface — issuer settings, target connectors, scheduler intervals, notifier credentials, and resource requests/limits all live there.
|
||||
|
||||
## Operational notes
|
||||
|
||||
### Postgres password rotation — read this before changing `postgresql.auth.password`
|
||||
|
||||
**The trap.** `postgresql.auth.password` is bound to `pg_authid` exactly once — when the StatefulSet's PVC is provisioned and `initdb` runs. The official `postgres:16-alpine` image only runs `initdb` when `/var/lib/postgresql/data` is empty, so on every subsequent rollout the `POSTGRES_PASSWORD` env var is read into the container but **ignored** by postgres itself. The certctl-server container also picks up the new value (via the database URL helper template), so the two halves diverge: server presents the new password, postgres still expects the old one.
|
||||
|
||||
**Symptom.** The certctl-server pod's startup log shows:
|
||||
|
||||
```
|
||||
failed to ping database: postgres rejected the configured credentials
|
||||
(SQLSTATE 28P01 — invalid_password). If you recently rotated POSTGRES_PASSWORD ...
|
||||
```
|
||||
|
||||
That diagnostic is emitted by `internal/repository/postgres/db.go::wrapPingError` — it points operators at the two remediation paths below.
|
||||
|
||||
**Remediation, non-destructive (preferred for any environment with real data):**
|
||||
|
||||
```bash
|
||||
# 1. Rotate the password in postgres directly
|
||||
kubectl -n certctl exec -it <release>-postgres-0 -- \
|
||||
psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new-password>';"
|
||||
|
||||
# 2. Update the secret / Helm values to the same value
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set postgresql.auth.password='<new-password>'
|
||||
|
||||
# 3. Bounce the certctl-server pod so it re-reads the secret
|
||||
kubectl -n certctl rollout restart deployment/<release>-server
|
||||
```
|
||||
|
||||
**Remediation, destructive (DESTROYS ALL CERTCTL DATA — only acceptable on dev/demo clusters):**
|
||||
|
||||
```bash
|
||||
helm uninstall <release> -n certctl
|
||||
kubectl -n certctl delete pvc -l \
|
||||
app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres
|
||||
helm install <release> deploy/helm/certctl/ \
|
||||
--namespace certctl \
|
||||
--set postgresql.auth.password='<new-password>'
|
||||
```
|
||||
|
||||
The PVC re-creates empty, `initdb` runs on first boot of the new postgres pod, and `pg_authid` is seeded with the new password.
|
||||
|
||||
**Why we don't fix this in the chart.** The env-vs-`pg_authid` divergence is intrinsic to how the upstream `postgres` image bootstraps — `initdb` is run-once-per-empty-data-dir, and there is no upstream-supported way to make subsequent boots re-seed `pg_authid` from `POSTGRES_PASSWORD`. The ergonomic answer is the runtime diagnostic plus this operational note.
|
||||
|
||||
**Cross-references.** Same root cause is documented for the docker-compose path in [`docs/quickstart.md`](../../../docs/quickstart.md) (Warning callout after the `cp .env.example .env` block) and in [`deploy/ENVIRONMENTS.md`](../../ENVIRONMENTS.md) (Stateful volume — first-boot password binding section). The runtime diagnostic itself lives in `internal/repository/postgres/db.go::wrapPingError` with regression coverage in `internal/repository/postgres/db_test.go`.
|
||||
|
||||
### Server API key rotation
|
||||
|
||||
Unlike the postgres password, `server.auth.apiKey` accepts a comma-separated list, so zero-downtime rotation is straightforward:
|
||||
|
||||
```bash
|
||||
# 1. Add the new key alongside the old
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.apiKey='new-key,old-key'
|
||||
|
||||
# 2. Roll your agents / clients over to the new key
|
||||
|
||||
# 3. Remove the old key
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.apiKey='new-key'
|
||||
```
|
||||
|
||||
### JWT / OIDC via authenticating gateway
|
||||
|
||||
certctl's in-process auth surface is intentionally narrow: `server.auth.type=api-key` for production deployments and `server.auth.type=none` for development. There is no in-process JWT, OIDC, mTLS, or SAML middleware. (`server.auth.type=jwt` was accepted pre-G-1 but silently routed every request through the api-key bearer middleware — silent auth downgrade. The chart now fails at `helm install`/`helm upgrade` template time via the `certctl.validateAuthType` helper if you set it. See [`../../../docs/upgrade-to-v2-jwt-removal.md`](../../../docs/upgrade-to-v2-jwt-removal.md) if you previously had this in your values.)
|
||||
|
||||
For deployments that need JWT/OIDC, the canonical Kubernetes-flavored shape is to put oauth2-proxy in front of the certctl Service, attach an authenticating Ingress middleware, and run certctl with `server.auth.type=none`:
|
||||
|
||||
```bash
|
||||
# 1. Install oauth2-proxy (or any OIDC-terminating sidecar) in the same namespace
|
||||
helm install oauth2-proxy oauth2-proxy/oauth2-proxy \
|
||||
--namespace certctl \
|
||||
--set config.clientID="$OIDC_CLIENT_ID" \
|
||||
--set config.clientSecret="$OIDC_CLIENT_SECRET" \
|
||||
--set config.cookieSecret="$(openssl rand -base64 32)" \
|
||||
--set config.configFile='|
|
||||
provider = "oidc"
|
||||
oidc_issuer_url = "https://your-issuer/"
|
||||
upstreams = ["http://<release>-server.certctl.svc.cluster.local:8443"]
|
||||
pass_authorization_header = true
|
||||
set_authorization_header = true
|
||||
email_domains = ["*"]
|
||||
'
|
||||
|
||||
# 2. Install certctl with type=none (gateway terminates auth)
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--namespace certctl \
|
||||
--set server.auth.type=none \
|
||||
--set postgresql.auth.password="$(openssl rand -base64 24)"
|
||||
|
||||
# 3. Attach an Ingress that routes through oauth2-proxy
|
||||
# (Traefik ForwardAuth, nginx auth_request, Envoy ext_authz, etc.)
|
||||
```
|
||||
|
||||
Same root pattern works with Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, or any service-mesh `ext_authz`. See [`../../../docs/architecture.md`](../../../docs/architecture.md) "Authenticating-gateway pattern" for the full design rationale and [`../../../docs/upgrade-to-v2-jwt-removal.md`](../../../docs/upgrade-to-v2-jwt-removal.md) for the migration walkthrough.
|
||||
|
||||
### TLS certificate sourcing
|
||||
|
||||
By default the chart provisions a self-signed cert via the same init-container pattern as the docker-compose deploy. For production, supply an operator-managed Secret (cert-manager, internal CA, etc.) — see [`docs/tls.md`](../../../docs/tls.md) for the full provisioning matrix and [`docs/upgrade-to-tls.md`](../../../docs/upgrade-to-tls.md) for upgrade-from-HTTP procedures.
|
||||
|
||||
## Disabling embedded postgres
|
||||
|
||||
If you have an existing PostgreSQL cluster, disable the embedded one and point at it directly:
|
||||
|
||||
```bash
|
||||
helm install certctl deploy/helm/certctl/ \
|
||||
--set postgresql.enabled=false \
|
||||
--set server.databaseUrl='postgres://certctl:<pw>@my-pg-host:5432/certctl?sslmode=require'
|
||||
```
|
||||
|
||||
The volume-trap section above does **not** apply to this configuration — your postgres operator (or cloud DB) handles password rotation, and you control `pg_authid` directly.
|
||||
|
||||
## Uninstall
|
||||
|
||||
```bash
|
||||
helm uninstall <release> -n certctl
|
||||
# Optional — also delete the postgres PVC (DESTROYS DATA):
|
||||
kubectl -n certctl delete pvc -l \
|
||||
app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres
|
||||
```
|
||||
|
||||
By default `helm uninstall` retains the StatefulSet's PVCs, so reinstalling with the same release name preserves the database. If you've changed `postgresql.auth.password` in your values between uninstall and reinstall, you'll hit the trap on the reinstall — apply the non-destructive remediation above, or also delete the PVC.
|
||||
@@ -169,3 +169,26 @@ per affected resource. No-op when configured correctly.
|
||||
{{- fail "\n\nserver.tls.certManager.enabled=true but server.tls.certManager.issuerRef.name is empty.\n\nSet:\n --set server.tls.certManager.issuerRef.name=<your-issuer-or-clusterissuer>\n\nSee docs/tls.md.\n" -}}
|
||||
{{- end -}}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Auth-type validation gate.
|
||||
|
||||
G-1 (P1): pre-G-1 the chart accepted server.auth.type=jwt and the
|
||||
certctl-server container silently routed every request through the
|
||||
api-key bearer middleware (no JWT impl ships with certctl). Post-G-1
|
||||
the chart fails at template-time with a pointer at the authenticating-
|
||||
gateway pattern. The valid set must stay in sync with
|
||||
internal/config.ValidAuthTypes() in the Go binary; if you add a value
|
||||
there you must add it here too (and update the property test in
|
||||
internal/config/config_test.go that pins both surfaces).
|
||||
|
||||
Any template that consumes .Values.server.auth.type should call
|
||||
`{{ include "certctl.validateAuthType" . }}` at the top so this guard
|
||||
runs once per affected resource. No-op when configured correctly.
|
||||
*/}}
|
||||
{{- define "certctl.validateAuthType" -}}
|
||||
{{- $valid := list "api-key" "none" -}}
|
||||
{{- if not (has .Values.server.auth.type $valid) -}}
|
||||
{{- fail (printf "\n\nserver.auth.type=%q is not supported (valid: %v).\n\nFor JWT/OIDC, run an authenticating gateway in front of certctl\n(oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium) and\nset server.auth.type=none here so the gateway terminates federated\nidentity. See docs/architecture.md \"Authenticating-gateway pattern\"\nand docs/upgrade-to-v2-jwt-removal.md for the migration walkthrough.\n\nG-1 audit closure: pre-G-1 the chart accepted type=jwt and the binary\nsilently downgraded to api-key middleware. The chart now fails at\ntemplate time so misconfigured deployments cannot ship.\n" .Values.server.auth.type $valid) -}}
|
||||
{{- end -}}
|
||||
{{- end }}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
{{- include "certctl.tls.required" . }}
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
{{- include "certctl.validateAuthType" . }}
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
|
||||
@@ -48,7 +48,14 @@ server:
|
||||
drop:
|
||||
- ALL
|
||||
|
||||
# Liveness and readiness probes (HTTPS-only as of v2.2)
|
||||
# Liveness and readiness probes (HTTPS-only as of v2.2).
|
||||
#
|
||||
# The two paths exposed for probes are `/health` and `/ready` —
|
||||
# registered in internal/api/router/router.go:76-85 and bypassing the
|
||||
# auth middleware via the no-auth list at cmd/server/main.go:920.
|
||||
# Both serve the same JSON shape today (`{"status":"healthy"}` /
|
||||
# `{"status":"ready"}`) but exist as separate routes so liveness and
|
||||
# readiness can diverge in the future without renaming.
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
@@ -59,9 +66,18 @@ server:
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
|
||||
# U-2 (P1, cat-u-healthcheck_protocol_mismatch — adjacent fix): pre-U-2
|
||||
# the readiness probe pointed at `/readyz`, the conventional kube-flavor
|
||||
# name. The certctl server doesn't register `/readyz` (only `/health`
|
||||
# and `/ready`) — see cmd/server/main.go:920 and
|
||||
# internal/api/router/router.go:81. K8s readiness probes therefore
|
||||
# received a 404 (or, with auth enabled, a 401 from the api-key middleware
|
||||
# because `/readyz` was NOT in the no-auth bypass set), pods stayed
|
||||
# `NotReady` indefinitely, and Helm rollouts stalled. Post-U-2 the path
|
||||
# matches a registered route.
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
path: /ready
|
||||
port: https
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 5
|
||||
@@ -112,10 +128,23 @@ server:
|
||||
port: 8443
|
||||
annotations: {}
|
||||
|
||||
# Authentication configuration
|
||||
# Authentication configuration.
|
||||
# Valid types: "api-key" (production) or "none" (demo only — disables
|
||||
# authentication on the API and logs a loud Warn at server startup).
|
||||
# For JWT/OIDC, run an authenticating gateway in front of certctl
|
||||
# (oauth2-proxy / Envoy ext_authz / Traefik ForwardAuth / Pomerium)
|
||||
# and set type=none here so the gateway terminates federated identity.
|
||||
# See docs/architecture.md "Authenticating-gateway pattern".
|
||||
#
|
||||
# G-1 (P1): pre-G-1 the chart accepted server.auth.type=jwt and the
|
||||
# certctl-server container silently routed every request through the
|
||||
# api-key bearer middleware — silent auth downgrade. Post-G-1 the
|
||||
# chart's `certctl.validateAuthType` template helper rejects any value
|
||||
# outside {api-key, none} at template time. See
|
||||
# docs/upgrade-to-v2-jwt-removal.md if you previously set type=jwt.
|
||||
auth:
|
||||
type: api-key # Options: api-key, none (for demo only)
|
||||
apiKey: "" # REQUIRED in production - set via --set or values override
|
||||
type: api-key
|
||||
apiKey: "" # REQUIRED when type=api-key (set via --set or values override).
|
||||
|
||||
# Logging configuration
|
||||
logging:
|
||||
@@ -260,7 +289,30 @@ postgresql:
|
||||
auth:
|
||||
database: certctl
|
||||
username: certctl
|
||||
password: "" # REQUIRED - set via --set or values override
|
||||
# REQUIRED — set via `--set postgresql.auth.password=<value>` or values override.
|
||||
#
|
||||
# WARNING (U-1): rotating this value after first deploy does NOT change the
|
||||
# database password. The `postgres:16-alpine` image runs `initdb` only when
|
||||
# /var/lib/postgresql/data is empty, so POSTGRES_PASSWORD is written into
|
||||
# pg_authid exactly once — on the first boot of the StatefulSet's PVC.
|
||||
# Subsequent rollouts pick up the new env value in the postgres container
|
||||
# but the certctl-server container's CERTCTL_DATABASE_URL also picks up
|
||||
# the new value, while pg_authid still expects the old one — leading to
|
||||
# `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01).
|
||||
#
|
||||
# The certctl-server emits guidance via internal/repository/postgres/db.go::
|
||||
# wrapPingError when it sees SQLSTATE 28P01 at startup. To resolve in a
|
||||
# Helm deployment:
|
||||
# - Non-destructive (preferred for environments with data):
|
||||
# kubectl exec -it <release>-postgres-0 -- \
|
||||
# psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"
|
||||
# then update the secret/values to match and let the certctl-server
|
||||
# pod restart against the matching credential.
|
||||
# - Destructive (DESTROYS DATA — only acceptable on dev/demo PVCs):
|
||||
# helm uninstall <release> && \
|
||||
# kubectl delete pvc -l app.kubernetes.io/name=certctl,app.kubernetes.io/component=postgres && \
|
||||
# helm install <release> ... # PVC re-creates empty, initdb seeds new password
|
||||
password: ""
|
||||
|
||||
# Storage configuration
|
||||
storage:
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
//go:build integration
|
||||
|
||||
// Package integration_test — image-level HEALTHCHECK contract.
|
||||
//
|
||||
// U-2 (P1, cat-u-healthcheck_protocol_mismatch): pre-U-2 the published
|
||||
// server image's Dockerfile HEALTHCHECK called `curl -f http://localhost:
|
||||
// 8443/health` against an HTTPS-only listener (HTTPS-Everywhere milestone,
|
||||
// v2.2 / tag v2.0.47). Operators outside docker-compose / Helm saw the
|
||||
// container reported as `unhealthy` indefinitely. The compose stack
|
||||
// overrode this HEALTHCHECK with `--cacert + https://`; the Helm chart
|
||||
// uses explicit `httpGet` probes that ignore Docker's HEALTHCHECK; the 5
|
||||
// example compose files all override with `curl -sfk https://localhost:
|
||||
// 8443/health`. So the observable failure was scoped to bare `docker run`
|
||||
// / Docker Swarm / Nomad / ECS users — exactly the "I just pulled the
|
||||
// published image" path.
|
||||
//
|
||||
// This file's tests pin the contract at the binary-image level. The
|
||||
// matching CI grep guardrail in .github/workflows/ci.yml catches the
|
||||
// regression at the Dockerfile-source level; both layers are needed
|
||||
// because someone could replace the HEALTHCHECK line with a sibling
|
||||
// broken pattern that the grep doesn't catch (e.g., a TCP-only check
|
||||
// against the HTTPS port).
|
||||
//
|
||||
// Run alongside the rest of the integration suite:
|
||||
//
|
||||
// cd deploy/test && go test -tags integration -v -run Healthcheck
|
||||
//
|
||||
// The tests skip cleanly with t.Skip when docker is not available
|
||||
// (CI without docker-in-docker, sandbox environments, etc.) so they
|
||||
// don't block local development on machines without docker.
|
||||
package integration_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dockerAvailable returns true when `docker version` returns 0.
|
||||
// We cache it across tests in this file so the skip message prints once.
|
||||
func dockerAvailable(t *testing.T) bool {
|
||||
t.Helper()
|
||||
cmd := exec.Command("docker", "version", "--format", "{{.Server.Version}}")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
t.Logf("docker not available: %v\noutput: %s", err, string(out))
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// dockerCmd runs `docker <args...>` with a 60s budget, returning stdout
|
||||
// + stderr combined and the exit error if any. Used for short-lived
|
||||
// probes (inspect, build, run -d).
|
||||
func dockerCmd(t *testing.T, timeout time.Duration, args ...string) (string, error) {
|
||||
t.Helper()
|
||||
cmd := exec.Command("docker", args...)
|
||||
done := make(chan struct{})
|
||||
var out []byte
|
||||
var err error
|
||||
go func() {
|
||||
out, err = cmd.CombinedOutput()
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
return string(out), err
|
||||
case <-time.After(timeout):
|
||||
_ = cmd.Process.Kill()
|
||||
t.Fatalf("docker %v timed out after %v", args, timeout)
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedServerImage_HealthcheckSpecUsesHTTPS performs the Dockerfile-
|
||||
// source-level shipped-shape pin: the inspected image's Healthcheck.Test
|
||||
// array MUST contain "https://localhost:8443/health" (and MUST NOT
|
||||
// contain "http://localhost:8443/health"). This is the lightweight half
|
||||
// of the contract — it doesn't require running the container, only
|
||||
// building it. It catches the audit-flagged bug directly.
|
||||
func TestPublishedServerImage_HealthcheckSpecUsesHTTPS(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping image-level HEALTHCHECK test")
|
||||
}
|
||||
|
||||
const imgTag = "certctl-u2-healthcheck-spec-test"
|
||||
t.Cleanup(func() {
|
||||
_, _ = dockerCmd(t, 30*time.Second, "rmi", "-f", imgTag)
|
||||
})
|
||||
|
||||
// Build the server image. Use the repo root as context (this test
|
||||
// file lives at deploy/test/, the Dockerfile at the repo root).
|
||||
buildOut, err := dockerCmd(t, 5*time.Minute,
|
||||
"build", "-f", "../../Dockerfile", "-t", imgTag, "../..")
|
||||
if err != nil {
|
||||
t.Fatalf("docker build failed: %v\noutput:\n%s", err, buildOut)
|
||||
}
|
||||
|
||||
// Inspect the shipped HEALTHCHECK metadata.
|
||||
inspectOut, err := dockerCmd(t, 30*time.Second,
|
||||
"inspect", "--format", "{{json .Config.Healthcheck}}", imgTag)
|
||||
if err != nil {
|
||||
t.Fatalf("docker inspect failed: %v\noutput:\n%s", err, inspectOut)
|
||||
}
|
||||
|
||||
var hc struct {
|
||||
Test []string
|
||||
Interval int64
|
||||
Timeout int64
|
||||
}
|
||||
if err := json.Unmarshal([]byte(strings.TrimSpace(inspectOut)), &hc); err != nil {
|
||||
t.Fatalf("could not parse Healthcheck JSON %q: %v", inspectOut, err)
|
||||
}
|
||||
|
||||
joined := strings.Join(hc.Test, " ")
|
||||
|
||||
// Positive contract.
|
||||
if !strings.Contains(joined, "https://localhost:8443/health") {
|
||||
t.Errorf("Healthcheck.Test does not target https://localhost:8443/health\nfull: %v", hc.Test)
|
||||
}
|
||||
|
||||
// Negative contract — pre-U-2 regression shape MUST be absent.
|
||||
if strings.Contains(joined, "http://localhost:8443/health") {
|
||||
t.Errorf("Healthcheck.Test still contains the pre-U-2 plaintext shape: %v", hc.Test)
|
||||
}
|
||||
|
||||
// `-k` (or `--insecure`) must be present because the bootstrap cert
|
||||
// is per-deploy and the published image can't pin a CA bundle —
|
||||
// see the U-2 closure docblock on Dockerfile and the audit doc.
|
||||
if !strings.Contains(joined, "-k") && !strings.Contains(joined, "--insecure") {
|
||||
t.Errorf("Healthcheck.Test omits -k / --insecure flag (required for self-signed bootstrap probe): %v", hc.Test)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedAgentImage_HealthcheckSpecExists pins the U-2 adjacent
|
||||
// fix that added a HEALTHCHECK to the agent image. Pre-U-2 the agent
|
||||
// image had no HEALTHCHECK declaration, so bare-`docker run` agents got
|
||||
// `none` health status from Docker. Post-U-2 the agent uses pgrep to
|
||||
// verify the process is alive (mirroring the docker-compose pattern at
|
||||
// deploy/docker-compose.yml:173, which also became reliable post-U-2
|
||||
// because procps is now installed in the runtime image).
|
||||
func TestPublishedAgentImage_HealthcheckSpecExists(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping image-level HEALTHCHECK test")
|
||||
}
|
||||
|
||||
const imgTag = "certctl-u2-agent-healthcheck-spec-test"
|
||||
t.Cleanup(func() {
|
||||
_, _ = dockerCmd(t, 30*time.Second, "rmi", "-f", imgTag)
|
||||
})
|
||||
|
||||
buildOut, err := dockerCmd(t, 5*time.Minute,
|
||||
"build", "-f", "../../Dockerfile.agent", "-t", imgTag, "../..")
|
||||
if err != nil {
|
||||
t.Fatalf("docker build failed: %v\noutput:\n%s", err, buildOut)
|
||||
}
|
||||
|
||||
inspectOut, err := dockerCmd(t, 30*time.Second,
|
||||
"inspect", "--format", "{{json .Config.Healthcheck}}", imgTag)
|
||||
if err != nil {
|
||||
t.Fatalf("docker inspect failed: %v\noutput:\n%s", err, inspectOut)
|
||||
}
|
||||
|
||||
trimmed := strings.TrimSpace(inspectOut)
|
||||
if trimmed == "null" || trimmed == "" {
|
||||
t.Fatalf("agent image has no HEALTHCHECK (got %q) — U-2 adjacent fix regressed", inspectOut)
|
||||
}
|
||||
|
||||
var hc struct {
|
||||
Test []string
|
||||
}
|
||||
if err := json.Unmarshal([]byte(trimmed), &hc); err != nil {
|
||||
t.Fatalf("could not parse Healthcheck JSON %q: %v", inspectOut, err)
|
||||
}
|
||||
|
||||
joined := strings.Join(hc.Test, " ")
|
||||
if !strings.Contains(joined, "pgrep") {
|
||||
t.Errorf("agent Healthcheck.Test does not use pgrep (lost the process-presence shape): %v", hc.Test)
|
||||
}
|
||||
if !strings.Contains(joined, "certctl-agent") {
|
||||
t.Errorf("agent Healthcheck.Test does not target the certctl-agent process name: %v", hc.Test)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPublishedServerImage_HealthcheckTransitionsToHealthy is the
|
||||
// runtime-level contract: the built image, when started, must transition
|
||||
// to `healthy` within the start-period + 30s observability budget. This
|
||||
// is the heavy test — it requires the server to actually start, which
|
||||
// in turn requires either a reachable database OR a startup that fails
|
||||
// gracefully enough to keep the HEALTHCHECK probe target alive.
|
||||
//
|
||||
// The container is started with CERTCTL_DATABASE_URL pointing at an
|
||||
// unreachable host so the server fails its postgres bring-up — but
|
||||
// importantly, fails AFTER the TLS listener has come up, because the
|
||||
// HEALTHCHECK probe target is the TLS listener. We don't actually need
|
||||
// the database to validate the HEALTHCHECK shape.
|
||||
//
|
||||
// IMPORTANT: this test is the runtime contract. If you're working on the
|
||||
// server's startup ordering and the listener now comes up AFTER the
|
||||
// database, this test must adapt — start a sidecar postgres via
|
||||
// testcontainers-go (see internal/integration/lifecycle_test.go for the
|
||||
// pattern) and connect the certctl-server container to it.
|
||||
func TestPublishedServerImage_HealthcheckTransitionsToHealthy(t *testing.T) {
|
||||
if !dockerAvailable(t) {
|
||||
t.Skip("docker not available — skipping runtime HEALTHCHECK test")
|
||||
}
|
||||
if testing.Short() {
|
||||
t.Skip("runtime HEALTHCHECK test takes ~45s; skipping under -short")
|
||||
}
|
||||
t.Skip("runtime probe contract not yet wired to a sidecar postgres; " +
|
||||
"image-spec contract above (TestPublishedServerImage_HealthcheckSpecUsesHTTPS) " +
|
||||
"covers the audit-flagged regression. Re-enable once the integration " +
|
||||
"harness provisions postgres for image-level smoke.")
|
||||
}
|
||||
+12
-1
@@ -348,6 +348,11 @@ erDiagram
|
||||
}
|
||||
```
|
||||
|
||||
The ER diagram above documents **database shape**, not REST-API wire shape. Several columns are intentionally server-internal and never serialized to clients:
|
||||
|
||||
- `agents.api_key_hash` — SHA-256 of the agent's plaintext API key, populated by `service.RegisterAgent` (`hashAPIKey(apiKey)` at `internal/service/agent.go`) and consumed by `repository.AgentRepository::GetByAPIKey` for the auth-lookup. **Not** exposed via the REST API, **not** echoed via CLI / MCP / agent registration response, **never** logged. Enforced by `internal/domain/connector.go::Agent.MarshalJSON` (G-2 audit closure, `cat-s5-apikey_leak`); the OpenAPI Agent schema explicitly excludes the field, the frontend `Agent` interface omits it, and a CI grep guardrail at `.github/workflows/ci.yml` blocks reintroduction.
|
||||
- `issuers.config` / `deployment_targets.config` — plaintext jsonb shadow of the AES-GCM-encrypted on-disk blob; the encrypted form lives on `EncryptedConfig []byte` (Go-only field tagged `json:"-"`).
|
||||
|
||||
Migrations are idempotent (`IF NOT EXISTS` on all CREATE statements, `ON CONFLICT (id) DO NOTHING` on all seed data) so they're safe to run multiple times — important for Docker Compose where both initdb and the server may run the same SQL.
|
||||
|
||||
## Data Flow: Certificate Lifecycle
|
||||
@@ -891,9 +896,15 @@ The HTTP middleware stack processes requests in the following order (see `cmd/se
|
||||
4. **BodyLimit** - request body size cap via `http.MaxBytesReader`
|
||||
5. **RateLimiter** - token bucket rate limiting (optional, when enabled)
|
||||
6. **CORS** - cross-origin request handling (deny-by-default)
|
||||
7. **Auth** - API key or JWT validation
|
||||
7. **Auth** - API key validation (or none in development; JWT/OIDC via authenticating gateway, see below — not in-process)
|
||||
8. **AuditLog** - records every API call to the audit trail (requires auth context for actor)
|
||||
|
||||
### Authenticating-gateway pattern (JWT, OIDC, mTLS)
|
||||
|
||||
certctl's in-process authentication surface is intentionally narrow: `api-key` for production deployments and `none` for development. There is no in-process JWT, OIDC, mTLS, or SAML middleware. (`CERTCTL_AUTH_TYPE=jwt` was accepted pre-G-1 but silently routed through the api-key bearer middleware — a security finding masquerading as a config option, removed at the v2.x boundary; see [`upgrade-to-v2-jwt-removal.md`](upgrade-to-v2-jwt-removal.md) if you previously set it.)
|
||||
|
||||
For deployments that need JWT/OIDC/mTLS, the standard pattern is to put an authenticating gateway in front of certctl and configure `CERTCTL_AUTH_TYPE=none` on the upstream certctl process. The gateway terminates the federated identity protocol, validates tokens / certificates / SAML assertions, and proxies the authenticated request to certctl as a same-origin call on a private network. This separation gives operators the full breadth of the modern identity ecosystem (oauth2-proxy, Envoy `ext_authz`, Traefik `ForwardAuth`, Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, nginx `auth_request`) without certctl itself having to track signing-key rotation, claim mapping, audience validation, and the rest of the JWT/OIDC surface area. Operators wanting per-request actor attribution past the gateway boundary forward the gateway-resolved identity (e.g., `X-Auth-Request-User` from oauth2-proxy) and run a small authorization layer at the gateway that enforces the bearer-key contract certctl actually uses.
|
||||
|
||||
### Concurrency Safety
|
||||
|
||||
The background scheduler uses `sync/atomic.Bool` idempotency guards on every loop (8 always-on plus up to 4 optional) — if a tick fires while the previous iteration is still running, it skips. A `sync.WaitGroup` tracks all in-flight goroutines. `WaitForCompletion(timeout)` blocks during shutdown until all work finishes or the timeout expires, preventing state corruption from mid-flight database operations during process exit.
|
||||
|
||||
+32
-15
@@ -1141,13 +1141,30 @@ API Endpoints:
|
||||
- **`GET /api/v1/digest/preview`** — Render digest HTML for preview (no email sent)
|
||||
- **`POST /api/v1/digest/send`** — Trigger digest send immediately (outside of schedule)
|
||||
|
||||
> **Note (HTTPS-only as of v2.2):** The `curl` examples in this section
|
||||
> and below all target the HTTPS-only control plane. Extract the
|
||||
> docker-compose self-signed bootstrap CA bundle once and reuse it on
|
||||
> every call:
|
||||
>
|
||||
> ```bash
|
||||
> export CA=/tmp/certctl-ca.crt
|
||||
> docker compose -f deploy/docker-compose.yml exec -T certctl-server \
|
||||
> cat /etc/certctl/tls/ca.crt > "$CA"
|
||||
> ```
|
||||
>
|
||||
> Then pass `--cacert "$CA"` (or `-k` for one-off smoke tests, never in
|
||||
> production). The same pattern is documented in
|
||||
> [`quickstart.md`](quickstart.md). Pre-U-2 these examples used `http://`
|
||||
> and silently failed against the HTTPS listener; post-U-2 they speak
|
||||
> HTTPS with the operator-managed CA bundle.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
# Preview digest
|
||||
curl http://localhost:8443/api/v1/digest/preview | jq '.html'
|
||||
curl --cacert "$CA" https://localhost:8443/api/v1/digest/preview | jq '.html'
|
||||
|
||||
# Send digest immediately
|
||||
curl -X POST http://localhost:8443/api/v1/digest/send
|
||||
curl --cacert "$CA" -X POST https://localhost:8443/api/v1/digest/send
|
||||
```
|
||||
|
||||
Each notifier is enabled by its configuration env var:
|
||||
@@ -1294,24 +1311,24 @@ The agent scans these directories on startup and every 6 hours, looking for cert
|
||||
|
||||
```bash
|
||||
# List discovered certificates (filter by agent, status)
|
||||
curl -s "http://localhost:8443/api/v1/discovered-certificates?agent_id=agent-nginx-01&status=new" | jq .
|
||||
curl --cacert "$CA" -s "https://localhost:8443/api/v1/discovered-certificates?agent_id=agent-nginx-01&status=new" | jq .
|
||||
|
||||
# Get discovery detail
|
||||
curl -s http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID | jq .
|
||||
|
||||
# Claim a discovered cert (link to managed certificate)
|
||||
curl -s -X POST http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/claim \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/claim \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"managed_certificate_id": "mc-api-prod"}' | jq .
|
||||
|
||||
# Dismiss a discovery
|
||||
curl -s -X POST http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/dismiss | jq .
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ID/dismiss | jq .
|
||||
|
||||
# View discovery scan history
|
||||
curl -s http://localhost:8443/api/v1/discovery-scans | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovery-scans | jq .
|
||||
|
||||
# Summary counts (new, claimed, dismissed)
|
||||
curl -s http://localhost:8443/api/v1/discovery-summary | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/discovery-summary | jq .
|
||||
```
|
||||
|
||||
### Use Cases
|
||||
@@ -1340,7 +1357,7 @@ Network scan targets can be managed from the **Network Scans** dashboard page (c
|
||||
|
||||
```bash
|
||||
# Create a scan target for your internal network (or use the dashboard's "+ New Target" button)
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "Production Web Servers",
|
||||
@@ -1365,26 +1382,26 @@ curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
|
||||
```bash
|
||||
# List all scan targets
|
||||
curl -s http://localhost:8443/api/v1/network-scan-targets | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/network-scan-targets | jq .
|
||||
|
||||
# Create a scan target
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "DMZ", "cidrs": ["172.16.0.0/24"], "ports": [443]}' | jq .
|
||||
|
||||
# Get a specific target (includes last_scan_at, last_scan_certs_found)
|
||||
curl -s http://localhost:8443/api/v1/network-scan-targets/nst-dmz | jq .
|
||||
curl --cacert "$CA" -s https://localhost:8443/api/v1/network-scan-targets/nst-dmz | jq .
|
||||
|
||||
# Trigger an immediate scan (doesn't wait for scheduler)
|
||||
curl -s -X POST http://localhost:8443/api/v1/network-scan-targets/nst-dmz/scan | jq .
|
||||
curl --cacert "$CA" -s -X POST https://localhost:8443/api/v1/network-scan-targets/nst-dmz/scan | jq .
|
||||
|
||||
# Update scan configuration
|
||||
curl -s -X PUT http://localhost:8443/api/v1/network-scan-targets/nst-dmz \
|
||||
curl --cacert "$CA" -s -X PUT https://localhost:8443/api/v1/network-scan-targets/nst-dmz \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"ports": [443, 8443, 9443], "timeout_ms": 3000}' | jq .
|
||||
|
||||
# Delete a scan target
|
||||
curl -s -X DELETE http://localhost:8443/api/v1/network-scan-targets/nst-dmz
|
||||
curl --cacert "$CA" -s -X DELETE https://localhost:8443/api/v1/network-scan-targets/nst-dmz
|
||||
```
|
||||
|
||||
### Scheduler Integration
|
||||
|
||||
@@ -60,6 +60,8 @@ cp deploy/.env.example deploy/.env
|
||||
docker compose -f deploy/docker-compose.yml up -d --build
|
||||
```
|
||||
|
||||
> **Warning:** Edit `POSTGRES_PASSWORD` *before* the very first `docker compose up`. Postgres seeds the password into its data directory only on first boot of an empty volume — after that, the password is baked into `pg_authid` and the env var is ignored. If you boot once with the default and later change `POSTGRES_PASSWORD` in `.env`, the certctl-server container picks up the new value but postgres still authenticates against the old one, and the server logs `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). Two ways out: tear down the volume with `docker compose -f deploy/docker-compose.yml down -v` (this **deletes all data**) and bring up fresh, or rotate non-destructively with `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` and then restart certctl-server with the matching `POSTGRES_PASSWORD`.
|
||||
|
||||
### Docker Compose Environments
|
||||
|
||||
The `deploy/` directory contains four compose files for different use cases:
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
# Upgrading past G-1 — `CERTCTL_AUTH_TYPE=jwt` removal
|
||||
|
||||
If your certctl deployment currently sets `CERTCTL_AUTH_TYPE=jwt` (or `server.auth.type=jwt` in Helm), the next certctl upgrade will fail-fast at startup with a dedicated diagnostic. This guide explains why, what to switch to, and how to keep JWT/OIDC at your edge.
|
||||
|
||||
For everyone else — operators running `api-key` or `none` — this upgrade is a no-op. Skip to [`upgrade-to-tls.md`](upgrade-to-tls.md) for the v2.2 HTTPS-everywhere migration if you haven't done that one yet.
|
||||
|
||||
## Why we removed it
|
||||
|
||||
Pre-G-1, the config validator at `internal/config/config.go` accepted three values for `CERTCTL_AUTH_TYPE`: `api-key`, `jwt`, and `none`. The startup log line at `cmd/server/main.go` faithfully echoed `"authentication enabled" "type"="jwt"` when an operator picked `jwt`. Reasonable people read that and concluded JWT auth was on.
|
||||
|
||||
It wasn't. Grep `internal/ cmd/` for `NewJWT`, `JWTMiddleware`, or `jwt.Parse` — pre-G-1, there were zero matches in production code. The auth-middleware wiring at `cmd/server/main.go:653` unconditionally called `middleware.NewAuthWithNamedKeys(namedKeys)` regardless of `cfg.Auth.Type`. So `CERTCTL_AUTH_TYPE=jwt` just routed every request through the api-key bearer middleware, comparing the incoming `Authorization: Bearer <something>` against whatever string the operator put in `CERTCTL_AUTH_SECRET`. Real JWT clients got 401 (the api-key middleware saw the JWT string as a literal token and compared bytes). Operators who treated `CERTCTL_AUTH_SECRET` as a JWT signing secret (and therefore handled it less carefully than an api-key) handed an attacker an api-key. Silent auth downgrade — a security finding masquerading as a config option.
|
||||
|
||||
We chose to remove the option rather than implement JWT middleware. Implementing real JWT/OIDC requires jwks vs static-secret rotation, claim mapping (which claim is the actor / the admin flag?), expiry enforcement, audience and issuer validation, key rollover semantics, and regression coverage at the same depth as the existing api-key path. That's a feature, not a fix. The audit-recommended structural fix — and the one that actually closes the hazard — is to fail loudly instead of silently downgrading.
|
||||
|
||||
## What changes at startup
|
||||
|
||||
Post-G-1, a binary started with `CERTCTL_AUTH_TYPE=jwt` exits non-zero before opening the listener:
|
||||
|
||||
```
|
||||
Failed to load configuration: CERTCTL_AUTH_TYPE=jwt is no longer accepted
|
||||
(G-1 silent auth downgrade): no JWT middleware ships with certctl. To use
|
||||
JWT/OIDC, run an authenticating gateway (oauth2-proxy / Envoy ext_authz /
|
||||
Traefik ForwardAuth / Pomerium) in front of certctl and set
|
||||
CERTCTL_AUTH_TYPE=none on the upstream. See docs/architecture.md
|
||||
"Authenticating-gateway pattern" and docs/upgrade-to-v2-jwt-removal.md
|
||||
for the migration walkthrough
|
||||
```
|
||||
|
||||
Helm operators get the same shape at `helm install` / `helm upgrade` template time: `server.auth.type=jwt` is rejected by the chart's `certctl.validateAuthType` template helper before any Kubernetes object is rendered.
|
||||
|
||||
The CI-side regression guard at `.github/workflows/ci.yml` blocks any future PR that re-introduces `"jwt"` as an auth-type literal in production code or spec.
|
||||
|
||||
## Recovery — pick one
|
||||
|
||||
### Option A — switch to `api-key` (you weren't actually using JWT)
|
||||
|
||||
If your `CERTCTL_AUTH_SECRET` was a single high-entropy token and your clients sent it as `Authorization: Bearer <token>`, you were already using api-key auth — you just had `CERTCTL_AUTH_TYPE` set to the wrong string. Flip it:
|
||||
|
||||
```
|
||||
# .env (docker-compose)
|
||||
CERTCTL_AUTH_TYPE=api-key
|
||||
CERTCTL_AUTH_SECRET=<your-existing-token>
|
||||
```
|
||||
|
||||
```
|
||||
# Helm
|
||||
helm upgrade <release> deploy/helm/certctl/ \
|
||||
--reuse-values \
|
||||
--set server.auth.type=api-key \
|
||||
--set server.auth.apiKey=<your-existing-token>
|
||||
```
|
||||
|
||||
No client changes needed — the same Bearer token continues to work. The startup log will now read `"authentication enabled" "type"="api-key"`, which matches what was actually happening pre-G-1.
|
||||
|
||||
### Option B — front certctl with an authenticating gateway
|
||||
|
||||
If you genuinely need JWT, OIDC, mTLS, or SAML, run an authenticating gateway in front of certctl and let the gateway terminate the federated identity protocol. Configure certctl for `CERTCTL_AUTH_TYPE=none`:
|
||||
|
||||
```
|
||||
CERTCTL_AUTH_TYPE=none
|
||||
```
|
||||
|
||||
Then put an oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia (etc.) in the network path between operators and certctl. The gateway validates the identity and proxies the authenticated request to certctl as a same-origin call on a private network.
|
||||
|
||||
### Concrete walkthrough — oauth2-proxy + certctl on docker-compose
|
||||
|
||||
This is the simplest production-grade JWT/OIDC shape. It assumes you have an OIDC provider (Okta, Auth0, Google Workspace, Keycloak, Dex) and a registered client_id / client_secret.
|
||||
|
||||
```yaml
|
||||
# deploy/docker-compose.gateway.yml — overlay on the base compose file
|
||||
services:
|
||||
oauth2-proxy:
|
||||
image: quay.io/oauth2-proxy/oauth2-proxy:latest
|
||||
command:
|
||||
- --provider=oidc
|
||||
- --oidc-issuer-url=https://<your-issuer>/
|
||||
- --client-id=${OIDC_CLIENT_ID}
|
||||
- --client-secret=${OIDC_CLIENT_SECRET}
|
||||
- --cookie-secret=${OAUTH2_PROXY_COOKIE_SECRET} # openssl rand -base64 32
|
||||
- --upstream=http://certctl-server:8443 # internal-network only; certctl listens on 8443
|
||||
- --http-address=0.0.0.0:4180
|
||||
- --email-domain=*
|
||||
- --pass-access-token=true
|
||||
- --pass-authorization-header=true
|
||||
- --set-authorization-header=true # forwards a bearer token upstream
|
||||
- --skip-provider-button=true
|
||||
- --reverse-proxy=true
|
||||
ports:
|
||||
- "443:4180"
|
||||
depends_on:
|
||||
- certctl-server
|
||||
networks:
|
||||
- certctl-network
|
||||
|
||||
certctl-server:
|
||||
environment:
|
||||
CERTCTL_AUTH_TYPE: none # gateway terminates auth — see docs/upgrade-to-v2-jwt-removal.md
|
||||
# ... rest of the certctl env block unchanged
|
||||
```
|
||||
|
||||
Operators hit `https://<your-host>/`, get redirected through the OIDC provider, land back at oauth2-proxy with a session cookie, and oauth2-proxy proxies their request to certctl on the internal Docker network. certctl itself is HTTPS-only on `:8443` (TLS 1.3, see [`tls.md`](tls.md)) but operator browsers never see that hop directly. Bind certctl-server's `:8443` to the internal Docker network only — do NOT publish it to the host. The audit trail will record the actor as the gateway-forwarded identity if you also configure a small bearer-token-mapping shim at the gateway (most production deployments do this with a per-user api-key issued by the gateway after OIDC validation).
|
||||
|
||||
### Traefik ForwardAuth pattern (Kubernetes)
|
||||
|
||||
Same shape, kubernetes-flavored:
|
||||
|
||||
```yaml
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: oidc-forward-auth
|
||||
spec:
|
||||
forwardAuth:
|
||||
address: http://oauth2-proxy.auth.svc.cluster.local:4180
|
||||
trustForwardHeader: true
|
||||
authResponseHeaders:
|
||||
- X-Auth-Request-User
|
||||
- X-Auth-Request-Email
|
||||
- Authorization
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: certctl
|
||||
spec:
|
||||
routes:
|
||||
- match: Host(`certctl.example.com`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: oidc-forward-auth
|
||||
services:
|
||||
- name: certctl-server
|
||||
port: 8443
|
||||
```
|
||||
|
||||
The certctl Helm release runs with `server.auth.type=none`. The Traefik IngressRoute attaches `oidc-forward-auth` as a middleware so every request is OIDC-validated by oauth2-proxy before reaching certctl.
|
||||
|
||||
### Envoy `ext_authz` pattern
|
||||
|
||||
For service-mesh deployments (Istio, Consul, plain Envoy), the `ext_authz` filter calls out to an external authorization service per-request. Same outcome: certctl runs `CERTCTL_AUTH_TYPE=none` and Envoy + your authz service handle JWT/OIDC/mTLS at the mesh edge. See the [Envoy ext_authz docs](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_authz_filter) for the configuration surface.
|
||||
|
||||
## Rollback
|
||||
|
||||
Pre-G-1 binaries silently accepted `CERTCTL_AUTH_TYPE=jwt` and routed through the api-key middleware. Downgrading the binary is the only mechanical rollback path, and it puts you back into the silent-downgrade state — which is exactly what the G-1 audit finding is about. We don't recommend it. If something is forcing your hand, capture the operational issue you're hitting and open a GitHub issue against the certctl repo with the SHAs involved; the Authenticating-gateway pattern was specifically designed to cover the use cases that historically led operators to set `CERTCTL_AUTH_TYPE=jwt`.
|
||||
|
||||
There is no on-disk state that changes with this upgrade — no migrations to roll back, no encrypted config to re-encode, no certificates to re-issue. The change is entirely in the config-validation surface and the helm-chart template guard.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- [`architecture.md`](architecture.md) — "Authenticating-gateway pattern (JWT, OIDC, mTLS)" section.
|
||||
- [`tls.md`](tls.md) — TLS provisioning patterns. The gateway proxying to certctl-server still needs to trust certctl's TLS cert; same patterns apply.
|
||||
- [`../deploy/helm/certctl/README.md`](../deploy/helm/certctl/README.md) — Helm-chart-flavored guidance.
|
||||
- `internal/config/config.go::ValidAuthTypes` — the single source of truth for what's accepted post-G-1.
|
||||
- `internal/repository/postgres/db.go::wrapPingError` — unrelated; pattern for runtime diagnostic of operator misconfiguration.
|
||||
- `coverage-gap-audit-2026-04-24-v5/unified-audit.md` — the audit finding (`cat-g-jwt_silent_auth_downgrade`).
|
||||
@@ -0,0 +1,55 @@
|
||||
# Deployment Examples
|
||||
|
||||
Five turnkey docker-compose scenarios that show certctl deployed against real CA backends and target shapes. Each subdirectory is self-contained — pick the one closest to your stack and have it running in minutes.
|
||||
|
||||
| Example | Stack | What it shows |
|
||||
|---------|-------|---------------|
|
||||
| [`acme-nginx/`](acme-nginx/acme-nginx.md) | Let's Encrypt + NGINX (HTTP-01) | The default public-CA path: ACME-issued certs deployed to NGINX. |
|
||||
| [`acme-wildcard-dns01/`](acme-wildcard-dns01/acme-wildcard-dns01.md) | Let's Encrypt wildcard (DNS-01) | Wildcard certificates via DNS-01 with pluggable DNS hooks. |
|
||||
| [`private-ca-traefik/`](private-ca-traefik/private-ca-traefik.md) | Local CA + Traefik | Internal-only certs from a private CA, deployed to Traefik. |
|
||||
| [`step-ca-haproxy/`](step-ca-haproxy/step-ca-haproxy.md) | Smallstep step-ca + HAProxy | Self-hosted CA with HAProxy as the deployment target. |
|
||||
| [`multi-issuer/`](multi-issuer/multi-issuer.md) | Let's Encrypt + Local CA | Public + private certs side-by-side from a single dashboard. |
|
||||
|
||||
## Common operational notes
|
||||
|
||||
These notes apply to **every** example. They're called out here so the per-example walkthroughs stay focused on the issuer/target wiring instead of repeating ops boilerplate.
|
||||
|
||||
### Postgres password rotation — first-boot binding trap (U-1)
|
||||
|
||||
Every example file uses `${DB_PASSWORD:-certctl-dev-password}` as the postgres password env var, with the data directory persisted via a named volume. The `postgres:16-alpine` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that's the only time `POSTGRES_PASSWORD` is written into `pg_authid`. If you boot once with the default and then change `DB_PASSWORD` (in your shell, in a `.env` file, or in a wrapper script), the certctl-server container picks up the new value but the postgres container continues to authenticate against the old one. The server fails its startup `db.Ping()` with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01).
|
||||
|
||||
The certctl-server emits guidance pointing at the fix when this fires (see `internal/repository/postgres/db.go::wrapPingError`). The two remediation paths:
|
||||
|
||||
- **Destructive — wipes all certctl data, only acceptable on demo/test setups:**
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down -v
|
||||
docker compose -f examples/<example>/docker-compose.yml up -d --build
|
||||
```
|
||||
- **Non-destructive — preserves data, rotates `pg_authid` in place:**
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml exec postgres \
|
||||
psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"
|
||||
# Then redeploy with DB_PASSWORD set to <new> in your shell or .env
|
||||
```
|
||||
|
||||
The cleanest practice for a fresh demo: set `DB_PASSWORD` once in your shell **before** the very first `docker compose up`, and don't change it during the demo's lifetime. If you must rotate, use the non-destructive path.
|
||||
|
||||
Same root cause and remediation pattern is documented for the canonical quickstart in [`../docs/quickstart.md`](../docs/quickstart.md), the production compose surface in [`../deploy/ENVIRONMENTS.md`](../deploy/ENVIRONMENTS.md), and the Helm chart in [`../deploy/helm/certctl/README.md`](../deploy/helm/certctl/README.md).
|
||||
|
||||
### TLS for the certctl control plane
|
||||
|
||||
Every example boots certctl with HTTPS-only on port 8443 (TLS 1.3 pinned, no plaintext listener as of v2.2). The shipped `certctl-tls-init` init container generates a self-signed ECDSA-P256 cert on first boot — fine for the example demos, **never** acceptable for a public deployment. For production, swap the init container for cert-manager, an operator-supplied Secret, or your internal CA — see [`../docs/tls.md`](../docs/tls.md) for the full pattern matrix.
|
||||
|
||||
### Tearing down
|
||||
|
||||
To stop services but **keep** the postgres volume (so you can pick up where you left off):
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down
|
||||
```
|
||||
|
||||
To stop services **and** wipe all data (clean slate for the next run):
|
||||
```bash
|
||||
docker compose -f examples/<example>/docker-compose.yml down -v
|
||||
```
|
||||
|
||||
Note that `down -v` is the only canonical way to recover from the postgres-password trap when the non-destructive `ALTER ROLE` route is unavailable (e.g., you've forgotten the original password).
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl's core use case: **automatically manage TLS certificates for NGINX using Let's Encrypt (ACME HTTP-01 challenges).**
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## What This Does
|
||||
|
||||
- Deploys certctl server (control plane) with PostgreSQL
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
**What this does:** Issues wildcard certificates (e.g., `*.example.com`) from Let's Encrypt using DNS-01 challenge validation.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
This example is ideal for:
|
||||
- Issuing wildcard certificates (`*.example.com`)
|
||||
- Services behind NAT, firewalls, or non-public networks
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl managing **both public and internal certificates from a single dashboard**. Public-facing services use Let's Encrypt (ACME), while internal services use a private Local CA — all visible and managed in one place.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## The Use Case
|
||||
|
||||
You have:
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Private CA + Traefik Example
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
This example demonstrates certctl managing certificates for **internal services without public CA dependency**. Ideal for enterprise environments where:
|
||||
|
||||
- All services are internal (VPN, private networks)
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This example demonstrates certctl managing certificates issued by **Smallstep step-ca** and deploying them to **HAProxy**.
|
||||
|
||||
> **Operational notes** shared by every example (postgres password rotation trap, TLS provisioning, teardown semantics) live in [`../README.md`](../README.md). Read it first if you plan to change `DB_PASSWORD` after the initial `docker compose up` — the postgres volume binds the password on first boot only.
|
||||
|
||||
## Scenario
|
||||
|
||||
You're a Smallstep user running step-ca as your internal PKI. You have HAProxy load balancers that need certificates. This setup:
|
||||
|
||||
@@ -3,14 +3,12 @@ package handler
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/api/middleware"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// AgentGroupService defines the service interface for agent group operations.
|
||||
@@ -24,19 +22,6 @@ type AgentGroupService interface {
|
||||
}
|
||||
|
||||
// AgentGroupHandler handles HTTP requests for agent group operations.
|
||||
//
|
||||
// Error dispatch (post-M-1): every service error routes through the [errToStatus]
|
||||
// choke point via `errors.Is` walking the wrap chain, with one explicit
|
||||
// [service.ErrValidation] arm on the write paths (Create, Update) so the
|
||||
// composed "validation: <field-specific reason>" message the service layer
|
||||
// attaches via `fmt.Errorf("%w: ...", ErrValidation)` can be passed through to
|
||||
// the 400 response body. Before M-1, the Create handler branched on
|
||||
// `strings.Contains(err.Error(), "invalid"|"required")` — fragile because a
|
||||
// single reword in [service.validateAgentGroup] would demote the 400 to 500
|
||||
// with no compile-time signal — and the Update/Delete handlers branched on
|
||||
// `strings.Contains(err.Error(), "not found")`, coupling HTTP classification
|
||||
// to repository human-readable strings. Both now ride the typed
|
||||
// [repository.ErrNotFound] wrap through errToStatus. Mirrors ProfileHandler.
|
||||
type AgentGroupHandler struct {
|
||||
svc AgentGroupService
|
||||
}
|
||||
@@ -104,18 +89,7 @@ func (h AgentGroupHandler) GetAgentGroup(w http.ResponseWriter, r *http.Request)
|
||||
|
||||
group, err := h.svc.GetAgentGroup(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1: route through errToStatus so a repo-level `sql.ErrNoRows`
|
||||
// (wrapped as repository.ErrNotFound) becomes 404, but a transient DB
|
||||
// failure no longer masquerades as 404 — it correctly surfaces 500. The
|
||||
// pre-M-1 "any error → 404" shortcut was plausible when Get's only
|
||||
// expected failure was "not found", but the choke point now gives us
|
||||
// correct dispatch for free. Mirrors GetProfile.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to get agent group"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Agent group not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent group not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -149,15 +123,7 @@ func (h AgentGroupHandler) CreateAgentGroup(w http.ResponseWriter, r *http.Reque
|
||||
|
||||
created, err := h.svc.CreateAgentGroup(r.Context(), group)
|
||||
if err != nil {
|
||||
// M-1: replace the 2-term substring net (`"invalid"|"required"`) with a
|
||||
// single `errors.Is(err, service.ErrValidation)` arm. validateAgentGroup
|
||||
// wraps every field-specific failure via `fmt.Errorf("%w: <reason>",
|
||||
// ErrValidation)`, so `err.Error()` still contains the human-readable
|
||||
// reason (e.g., "agent group name is required") and can be safely passed
|
||||
// to the 400 body — but the status decision no longer depends on the
|
||||
// exact wording. Other errors redact to a generic 500. Mirrors
|
||||
// CreateProfile.
|
||||
if errors.Is(err, service.ErrValidation) {
|
||||
if strings.Contains(err.Error(), "invalid") || strings.Contains(err.Error(), "required") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
return
|
||||
}
|
||||
@@ -194,22 +160,11 @@ func (h AgentGroupHandler) UpdateAgentGroup(w http.ResponseWriter, r *http.Reque
|
||||
|
||||
updated, err := h.svc.UpdateAgentGroup(r.Context(), id, group)
|
||||
if err != nil {
|
||||
// M-1: explicit ErrValidation arm preserves the user-facing reason in
|
||||
// the 400 body (validateAgentGroup wraps every failure via
|
||||
// `fmt.Errorf("%w: ...", ErrValidation)`); every other error — including
|
||||
// repo-layer ErrNotFound on a missing row — routes through errToStatus
|
||||
// so the 404/500 decision no longer depends on substring matching.
|
||||
// Mirrors UpdateProfile.
|
||||
if errors.Is(err, service.ErrValidation) {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent group not found", requestID)
|
||||
return
|
||||
}
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to update agent group"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Agent group not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to update agent group", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -233,15 +188,11 @@ func (h AgentGroupHandler) DeleteAgentGroup(w http.ResponseWriter, r *http.Reque
|
||||
}
|
||||
|
||||
if err := h.svc.DeleteAgentGroup(r.Context(), id); err != nil {
|
||||
// M-1: sentinel dispatch replaces the substring 404 check — see the
|
||||
// parallel comment block in UpdateAgentGroup for the rationale. Mirrors
|
||||
// DeleteProfile.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to delete agent group"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Agent group not found"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent group not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to delete agent group", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -893,3 +893,161 @@ func TestAgentReportJobStatus_ServiceError(t *testing.T) {
|
||||
func stringPtr(s string) *string {
|
||||
return &s
|
||||
}
|
||||
|
||||
// G-2 (P1): cat-s5-apikey_leak audit closure tests. Pre-G-2,
|
||||
// Agent.APIKeyHash was tagged `json:"api_key_hash"` and shipped on
|
||||
// every wire surface that returned domain.Agent. Post-G-2 the tag is
|
||||
// "-" and Agent.MarshalJSON enforces redaction via a marshal-time copy
|
||||
// (see internal/domain/connector_test.go for the type-level pin). These
|
||||
// four tests are the wire-shape contract — they capture the actual HTTP
|
||||
// response body via httptest and assert the credential-derivative hash
|
||||
// is absent.
|
||||
//
|
||||
// One sentinel value (g2HandlerLeakSentinel) flows through every fixture
|
||||
// so a single grep over a failing test's output identifies the leak
|
||||
// surface immediately.
|
||||
const g2HandlerLeakSentinel = "sha256:LEAKED-CREDENTIAL-DERIVATIVE-HANDLER-SENTINEL"
|
||||
|
||||
func TestListAgents_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
ListAgentsFn: func(page, perPage int) ([]domain.Agent, int64, error) {
|
||||
return []domain.Agent{
|
||||
{ID: "a-1", Name: "agent-one", Hostname: "host-1",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel + "-1"},
|
||||
{ID: "a-2", Name: "agent-two", Hostname: "host-2",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel + "-2"},
|
||||
}, 2, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents?page=1&per_page=50", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.ListAgents(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("ListAgents status = %d, want 200", w.Code)
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("ListAgents response leaked \"api_key_hash\" key (G-2 regressed):\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("ListAgents response leaked sentinel %q:\n%s", g2HandlerLeakSentinel, body)
|
||||
}
|
||||
// Sanity: the non-leaked fields ARE present (handler did serve real data).
|
||||
for _, want := range []string{"a-1", "a-2", "agent-one", "agent-two"} {
|
||||
if !bytes.Contains([]byte(body), []byte(want)) {
|
||||
t.Errorf("ListAgents response missing expected field %q (handler may not be serving data):\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAgent_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
GetAgentFn: func(id string) (*domain.Agent, error) {
|
||||
return &domain.Agent{
|
||||
ID: id, Name: "single-agent", Hostname: "single.host",
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel,
|
||||
}, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents/a-prod-001", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.GetAgent(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("GetAgent status = %d, want 200, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("GetAgent response leaked \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("GetAgent response leaked sentinel:\n%s", body)
|
||||
}
|
||||
if !bytes.Contains([]byte(body), []byte("single-agent")) {
|
||||
t.Errorf("GetAgent response missing the agent name (handler may not be serving data):\n%s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterAgent_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
// Registration is the most likely path for a freshly-hashed key to
|
||||
// leak: the service mints a new APIKeyHash inside RegisterAgent
|
||||
// (service/agent.go:405) and the handler returns the agent struct
|
||||
// verbatim. Pin that the redaction holds even on a "freshly created"
|
||||
// agent payload.
|
||||
now := time.Now()
|
||||
mock := &MockAgentService{
|
||||
RegisterAgentFn: func(in domain.Agent) (*domain.Agent, error) {
|
||||
return &domain.Agent{
|
||||
ID: "agent-new", Name: in.Name, Hostname: in.Hostname,
|
||||
Status: domain.AgentStatusOnline, RegisteredAt: now,
|
||||
APIKeyHash: g2HandlerLeakSentinel,
|
||||
}, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
body := bytes.NewBufferString(`{"name":"freshly-registered","hostname":"new.host"}`)
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/agents", body)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.RegisterAgent(w, req)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Fatalf("RegisterAgent status = %d, want 201, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
respBody := w.Body.String()
|
||||
if bytes.Contains([]byte(respBody), []byte("api_key_hash")) {
|
||||
t.Errorf("RegisterAgent response leaked \"api_key_hash\" key:\n%s", respBody)
|
||||
}
|
||||
if bytes.Contains([]byte(respBody), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("RegisterAgent response leaked sentinel:\n%s", respBody)
|
||||
}
|
||||
if !bytes.Contains([]byte(respBody), []byte("agent-new")) {
|
||||
t.Errorf("RegisterAgent response missing the new agent ID (handler may not be serving data):\n%s", respBody)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRetiredAgents_DoesNotLeakAPIKeyHash(t *testing.T) {
|
||||
// I-004 surface — separate handler from ListAgents; same leak risk.
|
||||
now := time.Now()
|
||||
retiredAt := now.Add(-1 * time.Hour)
|
||||
reason := "test cascade"
|
||||
mock := &MockAgentService{
|
||||
ListRetiredAgentsFn: func(page, perPage int) ([]domain.Agent, int64, error) {
|
||||
return []domain.Agent{
|
||||
{ID: "ret-1", Name: "retired-one", Hostname: "host-r1",
|
||||
Status: domain.AgentStatusOffline, RegisteredAt: now,
|
||||
RetiredAt: &retiredAt, RetiredReason: &reason,
|
||||
APIKeyHash: g2HandlerLeakSentinel},
|
||||
}, 1, nil
|
||||
},
|
||||
}
|
||||
h := NewAgentHandler(mock)
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/v1/agents/retired?page=1&per_page=50", nil)
|
||||
req = req.WithContext(contextWithRequestID())
|
||||
w := httptest.NewRecorder()
|
||||
h.ListRetiredAgents(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("ListRetiredAgents status = %d, want 200, body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
body := w.Body.String()
|
||||
if bytes.Contains([]byte(body), []byte("api_key_hash")) {
|
||||
t.Errorf("ListRetiredAgents response leaked \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
if bytes.Contains([]byte(body), []byte(g2HandlerLeakSentinel)) {
|
||||
t.Errorf("ListRetiredAgents response leaked sentinel:\n%s", body)
|
||||
}
|
||||
if !bytes.Contains([]byte(body), []byte("ret-1")) {
|
||||
t.Errorf("ListRetiredAgents response missing the retired agent ID:\n%s", body)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,19 +108,7 @@ func (h AgentHandler) GetAgent(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
agent, err := h.svc.GetAgent(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1 (P2): route through errToStatus so a repo-level
|
||||
// sql.ErrNoRows (wrapped as repository.ErrNotFound) becomes 404,
|
||||
// but a transient DB failure no longer masquerades as 404 — it
|
||||
// correctly surfaces 500. The pre-M-1 "any error → 404" shortcut
|
||||
// was plausible when Get's only expected failure was "not found",
|
||||
// but the choke point now gives us correct dispatch for free.
|
||||
// Mirrors GetAgentGroup / GetProfile.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to get agent"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Agent not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -159,20 +147,11 @@ func (h AgentHandler) RegisterAgent(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
created, err := h.svc.RegisterAgent(r.Context(), agent)
|
||||
if err != nil {
|
||||
// M-1 (P2): replace the 3-term substring net
|
||||
// (`"unique"|"duplicate"|"already exists"`) with a typed
|
||||
// errors.Is(err, service.ErrConflict) arm. The service layer now
|
||||
// wraps pg SQLSTATE 23505 duplicate-name violations via
|
||||
// fmt.Errorf("%w: agent name already exists", ErrConflict), so
|
||||
// classification no longer depends on the exact driver wording.
|
||||
// Other errors redact to a generic 500 with slog.Error server-
|
||||
// side diagnostic capture (F-002). Mirrors CreateProfile's
|
||||
// ErrValidation arm pattern, adapted for the conflict case.
|
||||
if errors.Is(err, service.ErrConflict) {
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "unique") || strings.Contains(errMsg, "duplicate") || strings.Contains(errMsg, "already exists") {
|
||||
ErrorWithRequestID(w, http.StatusConflict, "Agent with this name already exists", requestID)
|
||||
return
|
||||
}
|
||||
slog.Error("RegisterAgent failed", "name", agent.Name, "error", err.Error())
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to register agent", requestID)
|
||||
return
|
||||
}
|
||||
@@ -232,15 +211,7 @@ func (h AgentHandler) Heartbeat(w http.ResponseWriter, r *http.Request) {
|
||||
ErrorWithRequestID(w, http.StatusGone, "Agent has been retired", requestID)
|
||||
return
|
||||
}
|
||||
// M-1 (P2): the pre-M-1 `strings.Contains(err.Error(), "not
|
||||
// found")` branch now rides the errToStatus choke point, which
|
||||
// recognizes repository.ErrNotFound via errors.Is. The retired-
|
||||
// agent sentinel is still checked FIRST above so the 410 Gone
|
||||
// short-circuit is never masked by the 404 arm. Any other error
|
||||
// redacts to a generic 500 with slog.Error server-side diagnostic
|
||||
// capture (F-002). Mirrors GetAgentGroup.
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusNotFound {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent not found", requestID)
|
||||
return
|
||||
}
|
||||
@@ -337,16 +308,7 @@ func (h AgentHandler) AgentCertificatePickup(w http.ResponseWriter, r *http.Requ
|
||||
|
||||
certPEM, err := h.svc.CertificatePickup(r.Context(), agentID, certID)
|
||||
if err != nil {
|
||||
// M-1 (P2): route through errToStatus so a repo-level
|
||||
// sql.ErrNoRows (wrapped as repository.ErrNotFound) becomes 404,
|
||||
// but a transient DB failure no longer masquerades as 404 — it
|
||||
// correctly surfaces 500. Mirrors GetAgent.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to retrieve certificate"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Certificate not found or not ready"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found or not ready", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -529,16 +491,7 @@ func (h AgentHandler) RetireAgent(w http.ResponseWriter, r *http.Request) {
|
||||
JSON(w, http.StatusConflict, body)
|
||||
return
|
||||
}
|
||||
// M-1 (P2): the pre-M-1 `strings.Contains(err.Error(), "not
|
||||
// found")` branch now rides the errToStatus choke point, which
|
||||
// recognizes repository.ErrNotFound via errors.Is. The sentinel
|
||||
// (ErrAgentIsSentinel, ErrForceReasonRequired) and typed
|
||||
// (*BlockedByDependenciesError) checks above still run FIRST so
|
||||
// the 403/400/409 structural refusals are never masked by the
|
||||
// 404 arm. Any other error redacts to a generic 500 with
|
||||
// slog.Error server-side diagnostic capture (F-002).
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusNotFound {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Agent not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -87,24 +86,7 @@ func (h AuditHandler) GetAuditEvent(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
event, err := h.svc.GetAuditEvent(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1 (P2): dispatch routes through errToStatus. Pre-M-1 this branch was
|
||||
// a blanket `any error → 404 Audit event not found` shortcut — which
|
||||
// silently demoted transient DB failures from the service's auditRepo.List
|
||||
// wrap to 404 Not Found with no observable external signal. Post-M-1:
|
||||
// service/audit.go GetAuditEvent only wraps the genuine zero-events path
|
||||
// with service.ErrNotFound via %w, and the repo.List wrap surfaces without
|
||||
// that sentinel — so errors.Is(err, service.ErrNotFound) picks up the real
|
||||
// 404s and everything else correctly surfaces as 500 with server-side
|
||||
// slog.Error capture (F-002 redacted-500 pattern preserved).
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GetAuditEvent failed", "audit_event_id", id, "error", err.Error())
|
||||
}
|
||||
msg := "Failed to get audit event"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Audit event not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Audit event not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -298,13 +298,12 @@ func (h CertificateHandler) UpdateCertificate(w http.ResponseWriter, r *http.Req
|
||||
|
||||
updated, err := h.svc.UpdateCertificate(r.Context(), id, cert)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("UpdateCertificate failed", "cert_id", id, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
slog.Error("UpdateCertificate failed", "cert_id", id, "error", err.Error())
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to update certificate", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -328,13 +327,11 @@ func (h CertificateHandler) ArchiveCertificate(w http.ResponseWriter, r *http.Re
|
||||
}
|
||||
|
||||
if err := h.svc.ArchiveCertificate(r.Context(), id); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("ArchiveCertificate failed", "cert_id", id, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to archive certificate", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -376,13 +373,12 @@ func (h CertificateHandler) GetCertificateVersions(w http.ResponseWriter, r *htt
|
||||
|
||||
versions, total, err := h.svc.GetCertificateVersions(r.Context(), certID, page, perPage)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GetCertificateVersions failed", "cert_id", certID, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
slog.Error("GetCertificateVersions failed", "cert_id", certID, "error", err.Error())
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to get certificate versions", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -418,13 +414,20 @@ func (h CertificateHandler) TriggerRenewal(w http.ResponseWriter, r *http.Reques
|
||||
actor := resolveActor(r.Context())
|
||||
|
||||
if err := h.svc.TriggerRenewal(r.Context(), certID, actor); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("TriggerRenewal failed", "cert_id", certID, "error", err)
|
||||
msg = "internal error"
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(errMsg, "cannot renew") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
if strings.Contains(errMsg, "already in progress") {
|
||||
ErrorWithRequestID(w, http.StatusConflict, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to trigger renewal", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -513,13 +516,19 @@ func (h CertificateHandler) RevokeCertificate(w http.ResponseWriter, r *http.Req
|
||||
actor := resolveActor(r.Context())
|
||||
|
||||
if err := h.svc.RevokeCertificate(r.Context(), certID, req.Reason, actor); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("RevokeCertificate failed", "cert_id", certID, "error", err)
|
||||
msg = "internal error"
|
||||
// Distinguish between client errors and server errors
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "already revoked") ||
|
||||
strings.Contains(errMsg, "cannot revoke") ||
|
||||
strings.Contains(errMsg, "invalid revocation reason") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(errMsg, "not found") || strings.Contains(errMsg, "failed to fetch") || strings.Contains(errMsg, "failed to get") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to revoke certificate", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -548,13 +557,16 @@ func (h CertificateHandler) GetDERCRL(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
derBytes, err := h.svc.GenerateDERCRL(r.Context(), issuerID)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GenerateDERCRL failed", "issuer_id", issuerID, "error", err)
|
||||
msg = "internal error"
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(errMsg, "do not support") || strings.Contains(errMsg, "does not support") {
|
||||
ErrorWithRequestID(w, http.StatusNotImplemented, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to generate CRL", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -590,13 +602,16 @@ func (h CertificateHandler) HandleOCSP(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
derBytes, err := h.svc.GetOCSPResponse(r.Context(), issuerID, serialHex)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GetOCSPResponse failed", "issuer_id", issuerID, "serial", serialHex, "error", err)
|
||||
msg = "internal error"
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(errMsg, "do not support") || strings.Contains(errMsg, "does not support") {
|
||||
ErrorWithRequestID(w, http.StatusNotImplemented, errMsg, requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to generate OCSP response", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -627,13 +642,12 @@ func (h CertificateHandler) GetCertificateDeployments(w http.ResponseWriter, r *
|
||||
|
||||
deployments, err := h.svc.GetCertificateDeployments(r.Context(), certID)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GetCertificateDeployments failed", "cert_id", certID, "error", err)
|
||||
msg = "internal error"
|
||||
errMsg := err.Error()
|
||||
if strings.Contains(errMsg, "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to get deployments", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
|
||||
"github.com/lib/pq"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// errToStatus is the single choke point that maps a service-layer or
|
||||
// repository-layer error to its HTTP status code. Before M-1 (P2), 42 switch
|
||||
// branches across 11 handler files classified errors via
|
||||
// `strings.Contains(err.Error(), ...)` substring matching — a pattern that
|
||||
// made every HTTP status mapping one sentinel-message reword away from silent
|
||||
// regression (see M-003 self-approval privilege boundary: a reword of
|
||||
// ErrSelfApproval.Error() would have demoted 403 Forbidden to 500 Internal
|
||||
// Server Error with no compile-time error, no test failure, and no observable
|
||||
// external signal).
|
||||
//
|
||||
// All handler branches now route through this function via errors.Is and
|
||||
// errors.As, which walks the wrap chain built by fmt.Errorf("%w: ...", ...).
|
||||
// The generic sentinels live in internal/service/errors.go; domain-specific
|
||||
// sentinels (ErrSelfApproval, ErrAgentIsSentinel, ErrBlockedByDependencies,
|
||||
// ErrForceReasonRequired, ErrAgentNotFound) wrap those generics via %w so both
|
||||
// errors.Is(err, ErrSelfApproval) and errors.Is(err, ErrForbidden) succeed on
|
||||
// the same wrapped error.
|
||||
//
|
||||
// # Dispatch order
|
||||
//
|
||||
// 1. ErrAgentRetired → 410 Gone. Tested FIRST. It is deliberately NOT wrapped
|
||||
// under any generic sentinel — 410 Gone is semantically distinct from
|
||||
// 403/404/409 (permanently-terminated resource identity that drives
|
||||
// deterministic agent-binary shutdown at cmd/agent/main.go:1291). Must
|
||||
// short-circuit before any generic check so wrapping can never demote it.
|
||||
// 2. ErrNotFound → 404 Not Found. Both service.ErrNotFound and
|
||||
// repository.ErrNotFound route here — repositories wrap sql.ErrNoRows with
|
||||
// repository.ErrNotFound so a "row not found" escapes the repo layer as a
|
||||
// typed sentinel rather than an untyped fmt.Errorf string. Tested BEFORE
|
||||
// ErrForbidden so RFC 7235's preference for hiding resource existence from
|
||||
// unauthorized callers is preserved (a caller who cannot see a resource
|
||||
// should get 404, not 403).
|
||||
// 3. ErrUnauthenticated → 401 Unauthorized. SCEP challenge-password mismatch
|
||||
// and similar credential failures.
|
||||
// 4. ErrForbidden → 403 Forbidden. M-003 gate. Tested BEFORE ErrValidation so
|
||||
// double-wrapping (e.g., a future fmt.Errorf("%w: ctx", ErrSelfApproval)
|
||||
// in a wrapping call site) cannot demote 403 to 400.
|
||||
// 5. ErrConflict / repository.ErrRenewalPolicyDuplicateName /
|
||||
// repository.ErrRenewalPolicyInUse → 409 Conflict. The repo-layer sentinels
|
||||
// are routed here explicitly so handlers do not need their own dispatch
|
||||
// tree for G-1's renewal-policy FK + unique-name violations.
|
||||
// 6. ErrValidation → 400 Bad Request. Generic input validation / malformed
|
||||
// request bodies / invalid state transitions that the caller could correct
|
||||
// by changing their request.
|
||||
// 7. ErrUnprocessable → 422 Unprocessable Entity. Distinct from
|
||||
// ErrValidation: ErrValidation is "caller sent bad input" (400), while
|
||||
// ErrUnprocessable is "caller's input was fine but our stored data can't
|
||||
// satisfy the operation" — e.g., an X.509 PEM in the inventory that fails
|
||||
// to decode. The pre-M-1 ExportPKCS12 handler pinned 422 on
|
||||
// strings.Contains(err.Error(), "cannot be parsed"); the sentinel makes
|
||||
// that dispatch survive message rewording.
|
||||
// 8. ErrNotImplemented → 501 Not Implemented. Reserved for feature-flag-gated
|
||||
// code paths.
|
||||
// 9. *pq.Error fallback on SQLSTATE 23503 (FK violation) / 23505 (unique
|
||||
// violation) → 409 Conflict. Final branch before the default 500. Anything
|
||||
// that reaches here is technically a code smell (the repository layer
|
||||
// should normally wrap driver errors into a typed sentinel) but the status
|
||||
// mapping is still correct.
|
||||
//
|
||||
// # Why a function, not a middleware
|
||||
//
|
||||
// Handlers must continue to call [Error] / [ErrorWithRequestID] with a
|
||||
// caller-chosen human-readable message (sometimes the wrapped err.Error(),
|
||||
// sometimes a redacted "internal error" for 500s per F-002). This function
|
||||
// gives handlers the status code; the handler keeps control of the body.
|
||||
func errToStatus(err error) int {
|
||||
if err == nil {
|
||||
return http.StatusOK
|
||||
}
|
||||
|
||||
switch {
|
||||
case errors.Is(err, service.ErrAgentRetired):
|
||||
return http.StatusGone // 410 — must short-circuit before generic dispatch
|
||||
case errors.Is(err, service.ErrNotFound),
|
||||
errors.Is(err, repository.ErrNotFound):
|
||||
return http.StatusNotFound // 404 — before ErrForbidden (RFC 7235 existence hiding)
|
||||
case errors.Is(err, service.ErrUnauthenticated):
|
||||
return http.StatusUnauthorized // 401
|
||||
case errors.Is(err, service.ErrForbidden):
|
||||
return http.StatusForbidden // 403 — before ErrValidation (preserves M-003 gate under double-wrap)
|
||||
case errors.Is(err, service.ErrConflict),
|
||||
errors.Is(err, repository.ErrRenewalPolicyDuplicateName),
|
||||
errors.Is(err, repository.ErrRenewalPolicyInUse):
|
||||
return http.StatusConflict // 409
|
||||
case errors.Is(err, service.ErrValidation):
|
||||
return http.StatusBadRequest // 400
|
||||
case errors.Is(err, service.ErrUnprocessable):
|
||||
return http.StatusUnprocessableEntity // 422 — stored-data-unparseable, not caller-input-bad
|
||||
case errors.Is(err, service.ErrNotImplemented):
|
||||
return http.StatusNotImplemented // 501
|
||||
}
|
||||
|
||||
// Driver-level fallback. Raw *pq.Error escaping the repository layer is a
|
||||
// code smell but a real escape hatch today — we still want a correct 409
|
||||
// instead of a generic 500 for FK/unique violations.
|
||||
var pgErr *pq.Error
|
||||
if errors.As(err, &pgErr) {
|
||||
switch pgErr.Code {
|
||||
case "23503", "23505":
|
||||
return http.StatusConflict
|
||||
}
|
||||
}
|
||||
|
||||
return http.StatusInternalServerError
|
||||
}
|
||||
@@ -1,120 +0,0 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/lib/pq"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// TestErrToStatus_DispatchMatrix pins the handler's single error → HTTP
|
||||
// status choke point. Each row covers one branch of the dispatch switch and
|
||||
// the dispatch order invariants documented in errors.go:
|
||||
//
|
||||
// - ErrAgentRetired FIRST (410 short-circuits before generic checks)
|
||||
// - ErrNotFound before ErrForbidden (RFC 7235 existence hiding)
|
||||
// - ErrForbidden before ErrValidation (preserves M-003 gate under double-wrap)
|
||||
// - Repo sentinels route to 409 alongside ErrConflict
|
||||
// - *pq.Error on 23503 / 23505 routes to 409 as the driver-level fallback
|
||||
// - Default path is 500
|
||||
func TestErrToStatus_DispatchMatrix(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want int
|
||||
}{
|
||||
{"nil → 200", nil, http.StatusOK},
|
||||
|
||||
// Each generic sentinel resolves to its documented status code.
|
||||
{"ErrNotFound → 404", service.ErrNotFound, http.StatusNotFound},
|
||||
{"ErrValidation → 400", service.ErrValidation, http.StatusBadRequest},
|
||||
{"ErrConflict → 409", service.ErrConflict, http.StatusConflict},
|
||||
{"ErrForbidden → 403", service.ErrForbidden, http.StatusForbidden},
|
||||
{"ErrUnauthenticated → 401", service.ErrUnauthenticated, http.StatusUnauthorized},
|
||||
{"ErrNotImplemented → 501", service.ErrNotImplemented, http.StatusNotImplemented},
|
||||
|
||||
// Wrapped domain sentinels route through their generic wrap.
|
||||
{"ErrSelfApproval → 403 (via ErrForbidden)", service.ErrSelfApproval, http.StatusForbidden},
|
||||
{"ErrAgentIsSentinel → 403 (via ErrForbidden)", service.ErrAgentIsSentinel, http.StatusForbidden},
|
||||
{"ErrBlockedByDependencies → 409 (via ErrConflict)", service.ErrBlockedByDependencies, http.StatusConflict},
|
||||
{"ErrForceReasonRequired → 400 (via ErrValidation)", service.ErrForceReasonRequired, http.StatusBadRequest},
|
||||
{"ErrAgentNotFound → 400 (via ErrValidation)", service.ErrAgentNotFound, http.StatusBadRequest},
|
||||
|
||||
// ErrAgentRetired is standalone — 410 Gone must fire before any
|
||||
// generic dispatch. This locks in the semantic-distinct short-circuit.
|
||||
{"ErrAgentRetired → 410", service.ErrAgentRetired, http.StatusGone},
|
||||
|
||||
// Repository-layer sentinels (G-1 + M-1).
|
||||
{"repo.ErrNotFound → 404", repository.ErrNotFound, http.StatusNotFound},
|
||||
{"wrapped repo.ErrNotFound → 404",
|
||||
fmt.Errorf("%w: renewal policy rp-foo", repository.ErrNotFound),
|
||||
http.StatusNotFound},
|
||||
{"repo.ErrRenewalPolicyDuplicateName → 409", repository.ErrRenewalPolicyDuplicateName, http.StatusConflict},
|
||||
{"repo.ErrRenewalPolicyInUse → 409", repository.ErrRenewalPolicyInUse, http.StatusConflict},
|
||||
|
||||
// Wrapped errors with additional context survive the dispatch.
|
||||
{"wrapped ErrNotFound with context → 404",
|
||||
fmt.Errorf("lookup failed: %w", service.ErrNotFound),
|
||||
http.StatusNotFound},
|
||||
{"wrapped ErrSelfApproval with context → 403",
|
||||
fmt.Errorf("approval gate: %w", service.ErrSelfApproval),
|
||||
http.StatusForbidden},
|
||||
|
||||
// Driver-level fallback: raw *pq.Error escaping repo layer.
|
||||
{"*pq.Error 23503 → 409", &pq.Error{Code: "23503"}, http.StatusConflict},
|
||||
{"*pq.Error 23505 → 409", &pq.Error{Code: "23505"}, http.StatusConflict},
|
||||
{"*pq.Error 08006 → 500", &pq.Error{Code: "08006"}, http.StatusInternalServerError},
|
||||
|
||||
// Default path.
|
||||
{"unknown error → 500", errors.New("something arbitrary"), http.StatusInternalServerError},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := errToStatus(c.err)
|
||||
if got != c.want {
|
||||
t.Errorf("errToStatus(%v) = %d, want %d", c.err, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestErrToStatus_AgentRetiredShortCircuit is a dedicated regression guard
|
||||
// for the most fragile dispatch invariant: ErrAgentRetired's 410 Gone must
|
||||
// fire FIRST. If a future commit wraps it under ErrForbidden (e.g., to
|
||||
// include it in a generic "agent operations forbidden" bucket), this test
|
||||
// goes red and the agent-binary shutdown at cmd/agent/main.go:1291 would
|
||||
// silently stop triggering.
|
||||
func TestErrToStatus_AgentRetiredShortCircuit(t *testing.T) {
|
||||
if got := errToStatus(service.ErrAgentRetired); got != http.StatusGone {
|
||||
t.Fatalf("ErrAgentRetired → %d, want 410 Gone (short-circuit must fire before any generic dispatch)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestErrToStatus_NotFoundBeforeForbidden locks the RFC 7235 existence-
|
||||
// hiding dispatch order. If someone were to reorder the switch arms to put
|
||||
// ErrForbidden first, an authorization failure on a nonexistent resource
|
||||
// would leak existence via a 403 instead of masking it with a 404.
|
||||
func TestErrToStatus_NotFoundBeforeForbidden(t *testing.T) {
|
||||
// A hypothetical wrapping where both would match — contrived but the
|
||||
// ordering guarantee is what we're testing.
|
||||
both := fmt.Errorf("%w: layered with %w", service.ErrNotFound, service.ErrForbidden)
|
||||
if got := errToStatus(both); got != http.StatusNotFound {
|
||||
t.Errorf("dual-wrapped err → %d, want 404 (ErrNotFound must dispatch before ErrForbidden)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestErrToStatus_ForbiddenBeforeValidation guards the M-003 self-approval
|
||||
// gate against a future call site that double-wraps ErrSelfApproval under
|
||||
// ErrValidation (intentionally or accidentally). The dispatch must pick
|
||||
// 403, not 400.
|
||||
func TestErrToStatus_ForbiddenBeforeValidation(t *testing.T) {
|
||||
doubled := fmt.Errorf("%w: %w", service.ErrSelfApproval, service.ErrValidation)
|
||||
if got := errToStatus(doubled); got != http.StatusForbidden {
|
||||
t.Errorf("double-wrapped err → %d, want 403 (ErrForbidden must dispatch before ErrValidation — M-003 gate)", got)
|
||||
}
|
||||
}
|
||||
@@ -46,26 +46,12 @@ func (h ExportHandler) ExportPEM(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
result, err := h.svc.ExportPEM(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1 (P2): dispatch routes through errToStatus. Pre-M-1 this branch
|
||||
// classified 404 via strings.Contains(err.Error(), "not found"), which
|
||||
// gave false positives on any error whose rendered text happened to
|
||||
// contain "not found" — notably a transient DB failure when the service
|
||||
// layer wrapped every certRepo.Get error with "certificate not found".
|
||||
// Post-M-1: service/export.go now wraps with "failed to get certificate"
|
||||
// and only the genuine sql.ErrNoRows path surfaces repository.ErrNotFound
|
||||
// through the wrap chain, so errors.Is(err, repository.ErrNotFound) picks
|
||||
// up the real 404s and everything else — including transient DB errors —
|
||||
// correctly surfaces as 500 with server-side slog.Error capture (F-002
|
||||
// redacted-500 pattern preserved).
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("ExportPEM failed", "cert_id", id, "error", err.Error())
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
msg := "Failed to export certificate"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Certificate not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
slog.Error("ExportPEM failed", "cert_id", id, "error", err.Error())
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to export certificate", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -108,32 +94,16 @@ func (h ExportHandler) ExportPKCS12(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
pfxData, err := h.svc.ExportPKCS12(r.Context(), id, req.Password)
|
||||
if err != nil {
|
||||
// M-1 (P2): dispatch routes through errToStatus. The pre-M-1 3-term
|
||||
// substring net (`"not found"|"cannot be parsed"|"no certificates
|
||||
// found"`) is replaced with sentinel dispatch:
|
||||
// - repository.ErrNotFound (from certificate.go Get/GetLatestVersion
|
||||
// sql.ErrNoRows wrap) → 404
|
||||
// - service.ErrUnprocessable (from service/export.go ExportPKCS12's
|
||||
// parsePEMCertificates-failure and empty-chain wraps) → 422 —
|
||||
// semantically correct because the caller's request is fine; our
|
||||
// stored PEM chain is what cannot be processed
|
||||
// - everything else → 500 with slog.Error capture (F-002 redacted-500
|
||||
// pattern preserved)
|
||||
// A transient DB failure that pre-M-1 would have been swept into the
|
||||
// 404 substring branch (because the service wrapped every certRepo.Get
|
||||
// error with "certificate not found") now correctly surfaces as 500.
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("ExportPKCS12 failed", "cert_id", id, "error", err.Error())
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Certificate not found", requestID)
|
||||
return
|
||||
}
|
||||
msg := "Failed to export PKCS#12"
|
||||
switch status {
|
||||
case http.StatusNotFound:
|
||||
msg = "Certificate not found"
|
||||
case http.StatusUnprocessableEntity:
|
||||
msg = "Certificate data cannot be parsed as X.509"
|
||||
if strings.Contains(err.Error(), "cannot be parsed") || strings.Contains(err.Error(), "no certificates found") {
|
||||
ErrorWithRequestID(w, http.StatusUnprocessableEntity, "Certificate data cannot be parsed as X.509", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
slog.Error("ExportPKCS12 failed", "cert_id", id, "error", err.Error())
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to export PKCS#12", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -108,17 +108,9 @@ func TestExportPEM_Download(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportPEM_NotFound(t *testing.T) {
|
||||
// M-1 (P2): wrap with service.ErrNotFound via %w so the handler's
|
||||
// errToStatus choke point dispatches to 404 via errors.Is. Pre-M-1 this
|
||||
// test used a raw `fmt.Errorf("certificate not found")` string and relied
|
||||
// on the handler's strings.Contains(err.Error(), "not found") classifier
|
||||
// — which was the same mechanism that silently misclassified transient DB
|
||||
// failures whose text happened to include "not found" (see docblock on
|
||||
// ExportPEM handler). Pinning the sentinel contract makes this test
|
||||
// regression-proof against wrap-text changes.
|
||||
mockSvc := &MockExportService{
|
||||
ExportPEMFn: func(_ context.Context, _ string) (*service.ExportPEMResult, error) {
|
||||
return nil, fmt.Errorf("%w: certificate", service.ErrNotFound)
|
||||
return nil, fmt.Errorf("certificate not found")
|
||||
},
|
||||
}
|
||||
h := NewExportHandler(mockSvc)
|
||||
@@ -222,11 +214,9 @@ func TestExportPKCS12_EmptyPassword(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportPKCS12_NotFound(t *testing.T) {
|
||||
// M-1 (P2): same sentinel migration as TestExportPEM_NotFound — see
|
||||
// rationale there.
|
||||
mockSvc := &MockExportService{
|
||||
ExportPKCS12Fn: func(_ context.Context, _ string, _ string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("%w: certificate", service.ErrNotFound)
|
||||
return nil, fmt.Errorf("certificate not found")
|
||||
},
|
||||
}
|
||||
h := NewExportHandler(mockSvc)
|
||||
@@ -241,31 +231,6 @@ func TestExportPKCS12_NotFound(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestExportPKCS12_Unprocessable pins the M-1 (P2) 422 contract: when the
|
||||
// service layer wraps a parse failure with service.ErrUnprocessable, the
|
||||
// handler's errToStatus choke point must dispatch to 422 Unprocessable
|
||||
// Entity. Pre-M-1 this was classified via a 2-term substring net
|
||||
// (`"cannot be parsed"|"no certificates found"`) at export.go:101, which
|
||||
// would have been silently broken by a message reword in service/export.go.
|
||||
// The new sentinel makes the dispatch survive message rewording.
|
||||
func TestExportPKCS12_Unprocessable(t *testing.T) {
|
||||
mockSvc := &MockExportService{
|
||||
ExportPKCS12Fn: func(_ context.Context, _ string, _ string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("%w: certificate data cannot be parsed as X.509: asn1 decode error", service.ErrUnprocessable)
|
||||
},
|
||||
}
|
||||
h := NewExportHandler(mockSvc)
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/v1/certificates/mc-test-1/export/pkcs12", nil)
|
||||
w := httptest.NewRecorder()
|
||||
|
||||
h.ExportPKCS12(w, req)
|
||||
|
||||
if w.Code != http.StatusUnprocessableEntity {
|
||||
t.Fatalf("expected 422, got %d", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportPKCS12_ServiceError(t *testing.T) {
|
||||
mockSvc := &MockExportService{
|
||||
ExportPKCS12Fn: func(_ context.Context, _ string, _ string) ([]byte, error) {
|
||||
|
||||
@@ -7,8 +7,14 @@ import (
|
||||
)
|
||||
|
||||
// HealthHandler handles health and readiness check endpoints.
|
||||
//
|
||||
// G-1 (P1): AuthType is one of "api-key" or "none" — see
|
||||
// internal/config.AuthType / config.ValidAuthTypes() for the typed
|
||||
// constants and the rationale for dropping "jwt" (no JWT middleware
|
||||
// ships with certctl; operators who need JWT/OIDC front certctl with
|
||||
// an authenticating gateway and set AuthType="none" on the upstream).
|
||||
type HealthHandler struct {
|
||||
AuthType string // "api-key", "jwt", "none"
|
||||
AuthType string // "api-key" or "none" (see config.AuthType constants)
|
||||
}
|
||||
|
||||
// NewHealthHandler creates a new HealthHandler.
|
||||
|
||||
@@ -162,30 +162,14 @@ func TestAuthInfo_ReturnsAuthType_None(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthInfo_ReturnsAuthType_JWT(t *testing.T) {
|
||||
handler := NewHealthHandler("jwt")
|
||||
|
||||
req, err := http.NewRequest(http.MethodGet, "/api/v1/auth/info", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("NewRequest failed: %v", err)
|
||||
}
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
handler.AuthInfo(w, req)
|
||||
|
||||
var result map[string]interface{}
|
||||
if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
if result["auth_type"] != "jwt" {
|
||||
t.Errorf("auth_type = %q, want jwt", result["auth_type"])
|
||||
}
|
||||
|
||||
if required, ok := result["required"].(bool); !ok || !required {
|
||||
t.Errorf("required = %v, want true", result["required"])
|
||||
}
|
||||
}
|
||||
// G-1 (P1): the prior `TestAuthInfo_ReturnsAuthType_JWT` asserted the
|
||||
// handler echoed "jwt" — using the silent-auth-downgrade value as a
|
||||
// test fixture, which baked the lie into the regression suite. The
|
||||
// test is removed because "jwt" is now rejected at config-load time
|
||||
// (see internal/config/config_test.go::TestValidate_JWTAuth_RejectedDedicated)
|
||||
// and never reaches this handler. The pre-existing
|
||||
// `TestAuthInfo_ReturnsAuthType_APIKey` above (line ~107) covers the
|
||||
// api-key happy path; nothing else needs replacing here.
|
||||
|
||||
func TestAuthCheck_ReturnsOK(t *testing.T) {
|
||||
handler := NewHealthHandler("api-key")
|
||||
|
||||
@@ -135,13 +135,16 @@ func (h IssuerHandler) CreateIssuer(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
created, err := h.svc.CreateIssuer(r.Context(), issuer)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
h.logger.Error("failed to create issuer", "error", err, "name", issuer.Name, "type", issuer.Type)
|
||||
msg = "internal error"
|
||||
h.logger.Error("failed to create issuer", "error", err, "name", issuer.Name, "type", issuer.Type)
|
||||
errMsg := err.Error()
|
||||
switch {
|
||||
case strings.Contains(errMsg, "unique") || strings.Contains(errMsg, "duplicate"):
|
||||
ErrorWithRequestID(w, http.StatusConflict, "An issuer with this name already exists", requestID)
|
||||
case strings.Contains(errMsg, "unsupported issuer type"):
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, errMsg, requestID)
|
||||
default:
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to create issuer", requestID)
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -174,13 +177,16 @@ func (h IssuerHandler) UpdateIssuer(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
updated, err := h.svc.UpdateIssuer(r.Context(), id, issuer)
|
||||
if err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
h.logger.Error("failed to update issuer", "error", err, "id", id)
|
||||
msg = "internal error"
|
||||
h.logger.Error("failed to update issuer", "error", err, "id", id)
|
||||
errMsg := err.Error()
|
||||
switch {
|
||||
case strings.Contains(errMsg, "unique") || strings.Contains(errMsg, "duplicate"):
|
||||
ErrorWithRequestID(w, http.StatusConflict, "An issuer with this name already exists", requestID)
|
||||
case strings.Contains(errMsg, "not found"):
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Issuer not found", requestID)
|
||||
default:
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to update issuer", requestID)
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -204,13 +210,13 @@ func (h IssuerHandler) DeleteIssuer(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
if err := h.svc.DeleteIssuer(r.Context(), id); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
h.logger.Error("DeleteIssuer failed", "issuer_id", id, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "violates foreign key") || strings.Contains(err.Error(), "RESTRICT") {
|
||||
ErrorWithRequestID(w, http.StatusConflict, "Cannot delete issuer: certificates are still using this issuer", requestID)
|
||||
} else if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Issuer not found", requestID)
|
||||
} else {
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to delete issuer", requestID)
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -3,14 +3,15 @@ package handler
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/api/middleware"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// JobService defines the service interface for job operations.
|
||||
@@ -159,13 +160,22 @@ func (h JobHandler) ApproveJob(w http.ResponseWriter, r *http.Request) {
|
||||
actor := resolveActor(r.Context())
|
||||
|
||||
if err := h.svc.ApproveJob(r.Context(), jobID, actor); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("ApproveJob failed", "job_id", jobID, "error", err)
|
||||
msg = "internal error"
|
||||
// M-003: self-approval by the certificate owner is forbidden.
|
||||
if errors.Is(err, service.ErrSelfApproval) {
|
||||
ErrorWithRequestID(w, http.StatusForbidden,
|
||||
"Self-approval is forbidden: the certificate owner cannot approve their own renewal",
|
||||
requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Job not found", requestID)
|
||||
return
|
||||
}
|
||||
if strings.Contains(err.Error(), "cannot approve") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to approve job", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -203,13 +213,15 @@ func (h JobHandler) RejectJob(w http.ResponseWriter, r *http.Request) {
|
||||
actor := resolveActor(r.Context())
|
||||
|
||||
if err := h.svc.RejectJob(r.Context(), jobID, body.Reason, actor); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("RejectJob failed", "job_id", jobID, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Job not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
if strings.Contains(err.Error(), "cannot reject") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to reject job", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -108,25 +107,7 @@ func (h NotificationHandler) GetNotification(w http.ResponseWriter, r *http.Requ
|
||||
|
||||
notification, err := h.svc.GetNotification(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1 (P2): dispatch routes through errToStatus. Pre-M-1 this branch was
|
||||
// a blanket `any error → 404 Notification not found` shortcut — which
|
||||
// silently demoted transient DB failures from the service's List() wrap
|
||||
// (notification.go:386 pre-M-1) to 404 Not Found with no observable
|
||||
// external signal. Post-M-1: service/notification.go GetNotification only
|
||||
// wraps the genuine "not found" path with service.ErrNotFound via %w, and
|
||||
// every other error (including the List() wrap) surfaces without that
|
||||
// sentinel — so errors.Is(err, service.ErrNotFound) picks up the real
|
||||
// 404s and everything else correctly surfaces as 500 with server-side
|
||||
// slog.Error capture (F-002 redacted-500 pattern preserved).
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("GetNotification failed", "notification_id", id, "error", err.Error())
|
||||
}
|
||||
msg := "Failed to get notification"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Notification not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Notification not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -189,26 +170,11 @@ func (h NotificationHandler) RequeueNotification(w http.ResponseWriter, r *http.
|
||||
notificationID := parts[0]
|
||||
|
||||
if err := h.svc.RequeueNotification(r.Context(), notificationID); err != nil {
|
||||
// M-1 (P2): dispatch routes through errToStatus. Pre-M-1 this branch
|
||||
// classified 404 via strings.Contains(err.Error(), "not found"), which
|
||||
// would have given false positives on any error whose rendered text
|
||||
// happened to contain "not found" — notably a transient DB failure whose
|
||||
// driver message mentioned a missing relation or column. Post-M-1: the
|
||||
// repo-layer Requeue (postgres/notification.go) wraps zero-rows-affected
|
||||
// with repository.ErrNotFound via %w, and the service layer forwards
|
||||
// that error with %w — so errors.Is(err, repository.ErrNotFound) walks
|
||||
// the full wrap chain and picks up the real 404s while everything else
|
||||
// (including transient DB errors) correctly surfaces as 500 with
|
||||
// server-side slog.Error capture (F-002 redacted-500 pattern preserved).
|
||||
status := errToStatus(err)
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("RequeueNotification failed", "notification_id", notificationID, "error", err.Error())
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Notification not found", requestID)
|
||||
return
|
||||
}
|
||||
msg := "Failed to requeue notification"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Notification not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to requeue notification", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ package handler
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -185,13 +184,13 @@ func (h OwnerHandler) DeleteOwner(w http.ResponseWriter, r *http.Request) {
|
||||
id = parts[0]
|
||||
|
||||
if err := h.svc.DeleteOwner(r.Context(), id); err != nil {
|
||||
status := errToStatus(err)
|
||||
msg := err.Error()
|
||||
if status == http.StatusInternalServerError {
|
||||
slog.Error("DeleteOwner failed", "owner_id", id, "error", err)
|
||||
msg = "internal error"
|
||||
if strings.Contains(err.Error(), "violates foreign key") || strings.Contains(err.Error(), "RESTRICT") {
|
||||
ErrorWithRequestID(w, http.StatusConflict, "Cannot delete owner: certificates are still assigned to this owner", requestID)
|
||||
} else if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Owner not found", requestID)
|
||||
} else {
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to delete owner", requestID)
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -3,14 +3,12 @@ package handler
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/api/middleware"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// ProfileService defines the service interface for certificate profile operations.
|
||||
@@ -23,20 +21,6 @@ type ProfileService interface {
|
||||
}
|
||||
|
||||
// ProfileHandler handles HTTP requests for certificate profile operations.
|
||||
//
|
||||
// Error dispatch (post-M-1): every service error routes through the [errToStatus]
|
||||
// choke point via `errors.Is` walking the wrap chain, with one explicit
|
||||
// [service.ErrValidation] arm on the write paths (Create, Update) so the
|
||||
// composed "validation: <field-specific reason>" message the service layer
|
||||
// attaches via `fmt.Errorf("%w: ...", ErrValidation)` can be passed through to
|
||||
// the 400 response body. Before M-1, the Create and Update handlers branched on
|
||||
// `strings.Contains(err.Error(), "invalid"|"required"|"must be"|"cannot")` — a
|
||||
// fragile pattern where a single reword in [service.validateProfile] would
|
||||
// demote the 400 to 500 with no compile-time signal. The substring-based 404
|
||||
// branches on Update and Delete likewise depended on the repository's
|
||||
// human-readable "profile not found" message surviving forever; both now ride
|
||||
// the same [repository.ErrNotFound] wrap that G-1's renewal-policy and M-1's
|
||||
// other repositories use.
|
||||
type ProfileHandler struct {
|
||||
svc ProfileService
|
||||
}
|
||||
@@ -104,18 +88,7 @@ func (h ProfileHandler) GetProfile(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
profile, err := h.svc.GetProfile(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1: route through errToStatus so a repo-level `sql.ErrNoRows`
|
||||
// (wrapped as repository.ErrNotFound) becomes 404, but a transient DB
|
||||
// failure no longer masquerades as 404 — it correctly surfaces 500. The
|
||||
// pre-M-1 "any error → 404" shortcut was plausible when Get's only
|
||||
// expected failure was "not found", but the choke point now gives us
|
||||
// correct dispatch for free. Mirrors GetRenewalPolicy.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to get profile"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Profile not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Profile not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -150,15 +123,9 @@ func (h ProfileHandler) CreateProfile(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
created, err := h.svc.CreateProfile(r.Context(), profile)
|
||||
if err != nil {
|
||||
// M-1: replace the 4-term substring net
|
||||
// (`"invalid"|"required"|"must be"|"cannot"`) with a single
|
||||
// `errors.Is(err, service.ErrValidation)` arm. validateProfile wraps
|
||||
// every field-specific failure via `fmt.Errorf("%w: <reason>",
|
||||
// ErrValidation)`, so `err.Error()` still contains the human-readable
|
||||
// reason (e.g., "RSA minimum key size must be at least 2048") and can be
|
||||
// safely passed to the 400 body — but the status decision no longer
|
||||
// depends on the exact wording. Other errors redact to a generic 500.
|
||||
if errors.Is(err, service.ErrValidation) {
|
||||
// Check if it's a validation error from the service
|
||||
if strings.Contains(err.Error(), "invalid") || strings.Contains(err.Error(), "required") ||
|
||||
strings.Contains(err.Error(), "must be") || strings.Contains(err.Error(), "cannot") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
return
|
||||
}
|
||||
@@ -195,21 +162,16 @@ func (h ProfileHandler) UpdateProfile(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
updated, err := h.svc.UpdateProfile(r.Context(), id, profile)
|
||||
if err != nil {
|
||||
// M-1: explicit ErrValidation arm preserves the user-facing reason in
|
||||
// the 400 body (validateProfile wraps every failure via
|
||||
// `fmt.Errorf("%w: ...", ErrValidation)`); every other error — including
|
||||
// repo-layer ErrNotFound on a missing row — routes through errToStatus
|
||||
// so the 404/500 decision no longer depends on substring matching.
|
||||
if errors.Is(err, service.ErrValidation) {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Profile not found", requestID)
|
||||
return
|
||||
}
|
||||
if strings.Contains(err.Error(), "invalid") || strings.Contains(err.Error(), "required") ||
|
||||
strings.Contains(err.Error(), "must be") || strings.Contains(err.Error(), "cannot") {
|
||||
ErrorWithRequestID(w, http.StatusBadRequest, err.Error(), requestID)
|
||||
return
|
||||
}
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to update profile"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Profile not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to update profile", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -233,14 +195,11 @@ func (h ProfileHandler) DeleteProfile(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
if err := h.svc.DeleteProfile(r.Context(), id); err != nil {
|
||||
// M-1: sentinel dispatch replaces the substring 404 check — see the
|
||||
// parallel comment block in UpdateProfile for the rationale.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to delete profile"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Profile not found"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Profile not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to delete profile", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -26,28 +26,14 @@ type RenewalPolicyService interface {
|
||||
|
||||
// RenewalPolicyHandler serves /api/v1/renewal-policies CRUD endpoints.
|
||||
//
|
||||
// Error dispatch (post-M-1): every service error routes through the [errToStatus]
|
||||
// choke point via `errors.Is` walking the wrap chain. Three sentinel identities
|
||||
// cover the full dispatch surface:
|
||||
//
|
||||
// - [service.ErrRenewalPolicyDuplicateName] / [service.ErrRenewalPolicyInUse]
|
||||
// are `var`-aliased to the repository-layer sentinels of the same name (G-1),
|
||||
// so handler-side `errors.Is` succeeds against a sentinel raised three layers
|
||||
// deep in [internal/repository/postgres.RenewalPolicyRepository] without the
|
||||
// service layer having to translate. [errToStatus] routes both to 409.
|
||||
// - [repository.ErrNotFound] is wrapped around `sql.ErrNoRows` inside the
|
||||
// repo's Get/Update/Delete methods via `fmt.Errorf("%w: renewal policy %s",
|
||||
// repository.ErrNotFound, id)` (M-1). [errToStatus] routes that to 404 in
|
||||
// the same switch arm as [service.ErrNotFound], preserving the existing
|
||||
// 404-on-missing behavior that the pre-M-1 substring check provided —
|
||||
// without the rewording-regression risk that motivated the migration.
|
||||
//
|
||||
// The handler layer keeps two explicit `errors.Is` arms for the
|
||||
// duplicate-name / in-use sentinels so each 409 response can carry a
|
||||
// constraint-specific human-readable message ("with that name" vs. "still
|
||||
// referenced by managed certificates"); every other error path — including
|
||||
// not-found — delegates the status decision to [errToStatus] and provides a
|
||||
// generic body via the F-002 redacted-500 pattern.
|
||||
// G-1 design note: the service-level `ErrRenewalPolicyDuplicateName` /
|
||||
// `ErrRenewalPolicyInUse` sentinels alias the repository sentinels (same var
|
||||
// identity), so `errors.Is` walks transparently across layers. Delete/Update
|
||||
// not-found detection intentionally uses a `strings.Contains(err.Error(),
|
||||
// "not found")` substring check — the repo wraps `sql.ErrNoRows` as
|
||||
// `fmt.Errorf("renewal policy not found: %s", id)` which strips the sentinel,
|
||||
// and the handler red-tests' `ErrMockNotFound = errors.New("mock not found
|
||||
// error")` follows the same substring convention.
|
||||
type RenewalPolicyHandler struct {
|
||||
svc RenewalPolicyService
|
||||
}
|
||||
@@ -119,18 +105,11 @@ func (h RenewalPolicyHandler) GetRenewalPolicy(w http.ResponseWriter, r *http.Re
|
||||
|
||||
policy, err := h.svc.GetRenewalPolicy(r.Context(), id)
|
||||
if err != nil {
|
||||
// M-1: route through errToStatus so a repo-level `sql.ErrNoRows`
|
||||
// (wrapped as repository.ErrNotFound) becomes 404, but a transient DB
|
||||
// failure no longer masquerades as 404 — it correctly surfaces 500.
|
||||
// The pre-M-1 "any error → 404" shortcut was plausible when Get's only
|
||||
// expected failure was "not found", but the choke point now gives us
|
||||
// correct dispatch for free.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to get renewal policy"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Renewal policy not found"
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
// Matches the PolicyHandler.GetPolicy convention: any error from the
|
||||
// service surfaces as 404. The repo wraps sql.ErrNoRows as
|
||||
// "renewal policy not found: %s" and there's no other expected failure
|
||||
// mode on Get — the caller gets a clean 404.
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Renewal policy not found", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -179,11 +158,11 @@ func (h RenewalPolicyHandler) CreateRenewalPolicy(w http.ResponseWriter, r *http
|
||||
// UpdateRenewalPolicy replaces the fields of an existing renewal policy.
|
||||
// PUT /api/v1/renewal-policies/{id}
|
||||
//
|
||||
// Error mapping (post-M-1, sentinel-driven):
|
||||
// - invalid JSON / empty ID → 400
|
||||
// - ErrRenewalPolicyDuplicateName (pg 23505) → 409 (explicit arm, custom msg)
|
||||
// - ErrNotFound (wrapping sql.ErrNoRows) → 404 (via errToStatus)
|
||||
// - anything else → 500 (via errToStatus, body redacted)
|
||||
// Error mapping:
|
||||
// - invalid JSON / empty ID → 400
|
||||
// - ErrRenewalPolicyDuplicateName → 409
|
||||
// - error text contains "not found" → 404 (see struct doc comment re: substring check)
|
||||
// - anything else → 500
|
||||
func (h RenewalPolicyHandler) UpdateRenewalPolicy(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPut {
|
||||
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
||||
@@ -212,17 +191,11 @@ func (h RenewalPolicyHandler) UpdateRenewalPolicy(w http.ResponseWriter, r *http
|
||||
ErrorWithRequestID(w, http.StatusConflict, "A renewal policy with that name already exists", requestID)
|
||||
return
|
||||
}
|
||||
// M-1: drop the `strings.Contains(err.Error(), "not found")` branch.
|
||||
// [repository.ErrNotFound] now wraps sql.ErrNoRows at the three
|
||||
// renewal-policy repo methods (Get/Update/Delete), so errToStatus
|
||||
// routes a missing row to 404 via errors.Is without depending on the
|
||||
// repo's fmt.Errorf format string surviving a future reword.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to update renewal policy"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Renewal policy not found"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Renewal policy not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to update renewal policy", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -232,11 +205,11 @@ func (h RenewalPolicyHandler) UpdateRenewalPolicy(w http.ResponseWriter, r *http
|
||||
// DeleteRenewalPolicy removes a renewal policy.
|
||||
// DELETE /api/v1/renewal-policies/{id}
|
||||
//
|
||||
// Error mapping (post-M-1, sentinel-driven):
|
||||
// - empty ID (trailing slash) → 400
|
||||
// - ErrRenewalPolicyInUse (pg 23503 FK-RESTRICT) → 409 (explicit arm, custom msg)
|
||||
// - ErrNotFound (wrapping sql.ErrNoRows) → 404 (via errToStatus)
|
||||
// - anything else → 500 (via errToStatus, body redacted)
|
||||
// Error mapping:
|
||||
// - empty ID (trailing slash) → 400
|
||||
// - ErrRenewalPolicyInUse (pg 23503 FK-RESTRICT against managed_certificates.renewal_policy_id) → 409
|
||||
// - error text contains "not found" → 404
|
||||
// - anything else → 500
|
||||
func (h RenewalPolicyHandler) DeleteRenewalPolicy(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodDelete {
|
||||
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
||||
@@ -258,14 +231,11 @@ func (h RenewalPolicyHandler) DeleteRenewalPolicy(w http.ResponseWriter, r *http
|
||||
ErrorWithRequestID(w, http.StatusConflict, "Renewal policy is still referenced by managed certificates", requestID)
|
||||
return
|
||||
}
|
||||
// M-1: sentinel dispatch replaces the substring check — see the
|
||||
// parallel comment block in UpdateRenewalPolicy for the rationale.
|
||||
status := errToStatus(err)
|
||||
msg := "Failed to delete renewal policy"
|
||||
if status == http.StatusNotFound {
|
||||
msg = "Renewal policy not found"
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
ErrorWithRequestID(w, http.StatusNotFound, "Renewal policy not found", requestID)
|
||||
return
|
||||
}
|
||||
ErrorWithRequestID(w, status, msg, requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to delete renewal policy", requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"encoding/asn1"
|
||||
"encoding/base64"
|
||||
"encoding/pem"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -15,7 +14,6 @@ import (
|
||||
"github.com/shankar0123/certctl/internal/api/middleware"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/pkcs7"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// SCEPService defines the service interface for SCEP enrollment operations.
|
||||
@@ -173,25 +171,11 @@ func (h SCEPHandler) pkiOperation(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
result, err := h.svc.PKCSReq(r.Context(), csrPEM, challengePassword, transactionID)
|
||||
if err != nil {
|
||||
// M-1 (P2): typed-sentinel dispatch replaces the pre-M-1 substring
|
||||
// branch `strings.Contains(err.Error(), "challenge password")`. The
|
||||
// service layer now wraps both challenge-password failure modes (server
|
||||
// misconfigured / client credential wrong) via `fmt.Errorf("%w: ...",
|
||||
// ErrUnauthenticated)`, so errors.Is walks the wrap chain without
|
||||
// depending on the exact wording of the error string. This also
|
||||
// corrects the HTTP status: pre-M-1 returned 403 Forbidden, but RFC
|
||||
// 7235 classifies this as 401 Unauthorized (authentication failure, not
|
||||
// authorization denial). The errToStatus doc block enumerates this as
|
||||
// the canonical ErrUnauthenticated call site.
|
||||
if errors.Is(err, service.ErrUnauthenticated) {
|
||||
ErrorWithRequestID(w, http.StatusUnauthorized, "Invalid challenge password", requestID)
|
||||
if strings.Contains(err.Error(), "challenge password") {
|
||||
ErrorWithRequestID(w, http.StatusForbidden, "Invalid challenge password", requestID)
|
||||
return
|
||||
}
|
||||
// F-002 redacted-500: every other enrollment failure (CSR parse errors,
|
||||
// issuer-layer failures, audit-layer failures) returns an opaque body
|
||||
// so we don't leak internal state through the SCEP response. The
|
||||
// logger already captured the real error at the service layer.
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Enrollment failed", requestID)
|
||||
ErrorWithRequestID(w, http.StatusInternalServerError, fmt.Sprintf("Enrollment failed: %v", err), requestID)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -4,14 +4,12 @@ import (
|
||||
"context"
|
||||
"encoding/pem"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// mockSCEPService implements SCEPService for testing.
|
||||
@@ -216,21 +214,9 @@ func TestSCEP_PKIOperation_Success_RawCSR(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSCEP_PKIOperation_ChallengePasswordRejected pins the M-1 (P2) dispatch
|
||||
// contract: when the service wraps the failure via `fmt.Errorf("%w: ...",
|
||||
// service.ErrUnauthenticated)` the handler's errToStatus choke point must
|
||||
// return 401 Unauthorized, NOT 403 Forbidden.
|
||||
//
|
||||
// This is a deliberate semantic correction. Pre-M-1 the handler inspected
|
||||
// err.Error() for the "challenge password" substring and returned 403, which
|
||||
// misclassified the RFC 7235 condition (the caller presented no valid
|
||||
// application-layer credential — that is auth failure, not authorization
|
||||
// denial). The errToStatus doc explicitly cites this code path as the
|
||||
// canonical ErrUnauthenticated consumer; see handler/errors.go and the
|
||||
// symmetric M-1 comment block at handler/scep.go in the pkiOperation arm.
|
||||
func TestSCEP_PKIOperation_ChallengePasswordRejected(t *testing.T) {
|
||||
svc := &mockSCEPService{
|
||||
EnrollErr: fmt.Errorf("%w: invalid challenge password", service.ErrUnauthenticated),
|
||||
EnrollErr: errors.New("invalid challenge password"),
|
||||
}
|
||||
h := NewSCEPHandler(svc)
|
||||
|
||||
@@ -244,8 +230,8 @@ func TestSCEP_PKIOperation_ChallengePasswordRejected(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
h.HandleSCEP(w, req)
|
||||
|
||||
if w.Code != http.StatusUnauthorized {
|
||||
t.Errorf("expected 401 Unauthorized (M-1 dispatch of service.ErrUnauthenticated), got %d: %s", w.Code, w.Body.String())
|
||||
if w.Code != http.StatusForbidden {
|
||||
t.Errorf("expected 403, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,27 +1,11 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
import "errors"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/service"
|
||||
)
|
||||
|
||||
// Mock errors for testing.
|
||||
//
|
||||
// M-1: Since the handler layer now classifies errors via the typed-sentinel
|
||||
// dispatch in [errToStatus] (errors.Is on service + repository sentinels rather
|
||||
// than substring matching on err.Error()), handler mocks MUST wrap the
|
||||
// appropriate generic sentinel so `errors.Is(err, service.ErrNotFound)` etc.
|
||||
// succeed. Using raw errors.New() breaks the dispatch and degrades every
|
||||
// mock-driven negative-path test to a 500 Internal Server Error — the same
|
||||
// silent-regression trap the migration was designed to eliminate.
|
||||
//
|
||||
// ErrMockServiceFailed deliberately stays untyped so it continues to exercise
|
||||
// the default 500 path.
|
||||
var (
|
||||
// Mock errors for testing
|
||||
ErrMockServiceFailed = errors.New("mock service error")
|
||||
ErrMockNotFound = fmt.Errorf("%w: mock not found", service.ErrNotFound)
|
||||
ErrMockUnauthorized = fmt.Errorf("%w: mock unauthenticated", service.ErrUnauthenticated)
|
||||
ErrMockConflict = fmt.Errorf("%w: mock conflict", service.ErrConflict)
|
||||
ErrMockNotFound = errors.New("mock not found error")
|
||||
ErrMockUnauthorized = errors.New("mock unauthorized error")
|
||||
ErrMockConflict = errors.New("mock conflict error")
|
||||
)
|
||||
|
||||
@@ -117,8 +117,14 @@ func HashAPIKey(key string) string {
|
||||
}
|
||||
|
||||
// AuthConfig holds configuration for the Auth middleware.
|
||||
//
|
||||
// G-1 (P1): valid Type values are "api-key" or "none" only. "jwt" was
|
||||
// removed because no JWT middleware ships with certctl (silent auth
|
||||
// downgrade pre-G-1). The single source of truth for the allowed set
|
||||
// lives at internal/config.AuthType / config.ValidAuthTypes() — prefer
|
||||
// those constants over string literals when comparing.
|
||||
type AuthConfig struct {
|
||||
Type string // "api-key", "jwt", "none"
|
||||
Type string // "api-key" or "none" (see config.AuthType constants)
|
||||
Secret string // The raw API key or comma-separated list of valid API keys
|
||||
}
|
||||
|
||||
|
||||
+80
-12
@@ -802,13 +802,59 @@ type NamedAPIKey struct {
|
||||
Admin bool
|
||||
}
|
||||
|
||||
// AuthType is the discriminator for the API auth middleware shape. The
|
||||
// string alias preserves env-var roundtrip (the value flows through getEnv
|
||||
// as a plain string) while giving us a typed surface for switches and
|
||||
// validation. Use the named constants below rather than string literals
|
||||
// so future enum additions/removals are caught at compile time.
|
||||
//
|
||||
// G-1 (P1): the pre-G-1 validAuthTypes map literal accepted "jwt" with no
|
||||
// JWT middleware behind it (silent auth downgrade — the configured type
|
||||
// was logged as "jwt" but every request routed through the api-key bearer
|
||||
// middleware regardless). Operators who set CERTCTL_AUTH_TYPE=jwt thought
|
||||
// they had JWT auth; they didn't. The typed alias + ValidAuthTypes()
|
||||
// helper make the allowed set the single source of truth across config
|
||||
// validation, the runtime defense-in-depth switch in main.go, and the
|
||||
// helm-chart template guard (`certctl.validateAuthType`).
|
||||
type AuthType string
|
||||
|
||||
const (
|
||||
// AuthTypeAPIKey routes requests through the api-key bearer middleware.
|
||||
// CERTCTL_AUTH_SECRET (or CERTCTL_API_KEYS_NAMED) is required.
|
||||
AuthTypeAPIKey AuthType = "api-key"
|
||||
|
||||
// AuthTypeNone disables authentication entirely. Development only —
|
||||
// the server logs a loud Warn at startup. Operators who need
|
||||
// JWT/OIDC/mTLS run an authenticating gateway (oauth2-proxy / Envoy
|
||||
// ext_authz / Traefik ForwardAuth / Pomerium) in front of certctl
|
||||
// and set this value on the upstream certctl process. See
|
||||
// docs/architecture.md "Authenticating-gateway pattern".
|
||||
AuthTypeNone AuthType = "none"
|
||||
)
|
||||
|
||||
// ValidAuthTypes returns the allowed CERTCTL_AUTH_TYPE values. The set is
|
||||
// intentionally narrow — JWT was accepted pre-G-1 with no middleware
|
||||
// implementation behind it. Single source of truth referenced by the
|
||||
// validator below, the runtime guard in cmd/server/main.go, the helm
|
||||
// chart template (`certctl.validateAuthType`), and the property test in
|
||||
// config_test.go that pins "jwt" out of the slice forever.
|
||||
func ValidAuthTypes() []AuthType {
|
||||
return []AuthType{AuthTypeAPIKey, AuthTypeNone}
|
||||
}
|
||||
|
||||
// AuthConfig contains authentication configuration.
|
||||
type AuthConfig struct {
|
||||
// Type sets the authentication mechanism for the REST API.
|
||||
// Valid values: "api-key" (default, production), "jwt", "none" (development only).
|
||||
// When "api-key", clients must provide Authorization: Bearer <key> header.
|
||||
// "none" requires explicit opt-in via CERTCTL_AUTH_TYPE env var with warning logged.
|
||||
// Valid values: "api-key" (default, production) and "none" (development
|
||||
// only — disables authentication on the API and logs a loud Warn at
|
||||
// startup). For JWT/OIDC, run an authenticating gateway (oauth2-proxy /
|
||||
// Envoy / Traefik ForwardAuth / Pomerium) in front of certctl and set
|
||||
// CERTCTL_AUTH_TYPE=none on the upstream — see docs/architecture.md
|
||||
// "Authenticating-gateway pattern" and docs/upgrade-to-v2-jwt-removal.md.
|
||||
// Setting: CERTCTL_AUTH_TYPE environment variable. Default: "api-key".
|
||||
// Use the AuthType constants (AuthTypeAPIKey / AuthTypeNone) for typed
|
||||
// comparisons; the field stays `string` to preserve env-var roundtrip
|
||||
// shape used by getEnv() and downstream Helm/compose interpolation.
|
||||
Type string
|
||||
|
||||
// Secret is the legacy authentication secret (comma-separated API keys).
|
||||
@@ -1148,18 +1194,40 @@ func (c *Config) Validate() error {
|
||||
return fmt.Errorf("invalid log format: %s", c.Log.Format)
|
||||
}
|
||||
|
||||
// Validate auth type
|
||||
validAuthTypes := map[string]bool{
|
||||
"api-key": true,
|
||||
"jwt": true,
|
||||
"none": true,
|
||||
// Validate auth type.
|
||||
//
|
||||
// G-1 (P1): the pre-G-1 set was {"api-key", "jwt", "none"} with "jwt"
|
||||
// accepted but no JWT middleware shipped — silent auth downgrade.
|
||||
// Post-G-1 we route a literal "jwt" value through a dedicated
|
||||
// rejection that gives operators actionable guidance (the
|
||||
// authenticating-gateway pattern) instead of the generic
|
||||
// "invalid auth type". Then we cross-check against ValidAuthTypes()
|
||||
// so any value outside {api-key, none} surfaces uniformly.
|
||||
if c.Auth.Type == "jwt" {
|
||||
return fmt.Errorf(
|
||||
"CERTCTL_AUTH_TYPE=jwt is no longer accepted (G-1 silent auth " +
|
||||
"downgrade): no JWT middleware ships with certctl. To use " +
|
||||
"JWT/OIDC, run an authenticating gateway (oauth2-proxy / " +
|
||||
"Envoy ext_authz / Traefik ForwardAuth / Pomerium) in " +
|
||||
"front of certctl and set CERTCTL_AUTH_TYPE=none on the " +
|
||||
"upstream. See docs/architecture.md \"Authenticating-" +
|
||||
"gateway pattern\" and docs/upgrade-to-v2-jwt-removal.md " +
|
||||
"for the migration walkthrough")
|
||||
}
|
||||
if !validAuthTypes[c.Auth.Type] {
|
||||
return fmt.Errorf("invalid auth type: %s", c.Auth.Type)
|
||||
authTypeValid := false
|
||||
for _, t := range ValidAuthTypes() {
|
||||
if AuthType(c.Auth.Type) == t {
|
||||
authTypeValid = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !authTypeValid {
|
||||
return fmt.Errorf("invalid auth type: %s (valid: %v)", c.Auth.Type, ValidAuthTypes())
|
||||
}
|
||||
|
||||
// If using JWT or API-key, secret is required
|
||||
if (c.Auth.Type == "jwt" || c.Auth.Type == "api-key") && c.Auth.Secret == "" {
|
||||
// If using API-key, secret is required. (Secret was previously also
|
||||
// required for "jwt"; removed with the jwt rejection above.)
|
||||
if c.Auth.Type == string(AuthTypeAPIKey) && c.Auth.Secret == "" {
|
||||
return fmt.Errorf("auth secret is required for auth type %s", c.Auth.Type)
|
||||
}
|
||||
|
||||
|
||||
+149
-15
@@ -458,6 +458,8 @@ func TestValidate_InvalidAuthType(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -477,6 +479,8 @@ func TestValidate_APIKeyAuth_MissingSecret(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -484,25 +488,133 @@ func TestValidate_APIKeyAuth_MissingSecret(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidate_JWTAuth_MissingSecret(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: "jwt", Secret: ""},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
},
|
||||
// TestValidate_JWTAuth_RejectedDedicated locks down the G-1 fix: pre-G-1
|
||||
// `CERTCTL_AUTH_TYPE=jwt` was accepted by the validator (the bare error
|
||||
// path was the empty-secret one previously). Post-G-1 the literal "jwt"
|
||||
// value is rejected with a dedicated diagnostic regardless of whether
|
||||
// Secret is set, because there is no JWT middleware in the binary —
|
||||
// operators who need JWT/OIDC must front certctl with an authenticating
|
||||
// gateway.
|
||||
//
|
||||
// Two table rows pin the contract: missing-secret cannot paper over the
|
||||
// rejection (the dedicated error fires first, before the secret check),
|
||||
// and a populated secret also cannot paper over it. Both paths must
|
||||
// hit the dedicated G-1 diagnostic, not the generic "invalid auth
|
||||
// type" or "auth secret is required".
|
||||
func TestValidate_JWTAuth_RejectedDedicated(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
name string
|
||||
secret string
|
||||
}{
|
||||
{"jwt rejected (no secret)", ""},
|
||||
{"jwt rejected (with secret — operator can't paper over)", "anything"},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
t.Error("Validate() should return error when jwt auth has empty secret")
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: "jwt", Secret: tc.secret},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
if err == nil {
|
||||
t.Fatal("Validate() returned nil; expected dedicated G-1 rejection")
|
||||
}
|
||||
const wantSubstr = "CERTCTL_AUTH_TYPE=jwt is no longer accepted"
|
||||
if !strings.Contains(err.Error(), wantSubstr) {
|
||||
t.Errorf("Validate() = %v\nwant substring %q (the dedicated G-1 diagnostic)", err, wantSubstr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidAuthTypesDoesNotContainJWT is a property-level guard against
|
||||
// a future PR silently re-introducing "jwt" into the allowed set. If
|
||||
// someone adds JWT back to ValidAuthTypes(), this test fails immediately
|
||||
// with a pointer at the audit finding. The matching CI grep guardrail
|
||||
// in .github/workflows/ci.yml provides a secondary check at build time.
|
||||
func TestValidAuthTypesDoesNotContainJWT(t *testing.T) {
|
||||
t.Parallel()
|
||||
for _, at := range ValidAuthTypes() {
|
||||
if at == "jwt" {
|
||||
t.Fatalf("jwt is in ValidAuthTypes — silent auth downgrade regressed (G-1)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidAuthTypesIsExactly_APIKey_None pins the current allowed set.
|
||||
// If a future change adds a new auth type, this test must be updated
|
||||
// alongside the validator and the helm-chart `validateAuthType` helper —
|
||||
// keeping all three surfaces in sync.
|
||||
func TestValidAuthTypesIsExactly_APIKey_None(t *testing.T) {
|
||||
t.Parallel()
|
||||
got := ValidAuthTypes()
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("ValidAuthTypes() returned %d entries, want 2: %v", len(got), got)
|
||||
}
|
||||
want := map[AuthType]bool{AuthTypeAPIKey: true, AuthTypeNone: true}
|
||||
for _, at := range got {
|
||||
if !want[at] {
|
||||
t.Errorf("unexpected auth type in ValidAuthTypes: %q", at)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidate_GenericInvalidAuthType ensures that values outside the
|
||||
// allowed set (other than the special-cased "jwt") still surface the
|
||||
// generic "invalid auth type" error. Pins that the dedicated G-1
|
||||
// rejection didn't accidentally swallow non-jwt typos.
|
||||
func TestValidate_GenericInvalidAuthType(t *testing.T) {
|
||||
t.Parallel()
|
||||
for _, badType := range []string{"", "garbage", "oidc", "mtls", "API-KEY"} {
|
||||
t.Run("type="+badType, func(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||
Log: LogConfig{Level: "info", Format: "json"},
|
||||
Auth: AuthConfig{Type: badType, Secret: "x"},
|
||||
Keygen: KeygenConfig{Mode: "agent"},
|
||||
Scheduler: SchedulerConfig{
|
||||
RenewalCheckInterval: 1 * time.Hour,
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
if err == nil {
|
||||
t.Fatalf("Validate(type=%q) returned nil; expected invalid-auth-type rejection", badType)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "invalid auth type") {
|
||||
t.Errorf("Validate(type=%q) = %v; want \"invalid auth type\" error", badType, err)
|
||||
}
|
||||
if strings.Contains(err.Error(), "G-1 silent auth") {
|
||||
t.Errorf("Validate(type=%q) = %v; should not hit the dedicated G-1 path for non-jwt values", badType, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// G-1 (P1): no need to add `TestValidate_NoneAuth_AcceptsEmptySecret` or
|
||||
// `TestValidate_APIKeyAuth_RequiresSecret` here — the pre-existing tests
|
||||
// `TestValidate_AuthTypeNone` (above) and `TestValidate_APIKeyAuth_MissingSecret`
|
||||
// (above) already cover those paths. Documented for the next reader: the
|
||||
// G-1 fix flipped jwt off but did not disturb either the
|
||||
// none-bypasses-secret or the api-key-requires-secret behavior.
|
||||
|
||||
func TestValidate_InvalidKeygenMode(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: validServerConfig(t),
|
||||
@@ -515,6 +627,8 @@ func TestValidate_InvalidKeygenMode(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -544,6 +658,8 @@ func TestValidate_InvalidPort(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -574,6 +690,8 @@ func TestValidate_TLSCertPathEmpty(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -605,6 +723,8 @@ func TestValidate_TLSKeyPathEmpty(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -637,6 +757,8 @@ func TestValidate_TLSCertFileMissing(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -668,6 +790,8 @@ func TestValidate_TLSKeyFileMissing(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -701,6 +825,8 @@ func TestValidate_TLSMismatchedPair(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
err := cfg.Validate()
|
||||
@@ -724,6 +850,8 @@ func TestValidate_EmptyDatabaseURL(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -743,6 +871,8 @@ func TestValidate_InvalidLogLevel(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -762,6 +892,8 @@ func TestValidate_InvalidLogFormat(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
@@ -840,6 +972,8 @@ func TestValidate_DatabaseMaxConnectionsZero(t *testing.T) {
|
||||
JobProcessorInterval: 30 * time.Second,
|
||||
AgentHealthCheckInterval: 2 * time.Minute,
|
||||
NotificationProcessInterval: 1 * time.Minute,
|
||||
NotificationRetryInterval: 2 * time.Minute,
|
||||
RetryInterval: 5 * time.Minute,
|
||||
},
|
||||
}
|
||||
if err := cfg.Validate(); err == nil {
|
||||
|
||||
@@ -46,7 +46,26 @@ type Agent struct {
|
||||
Status AgentStatus `json:"status"`
|
||||
LastHeartbeatAt *time.Time `json:"last_heartbeat_at,omitempty"`
|
||||
RegisteredAt time.Time `json:"registered_at"`
|
||||
APIKeyHash string `json:"api_key_hash"`
|
||||
// APIKeyHash is the SHA-256 of the agent's plaintext API key,
|
||||
// populated by service.RegisterAgent (`hashAPIKey(apiKey)`) and
|
||||
// consumed by repository.AgentRepository::GetByAPIKey at auth time.
|
||||
// It is server-internal: never serialized to clients, never echoed
|
||||
// via CLI / MCP / agent registration response, never logged.
|
||||
//
|
||||
// G-2 (P1): pre-G-2 the field was tagged `json:"api_key_hash"` and
|
||||
// shipped on every /api/v1/agents response (cat-s5-apikey_leak). Even
|
||||
// SHA-256 should not be shipped to clients — it gives an offline
|
||||
// brute-force target if API-key entropy is low (certctl doesn't enforce
|
||||
// a minimum on operator-supplied keys), and there is no business reason
|
||||
// for any client to ever receive it. Post-G-2 the JSON tag is "-" and
|
||||
// Agent.MarshalJSON below zeroes the field on a copy before delegating
|
||||
// to the default marshal — defense in depth so a future tag-revert by
|
||||
// refactor cannot reopen the leak. The DB column, repo SELECT/INSERT/
|
||||
// UPDATE paths, and service-side hashing are unchanged. See
|
||||
// docs/architecture.md ER diagram (which documents DB shape, not API
|
||||
// shape) and coverage-gap-audit-2026-04-24-v5/unified-audit.md
|
||||
// cat-s5-apikey_leak for the full closure rationale.
|
||||
APIKeyHash string `json:"-"`
|
||||
OS string `json:"os"`
|
||||
Architecture string `json:"architecture"`
|
||||
IPAddress string `json:"ip_address"`
|
||||
@@ -60,6 +79,31 @@ type Agent struct {
|
||||
RetiredReason *string `json:"retired_reason,omitempty"`
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler. It explicitly zeros APIKeyHash
|
||||
// before serialization to defense-in-depth the `json:"-"` tag above.
|
||||
//
|
||||
// G-2 (P1): pre-G-2 the field was tagged `json:"api_key_hash"` and
|
||||
// shipped on every /api/v1/agents response (cat-s5-apikey_leak). Post-G-2
|
||||
// the tag is "-" and this method enforces redaction even if the tag is
|
||||
// reverted by a future refactor — the receiver is by-value so the
|
||||
// APIKeyHash = "" assignment mutates only the marshal-time copy, never
|
||||
// the caller's original. The type-alias trick (`type alias Agent`)
|
||||
// breaks the recursive MarshalJSON call that would otherwise stack-
|
||||
// overflow. Both *Agent and Agent receivers route through here because
|
||||
// the json package looks the method up via reflect.Value, and a value
|
||||
// receiver satisfies both kinds of pointer.
|
||||
//
|
||||
// Auditor's note for the next reader: do NOT remove this method even if
|
||||
// the json:"-" tag stays. The CI guardrail at .github/workflows/ci.yml
|
||||
// also blocks reintroduction at the tag site, but this method is the
|
||||
// last line of defense for serialization paths that bypass struct tags
|
||||
// (e.g., a future MarshalJSON on a parent struct that embeds Agent).
|
||||
func (a Agent) MarshalJSON() ([]byte, error) {
|
||||
type alias Agent // breaks recursion: alias has no MarshalJSON method
|
||||
a.APIKeyHash = ""
|
||||
return json.Marshal(alias(a))
|
||||
}
|
||||
|
||||
// IsRetired returns true when this agent has been soft-retired.
|
||||
// I-004: callers that iterate active agents (stats dashboard, stale-offline
|
||||
// sweeper, handler-facing list) must skip retired rows by default.
|
||||
|
||||
@@ -1,10 +1,22 @@
|
||||
package domain
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// jsonMarshalDirect / jsonUnmarshalDirect are thin aliases for the
|
||||
// stdlib encoding/json calls. They exist only to make the G-2 redaction
|
||||
// tests below grep-friendly: any future test author searching for "how
|
||||
// is APIKeyHash redaction tested" will land on these and the call sites,
|
||||
// rather than having to grep through dozens of unrelated json.Marshal
|
||||
// usages.
|
||||
func jsonMarshalDirect(v interface{}) ([]byte, error) { return json.Marshal(v) }
|
||||
func jsonUnmarshalDirect(data []byte, v interface{}) error { return json.Unmarshal(data, v) }
|
||||
func containsSubstr(haystack, needle string) bool { return strings.Contains(haystack, needle) }
|
||||
|
||||
// TestAgent_IsRetired covers the I-004 soft-retirement predicate that gates
|
||||
// which callers hide an agent row from active listings.
|
||||
func TestAgent_IsRetired(t *testing.T) {
|
||||
@@ -53,3 +65,172 @@ func TestAgentDependencyCounts_HasDependencies(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// G-2 (P1): the cat-s5-apikey_leak audit closure tests. Pre-G-2,
|
||||
// Agent.APIKeyHash was tagged `json:"api_key_hash"` and shipped on
|
||||
// every /api/v1/agents response — credential-derivative leak that gave
|
||||
// offline brute-force targets to every authenticated client. Post-G-2
|
||||
// the tag is "-" AND Agent.MarshalJSON zeroes the field on a marshal-
|
||||
// time copy. These tests pin both layers of the defense:
|
||||
//
|
||||
// 1. A populated APIKeyHash is never present in the marshaled JSON.
|
||||
// 2. The redaction holds on *Agent, on slice elements, and on a
|
||||
// sentinel literal-value check (so even a future field that
|
||||
// happens to contain the same hash string would not appear).
|
||||
// 3. The marshal-time copy does not mutate the caller's original —
|
||||
// receiver is by-value, but pin it explicitly so a future refactor
|
||||
// that switches to pointer-receiver gets caught.
|
||||
// 4. Round-trip preserves every other field (hash dropped on encode,
|
||||
// cannot reappear on decode because the wire never carries it).
|
||||
|
||||
const g2LeakSentinel = "sha256:LEAKED-CREDENTIAL-DERIVATIVE-SENTINEL"
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsAPIKeyHash is the marshal-boundary
|
||||
// contract test: a single Agent value with a populated APIKeyHash must
|
||||
// not emit the field name nor the sentinel value.
|
||||
func TestAgent_MarshalJSON_RedactsAPIKeyHash(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := Agent{
|
||||
ID: "agent-test",
|
||||
Name: "test-agent",
|
||||
Hostname: "host.example",
|
||||
Status: AgentStatusOnline,
|
||||
RegisteredAt: time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
APIKeyHash: g2LeakSentinel,
|
||||
OS: "linux",
|
||||
Architecture: "amd64",
|
||||
IPAddress: "10.0.0.1",
|
||||
Version: "v2.0.49",
|
||||
}
|
||||
out, err := jsonMarshalDirect(a)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
body := string(out)
|
||||
if containsSubstr(body, "api_key_hash") {
|
||||
t.Errorf("marshaled body contains \"api_key_hash\" key — G-2 leak regressed:\n%s", body)
|
||||
}
|
||||
if containsSubstr(body, "APIKeyHash") {
|
||||
t.Errorf("marshaled body contains \"APIKeyHash\" — type-alias redaction broke:\n%s", body)
|
||||
}
|
||||
if containsSubstr(body, g2LeakSentinel) {
|
||||
t.Errorf("marshaled body contains the leak sentinel %q — value redaction broke:\n%s", g2LeakSentinel, body)
|
||||
}
|
||||
// Sanity: every OTHER non-zero field IS present (this guards against
|
||||
// the type-alias trick accidentally dropping siblings).
|
||||
for _, want := range []string{"agent-test", "test-agent", "host.example", "Online", "linux", "amd64", "10.0.0.1", "v2.0.49"} {
|
||||
if !containsSubstr(body, want) {
|
||||
t.Errorf("marshaled body missing expected field value %q:\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsViaPointer covers the *Agent path that
|
||||
// handlers hit when calling JSON(w, http.StatusOK, agent) with a *Agent
|
||||
// from svc.GetAgent. A value-receiver MarshalJSON is reachable from
|
||||
// pointer values via reflect; this test pins that contract.
|
||||
func TestAgent_MarshalJSON_RedactsViaPointer(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := &Agent{ID: "agent-x", APIKeyHash: g2LeakSentinel}
|
||||
out, err := jsonMarshalDirect(a)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal *Agent returned error: %v", err)
|
||||
}
|
||||
if containsSubstr(string(out), g2LeakSentinel) {
|
||||
t.Errorf("*Agent marshal leaked sentinel:\n%s", string(out))
|
||||
}
|
||||
if containsSubstr(string(out), "api_key_hash") {
|
||||
t.Errorf("*Agent marshal contains \"api_key_hash\" key:\n%s", string(out))
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RedactsInSlice covers the []domain.Agent path
|
||||
// the ListAgents handler emits via PagedResponse{Data: agents}. Each
|
||||
// element must be redacted independently.
|
||||
func TestAgent_MarshalJSON_RedactsInSlice(t *testing.T) {
|
||||
t.Parallel()
|
||||
agents := []Agent{
|
||||
{ID: "agent-1", APIKeyHash: g2LeakSentinel + "-1"},
|
||||
{ID: "agent-2", APIKeyHash: g2LeakSentinel + "-2"},
|
||||
{ID: "agent-3", APIKeyHash: g2LeakSentinel + "-3"},
|
||||
}
|
||||
out, err := jsonMarshalDirect(agents)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal []Agent returned error: %v", err)
|
||||
}
|
||||
body := string(out)
|
||||
if containsSubstr(body, "api_key_hash") {
|
||||
t.Errorf("[]Agent marshal contains \"api_key_hash\" key:\n%s", body)
|
||||
}
|
||||
for i := 1; i <= 3; i++ {
|
||||
sentinel := g2LeakSentinel + "-" + string(rune('0'+i))
|
||||
if containsSubstr(body, sentinel) {
|
||||
t.Errorf("[]Agent marshal leaked sentinel %q:\n%s", sentinel, body)
|
||||
}
|
||||
}
|
||||
// Every agent ID is present — the redaction didn't accidentally
|
||||
// strip the entire element.
|
||||
for _, id := range []string{"agent-1", "agent-2", "agent-3"} {
|
||||
if !containsSubstr(body, id) {
|
||||
t.Errorf("[]Agent marshal missing element ID %q:\n%s", id, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_DoesNotMutateReceiver pins the by-value-receiver
|
||||
// contract: marshaling must not zero APIKeyHash on the caller's struct,
|
||||
// only on the marshal-time copy. This guards against a future refactor
|
||||
// that switches to pointer receiver and breaks every code path that
|
||||
// marshals an Agent and then re-uses it (e.g., audit-event payload
|
||||
// construction immediately after returning the agent in a handler).
|
||||
func TestAgent_MarshalJSON_DoesNotMutateReceiver(t *testing.T) {
|
||||
t.Parallel()
|
||||
a := Agent{ID: "agent-keep", APIKeyHash: g2LeakSentinel}
|
||||
if _, err := jsonMarshalDirect(a); err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
if a.APIKeyHash != g2LeakSentinel {
|
||||
t.Errorf("MarshalJSON mutated caller's APIKeyHash: got %q want %q", a.APIKeyHash, g2LeakSentinel)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAgent_MarshalJSON_RoundTrip pins the wire-shape contract: an
|
||||
// Agent marshaled to JSON and unmarshaled back into a fresh Agent has
|
||||
// every field preserved EXCEPT APIKeyHash, which the wire never carries.
|
||||
// This double-confirms the redaction is a one-way guarantee at the
|
||||
// serialization boundary, not an accidental on-decode behavior.
|
||||
func TestAgent_MarshalJSON_RoundTrip(t *testing.T) {
|
||||
t.Parallel()
|
||||
now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)
|
||||
hb := now.Add(-5 * time.Minute)
|
||||
original := Agent{
|
||||
ID: "agent-rt",
|
||||
Name: "rt",
|
||||
Hostname: "rt.host",
|
||||
Status: AgentStatusOnline,
|
||||
LastHeartbeatAt: &hb,
|
||||
RegisteredAt: now,
|
||||
APIKeyHash: g2LeakSentinel,
|
||||
OS: "linux",
|
||||
Architecture: "arm64",
|
||||
IPAddress: "10.0.0.99",
|
||||
Version: "v2.0.49",
|
||||
}
|
||||
out, err := jsonMarshalDirect(original)
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal returned error: %v", err)
|
||||
}
|
||||
var got Agent
|
||||
if err := jsonUnmarshalDirect(out, &got); err != nil {
|
||||
t.Fatalf("Unmarshal returned error: %v", err)
|
||||
}
|
||||
if got.APIKeyHash != "" {
|
||||
t.Errorf("APIKeyHash survived round-trip: got %q want empty (the wire must not carry it)", got.APIKeyHash)
|
||||
}
|
||||
if got.ID != original.ID || got.Name != original.Name || got.Hostname != original.Hostname {
|
||||
t.Errorf("identity fields lost in round-trip: got %+v want %+v", got, original)
|
||||
}
|
||||
if got.OS != original.OS || got.Architecture != original.Architecture || got.IPAddress != original.IPAddress {
|
||||
t.Errorf("metadata fields lost in round-trip: got %+v want %+v", got, original)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1318,10 +1318,7 @@ func (m *mockProfileService) ListProfiles(_ context.Context, page, perPage int)
|
||||
}
|
||||
|
||||
func (m *mockProfileService) GetProfile(_ context.Context, id string) (*domain.CertificateProfile, error) {
|
||||
// M-1: wrap service.ErrNotFound so the handler's errToStatus choke point
|
||||
// routes this to 404 via errors.Is. The Error() message still contains
|
||||
// "not found" so any pre-migration substring assertions continue to pass.
|
||||
return nil, fmt.Errorf("%w: profile not found", service.ErrNotFound)
|
||||
return nil, fmt.Errorf("profile not found")
|
||||
}
|
||||
|
||||
func (m *mockProfileService) CreateProfile(_ context.Context, profile domain.CertificateProfile) (*domain.CertificateProfile, error) {
|
||||
@@ -1344,8 +1341,7 @@ func (m *mockAgentGroupService) ListAgentGroups(_ context.Context, page, perPage
|
||||
}
|
||||
|
||||
func (m *mockAgentGroupService) GetAgentGroup(_ context.Context, id string) (*domain.AgentGroup, error) {
|
||||
// M-1: wrap service.ErrNotFound — see GetProfile above for rationale.
|
||||
return nil, fmt.Errorf("%w: agent group not found", service.ErrNotFound)
|
||||
return nil, fmt.Errorf("agent group not found")
|
||||
}
|
||||
|
||||
func (m *mockAgentGroupService) CreateAgentGroup(_ context.Context, group domain.AgentGroup) (*domain.AgentGroup, error) {
|
||||
|
||||
@@ -18,17 +18,7 @@ import (
|
||||
// managed_certificates.renewal_policy_id to renewal_policies.id with ON
|
||||
// DELETE RESTRICT). Both map onto the same 409 status but with distinct
|
||||
// messages so operators can tell them apart.
|
||||
//
|
||||
// M-1: ErrNotFound is the repo-layer "row not found" sentinel. Repositories
|
||||
// that historically returned fmt.Errorf("... not found: %s", id) without
|
||||
// wrapping sql.ErrNoRows now wrap ErrNotFound via fmt.Errorf("%w: ...", so
|
||||
// the handler layer's single errToStatus choke point can route them to HTTP
|
||||
// 404 via errors.Is without substring-matching the message text. Existing
|
||||
// service-level service.ErrNotFound stays a distinct value — both map to 404
|
||||
// through explicit branches in handler/errors.go (mirrors the G-1 treatment
|
||||
// of the repo-level 409 sentinels).
|
||||
var (
|
||||
ErrNotFound = errors.New("repository: not found")
|
||||
ErrRenewalPolicyDuplicateName = errors.New("renewal policy name already exists")
|
||||
ErrRenewalPolicyInUse = errors.New("renewal policy is still referenced by managed certificates")
|
||||
)
|
||||
|
||||
@@ -3,13 +3,11 @@ package postgres
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
)
|
||||
|
||||
// AgentRepository implements repository.AgentRepository
|
||||
@@ -74,13 +72,8 @@ func (r *AgentRepository) Get(ctx context.Context, id string) (*domain.Agent, er
|
||||
|
||||
agent, err := scanAgent(row)
|
||||
if err != nil {
|
||||
// M-1 (P2): wrap sql.ErrNoRows with repository.ErrNotFound via %w so
|
||||
// the handler's errToStatus choke point dispatches to 404 via
|
||||
// errors.Is instead of the pre-M-1 strings.Contains(err.Error(),
|
||||
// "not found") branch at handler/agents.go. Mirrors agent_group and
|
||||
// profile repositories.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, fmt.Errorf("%w: agent %s", repository.ErrNotFound, id)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, fmt.Errorf("agent not found")
|
||||
}
|
||||
return nil, fmt.Errorf("failed to query agent: %w", err)
|
||||
}
|
||||
@@ -176,12 +169,8 @@ func (r *AgentRepository) Update(ctx context.Context, agent *domain.Agent) error
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
|
||||
// M-1 (P2): wrap the zero-rows-affected condition with
|
||||
// repository.ErrNotFound so the handler's errToStatus dispatches to 404
|
||||
// via errors.Is without substring matching. Mirrors agent_group and
|
||||
// profile repositories.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: agent %s", repository.ErrNotFound, agent.ID)
|
||||
return fmt.Errorf("agent not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -200,10 +189,8 @@ func (r *AgentRepository) Delete(ctx context.Context, id string) error {
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
|
||||
// M-1 (P2): zero-rows-affected → repository.ErrNotFound wrap. Mirrors
|
||||
// agent_group and profile repositories.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: agent %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("agent not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -249,13 +236,8 @@ func (r *AgentRepository) UpdateHeartbeat(ctx context.Context, id string, metada
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
|
||||
// M-1 (P2): zero-rows-affected → repository.ErrNotFound wrap. Note the
|
||||
// UPDATE filters on `retired_at IS NULL`, so a retired agent row also
|
||||
// returns zero-rows-affected here. The service layer short-circuits with
|
||||
// ErrAgentRetired (410) BEFORE reaching this path via Heartbeat's
|
||||
// explicit Get check, so the 404 vs 410 distinction is drawn upstream.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: agent %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("agent not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -276,13 +258,8 @@ func (r *AgentRepository) GetByAPIKey(ctx context.Context, keyHash string) (*dom
|
||||
|
||||
agent, err := scanAgent(row)
|
||||
if err != nil {
|
||||
// M-1 (P2): wrap sql.ErrNoRows with repository.ErrNotFound via %w.
|
||||
// The auth middleware calls this on every request; a missing row
|
||||
// must surface as 404 (well, 401 upstream — the middleware treats
|
||||
// "no agent matched" as auth failure) via the errToStatus choke
|
||||
// point, not via substring matching.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, fmt.Errorf("%w: agent with api key not found", repository.ErrNotFound)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, fmt.Errorf("agent not found")
|
||||
}
|
||||
return nil, fmt.Errorf("failed to query agent: %w", err)
|
||||
}
|
||||
|
||||
@@ -3,12 +3,10 @@ package postgres
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
)
|
||||
|
||||
// AgentGroupRepository implements agent group CRUD with PostgreSQL.
|
||||
@@ -51,12 +49,8 @@ func (r *AgentGroupRepository) Get(ctx context.Context, id string) (*domain.Agen
|
||||
g := &domain.AgentGroup{}
|
||||
err := row.Scan(&g.ID, &g.Name, &g.Description, &g.MatchOS, &g.MatchArchitecture,
|
||||
&g.MatchIPCIDR, &g.MatchVersion, &g.Enabled, &g.CreatedAt, &g.UpdatedAt)
|
||||
// M-1 (P2): wrap sql.ErrNoRows with repository.ErrNotFound via %w so the
|
||||
// handler's errToStatus choke point dispatches to 404 via errors.Is
|
||||
// instead of the pre-M-1 strings.Contains(err.Error(), "not found")
|
||||
// branch at handler/agent_groups.go. Mirrors profile repository.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, fmt.Errorf("%w: agent group %s", repository.ErrNotFound, id)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, fmt.Errorf("agent group not found: %s", id)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get agent group: %w", err)
|
||||
@@ -89,11 +83,8 @@ func (r *AgentGroupRepository) Update(ctx context.Context, group *domain.AgentGr
|
||||
return fmt.Errorf("failed to update agent group: %w", err)
|
||||
}
|
||||
rows, _ := result.RowsAffected()
|
||||
// M-1 (P2): wrap the zero-rows-affected condition with
|
||||
// repository.ErrNotFound so the handler's errToStatus dispatches to 404
|
||||
// via errors.Is without substring matching. Mirrors profile repository.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: agent group %s", repository.ErrNotFound, group.ID)
|
||||
return fmt.Errorf("agent group not found: %s", group.ID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -105,11 +96,8 @@ func (r *AgentGroupRepository) Delete(ctx context.Context, id string) error {
|
||||
return fmt.Errorf("failed to delete agent group: %w", err)
|
||||
}
|
||||
rows, _ := result.RowsAffected()
|
||||
// M-1 (P2): wrap zero-rows-affected with repository.ErrNotFound so the
|
||||
// handler's errToStatus dispatches to 404 via errors.Is. Mirrors profile
|
||||
// repository.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: agent group %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("agent group not found: %s", id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -264,14 +264,8 @@ func (r *CertificateRepository) Get(ctx context.Context, id string) (*domain.Man
|
||||
|
||||
cert, err := r.scanCertificate(ctx, row)
|
||||
if err != nil {
|
||||
// M-1 (P2): wrap sql.ErrNoRows with repository.ErrNotFound via %w so
|
||||
// the handler's errToStatus choke point dispatches to 404 via
|
||||
// errors.Is instead of the pre-M-1 strings.Contains(err.Error(),
|
||||
// "not found") branch at handler/export.go (ExportPEM + ExportPKCS12).
|
||||
// scanCertificate already wraps sql.ErrNoRows via %w, so errors.Is
|
||||
// walks through. Mirrors profile and agent_group repositories.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, fmt.Errorf("%w: certificate %s", repository.ErrNotFound, id)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, fmt.Errorf("certificate not found")
|
||||
}
|
||||
return nil, fmt.Errorf("failed to query certificate: %w", err)
|
||||
}
|
||||
@@ -402,11 +396,8 @@ func (r *CertificateRepository) Update(ctx context.Context, cert *domain.Managed
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
|
||||
// M-1 (P2): zero-rows-affected → repository.ErrNotFound wrap so the
|
||||
// handler's errToStatus dispatches to 404 via errors.Is. Mirrors profile
|
||||
// and agent_group repositories.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: certificate %s", repository.ErrNotFound, cert.ID)
|
||||
return fmt.Errorf("certificate not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -427,10 +418,8 @@ func (r *CertificateRepository) Archive(ctx context.Context, id string) error {
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
|
||||
// M-1 (P2): zero-rows-affected → repository.ErrNotFound wrap. Mirrors
|
||||
// profile and agent_group repositories.
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: certificate %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("certificate not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -594,16 +583,6 @@ func (r *CertificateRepository) GetLatestVersion(ctx context.Context, certID str
|
||||
v.KeySize = int(keySize.Int64)
|
||||
|
||||
if err != nil {
|
||||
// M-1 (P2): wrap sql.ErrNoRows with repository.ErrNotFound via %w so
|
||||
// the handler's errToStatus choke point dispatches to 404 via
|
||||
// errors.Is. The export service surfaces "no certificate version
|
||||
// found" on this path — pre-M-1 the handler branched on
|
||||
// strings.Contains(err.Error(), "not found"), now it walks the wrap
|
||||
// chain. Non-ErrNoRows errors preserve their "failed to get latest
|
||||
// certificate version" framing.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, fmt.Errorf("%w: certificate version for %s", repository.ErrNotFound, certID)
|
||||
}
|
||||
return nil, fmt.Errorf("failed to get latest certificate version: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -2,14 +2,26 @@ package postgres
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 —
|
||||
// invalid_password — emitted by PostgreSQL when the client presents
|
||||
// credentials that don't match pg_authid. Defined locally because the
|
||||
// lib/pq package does not export named constants for SQLSTATE codes (it
|
||||
// only exposes the typed string alias pq.ErrorCode and a name-lookup map
|
||||
// at runtime). Pinned as a string constant rather than a pq.ErrorCode
|
||||
// literal so the contract is grep-able from operator-facing log lines.
|
||||
//
|
||||
// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html
|
||||
const pgErrInvalidPassword = "28P01"
|
||||
|
||||
// NewDB opens a PostgreSQL database connection and sets up connection pooling.
|
||||
func NewDB(connStr string) (*sql.DB, error) {
|
||||
db, err := sql.Open("postgres", connStr)
|
||||
@@ -23,12 +35,65 @@ func NewDB(connStr string) (*sql.DB, error) {
|
||||
|
||||
// Ping to verify connection
|
||||
if err := db.Ping(); err != nil {
|
||||
return nil, fmt.Errorf("failed to ping database: %w", err)
|
||||
return nil, wrapPingError(err)
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// wrapPingError converts a db.Ping() failure into an operator-friendly
|
||||
// diagnostic. The default wrap is the original opaque
|
||||
// `"failed to ping database: <inner>"` shape. The exception is SQLSTATE 28P01
|
||||
// (invalid_password): when postgres rejects the server's credentials we emit
|
||||
// extended guidance that names the most common operator misstep — editing
|
||||
// POSTGRES_PASSWORD in `.env` after the postgres named volume has already
|
||||
// been initialized — and lists both the destructive (`docker compose down -v`)
|
||||
// and non-destructive (`ALTER ROLE`) remediations.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding. The postgres
|
||||
// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty;
|
||||
// on subsequent boots the password baked into pg_authid on first boot wins
|
||||
// over whatever the env var carries, so the env-vs-pg_authid divergence is
|
||||
// intrinsic to how the postgres image bootstraps and cannot be fixed by us
|
||||
// upstream of pg_authid. The ergonomic answer is to surface a clear
|
||||
// diagnostic at the failure site so operators don't waste an hour on
|
||||
// "is my password right" before discovering the volume needs to be torn
|
||||
// down (or the role's password rotated in-place).
|
||||
//
|
||||
// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using
|
||||
// errors.As(err, &*pq.Error) on the returned value continue to work; this
|
||||
// matches the audit's "no substring matching on err.Error()" requirement
|
||||
// from the M-1 sentinel-error migration.
|
||||
//
|
||||
// Returns nil when err is nil so callers can defensively pipe through this
|
||||
// helper without an extra branch.
|
||||
func wrapPingError(err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var pqErr *pq.Error
|
||||
if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword {
|
||||
return fmt.Errorf(
|
||||
"failed to ping database: postgres rejected the configured credentials "+
|
||||
"(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+
|
||||
"on a docker-compose deploy, the postgres container's data volume still "+
|
||||
"holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+
|
||||
"only on first boot of a fresh data dir, so editing the env var after that "+
|
||||
"point updates only the certctl-server container. Reset destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml down -v && "+
|
||||
"docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+
|
||||
"all data in the postgres volume), or non-destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml exec postgres "+
|
||||
"psql -U certctl -c \"ALTER ROLE certctl PASSWORD '<new-password>';\"` "+
|
||||
"and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w",
|
||||
pgErrInvalidPassword, err)
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to ping database: %w", err)
|
||||
}
|
||||
|
||||
// RunMigrations reads and executes SQL migration files from a directory.
|
||||
func RunMigrations(db *sql.DB, migrationsPath string) error {
|
||||
// Check if migrations directory exists
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
// Internal-package tests for db.go — covers the diagnostic dispatch in
|
||||
// wrapPingError. Lives in `package postgres` (not `postgres_test`) so it can
|
||||
// call the unexported helper directly without exposing it on the API surface.
|
||||
//
|
||||
// Sibling integration tests in this directory live in `package postgres_test`
|
||||
// (testcontainers-driven, schema-per-test). They exercise the live-DB
|
||||
// happy path; this file owns the unit-level diagnostic dispatch and runs in
|
||||
// `-short` mode without spinning up postgres.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding by pinning the
|
||||
// post-fix wrap-text contract for `db.Ping()` failures. Pre-U-1 every Ping
|
||||
// error was wrapped with the same opaque `"failed to ping database: %w"`,
|
||||
// so an operator who edited POSTGRES_PASSWORD after first-boot saw only
|
||||
// `pq: password authentication failed for user "certctl"` in the server
|
||||
// log with no pointer to the actual cause (postgres data dir retains the
|
||||
// initial password from first-boot initdb; subsequent boots ignore the env
|
||||
// var). Post-U-1 the SQLSTATE-28P01 path emits a multi-line diagnostic
|
||||
// pointing at the down -v / ALTER ROLE remediation; non-auth failures
|
||||
// retain the original wrap shape so verbose noise does not bleed into
|
||||
// transient connection-refused / timeout paths.
|
||||
package postgres
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// TestWrapPingError_AuthFailureGuidance asserts the diagnostic wrap fires on
|
||||
// SQLSTATE 28P01 (invalid_password) and contains all three contract elements:
|
||||
// the SQLSTATE code (so operators can grep), the down-v destructive
|
||||
// remediation, and the ALTER ROLE non-destructive remediation. Also asserts
|
||||
// the wrap chain still satisfies errors.As(err, &*pq.Error) so callers that
|
||||
// programmatically inspect the underlying postgres error code keep working.
|
||||
func TestWrapPingError_AuthFailureGuidance(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("28P01"),
|
||||
Message: `password authentication failed for user "certctl"`,
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Contract elements — the operator-facing string is what we ship.
|
||||
wantSubstrings := []string{
|
||||
"SQLSTATE 28P01", // operators grep on this
|
||||
"POSTGRES_PASSWORD", // names the variable that traps
|
||||
"first boot", // the mechanism in plain language
|
||||
"down -v", // destructive remediation
|
||||
"ALTER ROLE", // non-destructive remediation
|
||||
}
|
||||
for _, s := range wantSubstrings {
|
||||
if !strings.Contains(got, s) {
|
||||
t.Errorf("wrap text missing %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain must still expose the underlying *pq.Error for callers
|
||||
// that want to inspect Code / Detail / Constraint fields. Pre-fix
|
||||
// callers used errors.As(err, &pqErr) on the unwrapped Ping result;
|
||||
// the new wrap is fmt.Errorf("...%w", err) so errors.As must walk it.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "28P01" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "28P01")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonAuthErrorPreservesOriginalWrap guards against the
|
||||
// guidance text bleeding into unrelated failure modes. SQLSTATE 08006
|
||||
// (connection_failure) is the canonical non-auth case — server unreachable,
|
||||
// TLS handshake failure, network drop. The wrap should be the original
|
||||
// shape so transient-error log noise does not include the (now lengthy)
|
||||
// volume-state remediation paragraph.
|
||||
func TestWrapPingError_NonAuthErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("08006"),
|
||||
Message: "connection refused",
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Original-wrap shape: prefix only, no guidance text.
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
|
||||
// Negative assertions: guidance text MUST NOT appear on non-auth paths.
|
||||
mustNotContain := []string{
|
||||
"SQLSTATE 08006", // we only call out 28P01 specifically
|
||||
"POSTGRES_PASSWORD",
|
||||
"down -v",
|
||||
"ALTER ROLE",
|
||||
}
|
||||
for _, s := range mustNotContain {
|
||||
if strings.Contains(got, s) {
|
||||
t.Errorf("non-auth wrap leaked guidance substring %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain still walks for errors.As — same contract as auth path.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from non-auth wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "08006" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "08006")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonPqErrorPreservesOriginalWrap guards the network-level
|
||||
// case: a pre-handshake failure (TCP refused, DNS, TLS) returns a
|
||||
// non-*pq.Error from db.Ping(). errors.As must return false, the helper
|
||||
// must fall through to the generic wrap, and the chain must remain walkable.
|
||||
func TestWrapPingError_NonPqErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := errors.New("dial tcp 127.0.0.1:5432: connect: connection refused")
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
if strings.Contains(got, "SQLSTATE") || strings.Contains(got, "POSTGRES_PASSWORD") {
|
||||
t.Errorf("network-level wrap leaked SQLSTATE/postgres guidance\ngot: %s", got)
|
||||
}
|
||||
if !errors.Is(wrapped, original) {
|
||||
t.Errorf("errors.Is did not walk to original sentinel: %v", wrapped)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NilReturnsNil — defensive contract: if Ping returned nil
|
||||
// (no failure), the helper must not synthesize a fake error. This isn't on
|
||||
// the documented call path (NewDB only invokes wrapPingError inside the
|
||||
// `if err != nil` branch), but pinning it prevents a future refactor from
|
||||
// regressing the contract silently.
|
||||
func TestWrapPingError_NilReturnsNil(t *testing.T) {
|
||||
t.Parallel()
|
||||
if got := wrapPingError(nil); got != nil {
|
||||
t.Errorf("wrapPingError(nil) = %v, want nil", got)
|
||||
}
|
||||
}
|
||||
@@ -138,15 +138,7 @@ func (r *NotificationRepository) List(ctx context.Context, filter *repository.No
|
||||
return notifs, nil
|
||||
}
|
||||
|
||||
// UpdateStatus updates a notification's delivery status.
|
||||
//
|
||||
// M-1 (P2): zero-rows-affected now wraps repository.ErrNotFound via %w so the
|
||||
// handler's errToStatus choke point dispatches to 404 via errors.Is. Pre-M-1
|
||||
// the return was a raw fmt.Errorf("notification not found") string that the
|
||||
// service layer re-wrapped and the handler classified via strings.Contains —
|
||||
// one sentinel-message reword away from silently demoting 404 to 500. The
|
||||
// behavior (error on concurrent-delete / bad-id) is unchanged; only the error
|
||||
// identity is now type-safe.
|
||||
// UpdateStatus updates a notification's delivery status
|
||||
func (r *NotificationRepository) UpdateStatus(ctx context.Context, id string, status string, sentAt time.Time) error {
|
||||
result, err := r.db.ExecContext(ctx, `
|
||||
UPDATE notification_events SET status = $1, sent_at = $2 WHERE id = $3
|
||||
@@ -162,7 +154,7 @@ func (r *NotificationRepository) UpdateStatus(ctx context.Context, id string, st
|
||||
}
|
||||
|
||||
if rows == 0 {
|
||||
return fmt.Errorf("%w: notification %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("notification not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -315,12 +307,10 @@ func (r *NotificationRepository) RecordFailedAttempt(ctx context.Context, id str
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
// M-1 (P2): wrap repository.ErrNotFound (was raw
|
||||
// fmt.Errorf("notification not found")). Same "not found" shape as
|
||||
// UpdateStatus — the scheduler logs-and-continues on a concurrently
|
||||
// deleted row, but callers that surface the error (the handler) now
|
||||
// discriminate via errors.Is instead of substring matching.
|
||||
return fmt.Errorf("%w: notification %s", repository.ErrNotFound, id)
|
||||
// Same "not found" error shape as UpdateStatus above. The scheduler
|
||||
// logs-and-continues on this so a concurrently-deleted row doesn't
|
||||
// break the sweep.
|
||||
return fmt.Errorf("notification not found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -352,8 +342,7 @@ func (r *NotificationRepository) MarkAsDead(ctx context.Context, id string, last
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
// M-1 (P2): wrap repository.ErrNotFound. See UpdateStatus rationale.
|
||||
return fmt.Errorf("%w: notification %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("notification not found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -390,8 +379,7 @@ func (r *NotificationRepository) Requeue(ctx context.Context, id string) error {
|
||||
return fmt.Errorf("failed to get rows affected: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
// M-1 (P2): wrap repository.ErrNotFound. See UpdateStatus rationale.
|
||||
return fmt.Errorf("%w: notification %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("notification not found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -4,13 +4,11 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
)
|
||||
|
||||
// ProfileRepository implements repository.CertificateProfileRepository
|
||||
@@ -65,11 +63,8 @@ func (r *ProfileRepository) Get(ctx context.Context, id string) (*domain.Certifi
|
||||
|
||||
p, err := scanProfile(row)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// M-1: wrap repository.ErrNotFound so the handler's errToStatus
|
||||
// choke point can route this to HTTP 404 via errors.Is without
|
||||
// substring-matching the "not found" message text.
|
||||
return nil, fmt.Errorf("%w: profile %s", repository.ErrNotFound, id)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, fmt.Errorf("profile not found")
|
||||
}
|
||||
return nil, fmt.Errorf("failed to query profile: %w", err)
|
||||
}
|
||||
@@ -164,8 +159,7 @@ func (r *ProfileRepository) Update(ctx context.Context, profile *domain.Certific
|
||||
}
|
||||
|
||||
if rows == 0 {
|
||||
// M-1: wrap repository.ErrNotFound — see Get for rationale.
|
||||
return fmt.Errorf("%w: profile %s", repository.ErrNotFound, profile.ID)
|
||||
return fmt.Errorf("profile not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -184,8 +178,7 @@ func (r *ProfileRepository) Delete(ctx context.Context, id string) error {
|
||||
}
|
||||
|
||||
if rows == 0 {
|
||||
// M-1: wrap repository.ErrNotFound — see Get for rationale.
|
||||
return fmt.Errorf("%w: profile %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("profile not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -72,10 +72,7 @@ func (r *RenewalPolicyRepository) Get(ctx context.Context, id string) (*domain.R
|
||||
policy, err := scanRenewalPolicy(row)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// M-1: wrap repository.ErrNotFound so the handler's errToStatus
|
||||
// choke point can route this to HTTP 404 via errors.Is without
|
||||
// substring-matching the "not found" message text.
|
||||
return nil, fmt.Errorf("%w: renewal policy %s", repository.ErrNotFound, id)
|
||||
return nil, fmt.Errorf("renewal policy not found: %s", id)
|
||||
}
|
||||
return nil, fmt.Errorf("failed to query renewal policy: %w", err)
|
||||
}
|
||||
@@ -255,8 +252,7 @@ func (r *RenewalPolicyRepository) Update(ctx context.Context, id string, policy
|
||||
updated, err := scanRenewalPolicy(row)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// M-1: wrap repository.ErrNotFound — see Get for rationale.
|
||||
return fmt.Errorf("%w: renewal policy %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("renewal policy not found: %s", id)
|
||||
}
|
||||
if isUniqueViolation(err) {
|
||||
return repository.ErrRenewalPolicyDuplicateName
|
||||
@@ -287,8 +283,7 @@ func (r *RenewalPolicyRepository) Delete(ctx context.Context, id string) error {
|
||||
return fmt.Errorf("failed to read RowsAffected for delete: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
// M-1: wrap repository.ErrNotFound — see Get for rationale.
|
||||
return fmt.Errorf("%w: renewal policy %s", repository.ErrNotFound, id)
|
||||
return fmt.Errorf("renewal policy not found: %s", id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -6,12 +6,10 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/lib/pq"
|
||||
"github.com/shankar0123/certctl/internal/domain"
|
||||
"github.com/shankar0123/certctl/internal/repository"
|
||||
)
|
||||
@@ -411,16 +409,6 @@ func (s *AgentService) RegisterAgent(ctx context.Context, agent domain.Agent) (*
|
||||
agent.LastHeartbeatAt = &now
|
||||
|
||||
if err := s.agentRepo.Create(ctx, &agent); err != nil {
|
||||
// M-1 (P2): explicit pg-23505 dispatch wraps unique-constraint
|
||||
// violations on the agent name with ErrConflict so the handler's
|
||||
// errors.Is(err, service.ErrConflict) arm fires 409 deterministically
|
||||
// instead of relying on the errToStatus *pq.Error SQLSTATE fallback.
|
||||
// Mirrors the duplicate-name paths in IssuerService and
|
||||
// RenewalPolicyService.
|
||||
var pgErr *pq.Error
|
||||
if errors.As(err, &pgErr) && pgErr.Code == "23505" {
|
||||
return nil, fmt.Errorf("%w: agent name already exists", ErrConflict)
|
||||
}
|
||||
return nil, fmt.Errorf("failed to register agent: %w", err)
|
||||
}
|
||||
return &agent, nil
|
||||
|
||||
@@ -143,20 +143,12 @@ func (s *AgentGroupService) ListMembers(ctx context.Context, id string) ([]domai
|
||||
}
|
||||
|
||||
// validateAgentGroup checks that an agent group's configuration is valid.
|
||||
//
|
||||
// M-1 (P2): every return wraps ErrValidation via %w so the handler's
|
||||
// errToStatus choke point dispatches these to HTTP 400 via errors.Is without
|
||||
// the substring-matching on "invalid"/"required" that the pre-M-1
|
||||
// agent_groups handler relied on at handler/agent_groups.go:126. The composed
|
||||
// Error() string still contains the original human-readable text, so the
|
||||
// handler safely passes err.Error() through to the response body on the 400
|
||||
// arm. Mirrors validateProfile.
|
||||
func validateAgentGroup(g *domain.AgentGroup) error {
|
||||
if g.Name == "" {
|
||||
return fmt.Errorf("%w: agent group name is required", ErrValidation)
|
||||
return fmt.Errorf("agent group name is required")
|
||||
}
|
||||
if len(g.Name) > 255 {
|
||||
return fmt.Errorf("%w: agent group name exceeds 255 characters", ErrValidation)
|
||||
return fmt.Errorf("agent group name exceeds 255 characters")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -26,12 +26,8 @@ import (
|
||||
// three cloud secret-manager discovery sources; retiring any of them orphans
|
||||
// its subsystem. The guard fires unconditionally — force=true does not bypass
|
||||
// it, because a sentinel is a structural invariant of the deployment, not
|
||||
// a piece of fleet state the operator owns.
|
||||
//
|
||||
// M-1 (P2): wraps [ErrForbidden] so the handler-layer errToStatus choke point
|
||||
// resolves HTTP 403 via errors.Is. Call sites can still errors.Is against
|
||||
// ErrAgentIsSentinel for domain-specific branches.
|
||||
var ErrAgentIsSentinel = fmt.Errorf("%w: agent is a reserved sentinel and cannot be retired", ErrForbidden)
|
||||
// a piece of fleet state the operator owns. Handler maps this to HTTP 403.
|
||||
var ErrAgentIsSentinel = errors.New("agent is a reserved sentinel and cannot be retired")
|
||||
|
||||
// ErrBlockedByDependencies is returned by RetireAgent when at least one of
|
||||
// (active targets, active certificates, pending jobs) referencing the agent
|
||||
@@ -39,22 +35,15 @@ var ErrAgentIsSentinel = fmt.Errorf("%w: agent is a reserved sentinel and cannot
|
||||
// a *BlockedByDependenciesError (see below), so handlers doing errors.As
|
||||
// can surface the per-bucket counts in the 409 body for operator
|
||||
// troubleshooting. Tests use errors.Is; handlers use errors.As.
|
||||
//
|
||||
// M-1 (P2): wraps [ErrConflict] so errToStatus resolves HTTP 409 via
|
||||
// errors.Is, keeping the per-bucket errors.As path unchanged for the
|
||||
// detailed 409 body. BlockedByDependenciesError.Unwrap still returns the
|
||||
// domain-specific sentinel for chained Is checks.
|
||||
var ErrBlockedByDependencies = fmt.Errorf("%w: agent has active downstream dependencies", ErrConflict)
|
||||
var ErrBlockedByDependencies = errors.New("agent has active downstream dependencies")
|
||||
|
||||
// ErrForceReasonRequired is returned when force=true is supplied without a
|
||||
// non-empty reason. The force escape hatch is deliberately chatty: operators
|
||||
// pulling the emergency cord must leave an auditable breadcrumb explaining
|
||||
// why a cascade was justified. Operators must retry with --reason rather
|
||||
// than silently skipping the guard. Checked before any DB mutation to keep
|
||||
// the no-reason path transactionally clean.
|
||||
//
|
||||
// M-1 (P2): wraps [ErrValidation] so errToStatus resolves HTTP 400.
|
||||
var ErrForceReasonRequired = fmt.Errorf("%w: force=true requires a non-empty reason", ErrValidation)
|
||||
// why a cascade was justified. Handler maps this to HTTP 400 so the operator
|
||||
// retries with --reason rather than silently skipping the guard. Checked
|
||||
// before any DB mutation to keep the no-reason path transactionally clean.
|
||||
var ErrForceReasonRequired = errors.New("force=true requires a non-empty reason")
|
||||
|
||||
// ErrAgentRetired is returned by Heartbeat (and any future agent-authenticated
|
||||
// call site) when a retired agent is still polling. The handler layer maps
|
||||
@@ -63,12 +52,6 @@ var ErrForceReasonRequired = fmt.Errorf("%w: force=true requires a non-empty rea
|
||||
// forever on a soft-retired identity. IsRetired() on the domain model is
|
||||
// the single source of truth; the sentinel exists so service and handler
|
||||
// callers can errors.Is against one symbol.
|
||||
//
|
||||
// M-1 (P2): deliberately NOT wrapped under any generic sentinel. 410 Gone is
|
||||
// semantically distinct from 403/404/409 — it signals a permanently-terminated
|
||||
// resource identity and drives deterministic agent-binary shutdown (see
|
||||
// cmd/agent/main.go). errToStatus tests it FIRST so it cannot be demoted by
|
||||
// the generic cascade.
|
||||
var ErrAgentRetired = errors.New("agent has been retired")
|
||||
|
||||
// BlockedByDependenciesError wraps ErrBlockedByDependencies and carries the
|
||||
|
||||
@@ -143,15 +143,6 @@ func (s *AuditService) ListAuditEvents(ctx context.Context, page, perPage int) (
|
||||
}
|
||||
|
||||
// GetAuditEvent returns a single audit event (handler interface method).
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 zero-events path returned a bare
|
||||
// `fmt.Errorf("audit event not found")` and the handler dispatched via a
|
||||
// blanket `any error → 404 Audit event not found` shortcut. That silently
|
||||
// demoted transient DB failures from the auditRepo.List wrap (line 154 above)
|
||||
// to 404 Not Found. Now the zero-events path wraps ErrNotFound via %w so
|
||||
// errors.Is(err, service.ErrNotFound) picks up the real 404 at the handler's
|
||||
// errToStatus choke point, and the repo.List wrap surfaces as 500 with
|
||||
// server-side slog.Error capture (F-002 redacted-500 pattern preserved).
|
||||
func (s *AuditService) GetAuditEvent(ctx context.Context, id string) (*domain.AuditEvent, error) {
|
||||
filter := &repository.AuditFilter{
|
||||
ResourceID: id,
|
||||
@@ -164,7 +155,7 @@ func (s *AuditService) GetAuditEvent(ctx context.Context, id string) (*domain.Au
|
||||
}
|
||||
|
||||
if len(events) == 0 {
|
||||
return nil, fmt.Errorf("%w: audit event not found", ErrNotFound)
|
||||
return nil, fmt.Errorf("audit event not found")
|
||||
}
|
||||
|
||||
return events[0], nil
|
||||
|
||||
@@ -43,15 +43,6 @@ func (s *CAOperationsSvc) SetIssuerRegistry(registry *IssuerRegistry) {
|
||||
|
||||
// GenerateDERCRL generates a DER-encoded X.509 CRL for the given issuer.
|
||||
// Short-lived certificates (profile TTL < 1 hour) are excluded from the CRL.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 issuer-not-found path returned a bare
|
||||
// `fmt.Errorf("issuer not found: %s", issuerID)` with no sentinel wrap. A
|
||||
// pre-M-1 handler strings.Contains(err.Error(), "not found") classifier would
|
||||
// sweep both this path and any transient error whose text happened to mention
|
||||
// "not found" into 404. Post-M-1 this path wraps service.ErrNotFound via %w so
|
||||
// errors.Is picks up the genuine 404 at errToStatus, and truly-unexpected
|
||||
// errors (e.g., registry misconfigured) surface as 500 via the generic-error
|
||||
// fallback.
|
||||
func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) ([]byte, error) {
|
||||
if s.revocationRepo == nil {
|
||||
return nil, fmt.Errorf("revocation repository not configured")
|
||||
@@ -62,7 +53,7 @@ func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) (
|
||||
|
||||
issuerConn, ok := s.issuerRegistry.Get(issuerID)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%w: issuer %s", ErrNotFound, issuerID)
|
||||
return nil, fmt.Errorf("issuer not found: %s", issuerID)
|
||||
}
|
||||
|
||||
// Scope the query to this issuer so the migration 000012 composite index
|
||||
@@ -107,11 +98,6 @@ func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) (
|
||||
}
|
||||
|
||||
// GetOCSPResponse generates a signed OCSP response for the given certificate serial.
|
||||
//
|
||||
// M-1 (P2): see GenerateDERCRL above — same sentinel-wrap rationale applies.
|
||||
// The issuer-not-found path wraps service.ErrNotFound via %w so errors.Is
|
||||
// picks up the genuine 404 at errToStatus; transient/misconfiguration errors
|
||||
// surface as 500.
|
||||
func (s *CAOperationsSvc) GetOCSPResponse(ctx context.Context, issuerID string, serialHex string) ([]byte, error) {
|
||||
if s.revocationRepo == nil {
|
||||
return nil, fmt.Errorf("revocation repository not configured")
|
||||
@@ -122,7 +108,7 @@ func (s *CAOperationsSvc) GetOCSPResponse(ctx context.Context, issuerID string,
|
||||
|
||||
issuerConn, ok := s.issuerRegistry.Get(issuerID)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%w: issuer %s", ErrNotFound, issuerID)
|
||||
return nil, fmt.Errorf("issuer not found: %s", issuerID)
|
||||
}
|
||||
|
||||
serial := new(big.Int)
|
||||
|
||||
@@ -376,21 +376,10 @@ func (s *CertificateService) CreateCertificate(ctx context.Context, cert domain.
|
||||
|
||||
// UpdateCertificate modifies a certificate (handler interface method).
|
||||
func (s *CertificateService) UpdateCertificate(ctx context.Context, id string, patch domain.ManagedCertificate) (*domain.ManagedCertificate, error) {
|
||||
// Fetch existing certificate so partial updates don't zero out fields.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 wrap was `"certificate not found: %w"` on every
|
||||
// certRepo.Get error — which coupled to the handler's strings.Contains
|
||||
// substring classifier on "not found" and gave false positives on transient
|
||||
// DB failures (connection refused, context deadline, etc.), demoting a 500
|
||||
// to a 404. Now the repo wraps only the genuine sql.ErrNoRows path with
|
||||
// repository.ErrNotFound (certificate.go Get), so the errors.Is walk
|
||||
// through the handler's errToStatus choke point discriminates correctly:
|
||||
// truly-missing → 404, everything else → 500 (the intended outcome). The
|
||||
// wrap text is changed from "certificate not found" to "failed to get
|
||||
// certificate" to match the semantic.
|
||||
// Fetch existing certificate so partial updates don't zero out fields
|
||||
existing, err := s.certRepo.Get(ctx, id)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get certificate: %w", err)
|
||||
return nil, fmt.Errorf("certificate not found: %w", err)
|
||||
}
|
||||
|
||||
// Merge non-zero fields from patch into existing
|
||||
@@ -512,14 +501,10 @@ func (s *CertificateService) GetOCSPResponse(ctx context.Context, issuerID strin
|
||||
|
||||
// GetCertificateDeployments returns all deployment targets for a certificate (M20).
|
||||
func (s *CertificateService) GetCertificateDeployments(ctx context.Context, certID string) ([]domain.DeploymentTarget, error) {
|
||||
// Verify certificate exists. See M-1 (P2) note on UpdateCertificate for
|
||||
// the wrap-text correction rationale — same treatment applies here: the
|
||||
// repo's sql.ErrNoRows wrap with repository.ErrNotFound escapes cleanly
|
||||
// through fmt.Errorf("%w", ...) so errors.Is picks up the genuine 404
|
||||
// while transient DB errors correctly surface as 500.
|
||||
// Verify certificate exists
|
||||
_, err := s.certRepo.Get(ctx, certID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get certificate: %w", err)
|
||||
return nil, fmt.Errorf("certificate not found: %w", err)
|
||||
}
|
||||
|
||||
if s.targetRepo == nil {
|
||||
|
||||
@@ -166,20 +166,9 @@ func (s *DiscoveryService) ClaimDiscovered(ctx context.Context, id string, manag
|
||||
return fmt.Errorf("failed to get discovered certificate: %w", err)
|
||||
}
|
||||
|
||||
// Verify the managed cert exists.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 wrap was a bare `fmt.Errorf("managed certificate not
|
||||
// found: %s", managedCertID)` with no %w and no sentinel. That threw away the
|
||||
// underlying error entirely — a transient DB failure from certRepo.Get (e.g.,
|
||||
// connection dropped, query timeout) would be flattened into the same "not
|
||||
// found" string as a genuine sql.ErrNoRows, which the pre-M-1 handler
|
||||
// strings.Contains classifier then swept into 404. Post-M-1 we propagate the
|
||||
// underlying error via %w so errors.Is walks the chain correctly:
|
||||
// certRepo.Get wraps genuine sql.ErrNoRows with repository.ErrNotFound (see
|
||||
// postgres/certificate.go Get), and errToStatus dispatches that to 404;
|
||||
// transient errors surface as 500 via the generic-error fallback.
|
||||
// Verify the managed cert exists
|
||||
if _, err := s.certRepo.Get(ctx, managedCertID); err != nil {
|
||||
return fmt.Errorf("failed to get managed certificate %s: %w", managedCertID, err)
|
||||
return fmt.Errorf("managed certificate not found: %s", managedCertID)
|
||||
}
|
||||
|
||||
if err := s.discoveryRepo.UpdateDiscoveredStatus(ctx, id, domain.DiscoveryStatusManaged, managedCertID); err != nil {
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
package service
|
||||
|
||||
import "errors"
|
||||
|
||||
// M-1 (P2) coverage-gap closure: generic service-layer error sentinels.
|
||||
//
|
||||
// Before M-1, API handlers classified service errors by substring-matching the
|
||||
// wrapped message text (`strings.Contains(err.Error(), "not found")` and
|
||||
// friends). That made every HTTP status mapping one `fmt.Errorf` reword away
|
||||
// from silently regressing — including the M-003 self-approval privilege
|
||||
// boundary, where `handler/jobs.go:174/:220` ignored the already-defined
|
||||
// ErrSelfApproval sentinel and relied on the literal string "cannot approve".
|
||||
//
|
||||
// These six generic sentinels form the type-safe surface the handler layer
|
||||
// dispatches against via `errors.Is`. Domain-specific sentinels (ErrSelfApproval,
|
||||
// ErrAgentIsSentinel, ErrBlockedByDependencies, ErrForceReasonRequired,
|
||||
// ErrAgentNotFound) are declared in their own topical files (job.go,
|
||||
// agent_retire.go, target.go) and wrap one of these generics via
|
||||
// `fmt.Errorf("%w: ...", ErrForbidden)`. The wrap chain lets call sites continue
|
||||
// to `errors.Is(err, ErrSelfApproval)` for domain-specific logic while the
|
||||
// handler's single choke point in `api/handler/errors.go` can match on the
|
||||
// generic sentinel to pick the HTTP status.
|
||||
//
|
||||
// Dispatch order in errToStatus matters — see the doc block at the top of
|
||||
// `internal/api/handler/errors.go`.
|
||||
//
|
||||
// ErrAgentRetired is deliberately NOT wrapped here. 410 Gone is semantically
|
||||
// distinct from 403/404/409 and must short-circuit the generic dispatch. Keep
|
||||
// its standalone declaration in agent_retire.go untouched; errToStatus tests
|
||||
// it first.
|
||||
var (
|
||||
// ErrNotFound indicates a lookup for a resource that does not exist.
|
||||
// Handlers translate this to HTTP 404.
|
||||
ErrNotFound = errors.New("not found")
|
||||
|
||||
// ErrValidation indicates malformed, missing, or out-of-range input from
|
||||
// the caller. Handlers translate this to HTTP 400.
|
||||
ErrValidation = errors.New("validation failed")
|
||||
|
||||
// ErrConflict indicates a state conflict: unique-constraint violation,
|
||||
// foreign-key dependency, or a state machine transition that is not
|
||||
// allowed from the current state. Handlers translate this to HTTP 409.
|
||||
ErrConflict = errors.New("conflict")
|
||||
|
||||
// ErrForbidden indicates an authorization / privilege-boundary denial.
|
||||
// The caller is authenticated but is not permitted to perform the action.
|
||||
// Handlers translate this to HTTP 403.
|
||||
ErrForbidden = errors.New("forbidden")
|
||||
|
||||
// ErrUnauthenticated indicates the caller failed to authenticate — most
|
||||
// commonly a SCEP challenge-password mismatch, where the transport itself
|
||||
// is valid but the application-layer credential is wrong. Handlers
|
||||
// translate this to HTTP 401.
|
||||
ErrUnauthenticated = errors.New("unauthenticated")
|
||||
|
||||
// ErrNotImplemented indicates the requested operation is defined but not
|
||||
// yet wired up — reserved for feature-flag-gated code paths. Handlers
|
||||
// translate this to HTTP 501.
|
||||
ErrNotImplemented = errors.New("not implemented")
|
||||
|
||||
// ErrUnprocessable indicates the request was well-formed and the
|
||||
// referenced resource exists, but server-side stored data could not be
|
||||
// processed — e.g., a certificate PEM in inventory that fails X.509
|
||||
// decoding because the stored blob is corrupt or was inserted with the
|
||||
// wrong encoding. Distinct from ErrValidation: ErrValidation means the
|
||||
// caller sent bad input (400), while ErrUnprocessable means the caller's
|
||||
// input was fine but our own data cannot satisfy the operation (422
|
||||
// Unprocessable Entity). Today the only call site is ExportPKCS12's parse
|
||||
// path in internal/service/export.go; keeping the sentinel generic so
|
||||
// other "stored-data-unparseable" paths can reuse it without inventing a
|
||||
// second variant.
|
||||
ErrUnprocessable = errors.New("unprocessable")
|
||||
)
|
||||
@@ -1,110 +0,0 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestGenericSentinels_IdentityDistinct guards against an accidental
|
||||
// `var ErrX = ErrY` alias where two generic sentinels share identity. Each
|
||||
// must be a distinct error value so errors.Is dispatch in errToStatus can
|
||||
// route them to different HTTP status codes.
|
||||
func TestGenericSentinels_IdentityDistinct(t *testing.T) {
|
||||
sentinels := []struct {
|
||||
name string
|
||||
err error
|
||||
}{
|
||||
{"ErrNotFound", ErrNotFound},
|
||||
{"ErrValidation", ErrValidation},
|
||||
{"ErrConflict", ErrConflict},
|
||||
{"ErrForbidden", ErrForbidden},
|
||||
{"ErrUnauthenticated", ErrUnauthenticated},
|
||||
{"ErrNotImplemented", ErrNotImplemented},
|
||||
}
|
||||
for i := range sentinels {
|
||||
for j := range sentinels {
|
||||
if i == j {
|
||||
continue
|
||||
}
|
||||
if errors.Is(sentinels[i].err, sentinels[j].err) {
|
||||
t.Errorf("%s and %s alias the same error value — each generic sentinel must be distinct",
|
||||
sentinels[i].name, sentinels[j].name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrappedSentinels_ChainWalk is the core M-1 invariant: wrapping a
|
||||
// domain-specific sentinel under a generic sentinel via fmt.Errorf("%w: ...")
|
||||
// must preserve BOTH identities on the wrap chain. Call sites that check
|
||||
// errors.Is(err, ErrSelfApproval) for domain logic AND the handler-layer
|
||||
// errToStatus that checks errors.Is(err, ErrForbidden) for the HTTP status
|
||||
// both need to succeed on the same error value.
|
||||
//
|
||||
// If this test fails, every handler dispatch that routes through errToStatus
|
||||
// is silently broken.
|
||||
func TestWrappedSentinels_ChainWalk(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
generic error
|
||||
}{
|
||||
{"ErrSelfApproval wraps ErrForbidden", ErrSelfApproval, ErrForbidden},
|
||||
{"ErrAgentIsSentinel wraps ErrForbidden", ErrAgentIsSentinel, ErrForbidden},
|
||||
{"ErrBlockedByDependencies wraps ErrConflict", ErrBlockedByDependencies, ErrConflict},
|
||||
{"ErrForceReasonRequired wraps ErrValidation", ErrForceReasonRequired, ErrValidation},
|
||||
{"ErrAgentNotFound wraps ErrValidation", ErrAgentNotFound, ErrValidation},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if !errors.Is(c.err, c.generic) {
|
||||
t.Errorf("errors.Is(%v, %v) = false; want true", c.err, c.generic)
|
||||
}
|
||||
if !errors.Is(c.err, c.err) {
|
||||
t.Errorf("errors.Is(err, err) = false; want true — domain sentinel lost self-identity after wrap")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestErrAgentRetired_StandaloneGone locks the 410 Gone semantics in place.
|
||||
// ErrAgentRetired MUST NOT wrap any generic sentinel — 410 is semantically
|
||||
// distinct from 403/404/409 (permanently-terminated resource identity) and
|
||||
// the errToStatus dispatch tests it FIRST before any generic check. If this
|
||||
// test goes red because someone wrapped it under ErrForbidden or ErrNotFound,
|
||||
// the agent-binary shutdown behavior at cmd/agent/main.go:1291 silently
|
||||
// regresses.
|
||||
func TestErrAgentRetired_StandaloneGone(t *testing.T) {
|
||||
if errors.Is(ErrAgentRetired, ErrForbidden) {
|
||||
t.Error("ErrAgentRetired must NOT wrap ErrForbidden — 410 Gone would demote to 403")
|
||||
}
|
||||
if errors.Is(ErrAgentRetired, ErrNotFound) {
|
||||
t.Error("ErrAgentRetired must NOT wrap ErrNotFound — 410 Gone would demote to 404")
|
||||
}
|
||||
if errors.Is(ErrAgentRetired, ErrConflict) {
|
||||
t.Error("ErrAgentRetired must NOT wrap ErrConflict — 410 Gone would demote to 409")
|
||||
}
|
||||
if errors.Is(ErrAgentRetired, ErrValidation) {
|
||||
t.Error("ErrAgentRetired must NOT wrap ErrValidation — 410 Gone would demote to 400")
|
||||
}
|
||||
if !errors.Is(ErrAgentRetired, ErrAgentRetired) {
|
||||
t.Error("ErrAgentRetired lost self-identity")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSentinels_SurviveDoubleWrap verifies that wrapping a sentinel-wrapped
|
||||
// error a SECOND time (e.g., a call site doing fmt.Errorf("%w: ctx-info",
|
||||
// ErrSelfApproval)) preserves both the domain sentinel and the generic
|
||||
// sentinel. This is critical because the errToStatus dispatch order places
|
||||
// ErrForbidden BEFORE ErrValidation — if double-wrapping broke the chain,
|
||||
// the M-003 gate would demote to 400.
|
||||
func TestSentinels_SurviveDoubleWrap(t *testing.T) {
|
||||
doubled := fmt.Errorf("%w: additional context from call site", ErrSelfApproval)
|
||||
if !errors.Is(doubled, ErrSelfApproval) {
|
||||
t.Error("double-wrapped ErrSelfApproval lost domain identity")
|
||||
}
|
||||
if !errors.Is(doubled, ErrForbidden) {
|
||||
t.Error("double-wrapped ErrSelfApproval lost ErrForbidden wrap — M-003 gate would demote to 500")
|
||||
}
|
||||
}
|
||||
+11
-37
@@ -38,31 +38,16 @@ type ExportPEMResult struct {
|
||||
|
||||
// ExportPEM returns the PEM-encoded certificate and chain for the latest version.
|
||||
func (s *ExportService) ExportPEM(ctx context.Context, certID string) (*ExportPEMResult, error) {
|
||||
// Verify certificate exists.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 wrap was `"certificate not found: %w"` on every
|
||||
// certRepo.Get error — which gave the handler's strings.Contains(err.Error(),
|
||||
// "not found") check a false positive on transient DB failures (connection
|
||||
// refused, context deadline, etc.), demoting a 500 to a 404. Now the repo
|
||||
// wraps only the genuine sql.ErrNoRows path with repository.ErrNotFound
|
||||
// (certificate.go Get), so the errors.Is walk through the handler's
|
||||
// errToStatus discriminates correctly: truly-missing → 404, everything else
|
||||
// → 500 (the intended outcome). The wrap text is changed from "certificate
|
||||
// not found" to "failed to get certificate" to match the semantic.
|
||||
// Verify certificate exists
|
||||
cert, err := s.certRepo.Get(ctx, certID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get certificate: %w", err)
|
||||
return nil, fmt.Errorf("certificate not found: %w", err)
|
||||
}
|
||||
|
||||
// Get latest version (contains the PEM chain).
|
||||
//
|
||||
// M-1 (P2): same wrap-text correction as above — pre-M-1 any
|
||||
// GetLatestVersion error surfaced as "no certificate version found",
|
||||
// which bled into the handler's substring classifier. Now the repo wraps
|
||||
// sql.ErrNoRows with repository.ErrNotFound; the wrap chain walks cleanly.
|
||||
// Get latest version (contains the PEM chain)
|
||||
version, err := s.certRepo.GetLatestVersion(ctx, certID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get latest certificate version: %w", err)
|
||||
return nil, fmt.Errorf("no certificate version found: %w", err)
|
||||
}
|
||||
|
||||
// Split PEM chain into leaf cert + chain
|
||||
@@ -88,37 +73,26 @@ func (s *ExportService) ExportPEM(ctx context.Context, certID string) (*ExportPE
|
||||
// The private key is NOT included — it lives on the agent and never touches the control plane.
|
||||
// The PKCS#12 bundle is encrypted with the provided password (can be empty for cert-only bundles).
|
||||
func (s *ExportService) ExportPKCS12(ctx context.Context, certID string, password string) ([]byte, error) {
|
||||
// Verify certificate exists. See M-1 (P2) note on ExportPEM for the wrap-text
|
||||
// correction — same rationale applies here.
|
||||
// Verify certificate exists
|
||||
cert, err := s.certRepo.Get(ctx, certID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get certificate: %w", err)
|
||||
return nil, fmt.Errorf("certificate not found: %w", err)
|
||||
}
|
||||
|
||||
// Get latest version. Same wrap-text correction as ExportPEM.
|
||||
// Get latest version
|
||||
version, err := s.certRepo.GetLatestVersion(ctx, certID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get latest certificate version: %w", err)
|
||||
return nil, fmt.Errorf("no certificate version found: %w", err)
|
||||
}
|
||||
|
||||
// Parse PEM chain into x509.Certificate objects.
|
||||
//
|
||||
// M-1 (P2): wrap both parse-failure paths with ErrUnprocessable so the
|
||||
// handler's errToStatus choke point dispatches to 422 Unprocessable Entity
|
||||
// via errors.Is instead of the pre-M-1 two-term substring net
|
||||
// (`"cannot be parsed"|"no certificates found"`) at handler/export.go:101.
|
||||
// 422 is the correct status here — the caller's request is syntactically
|
||||
// fine; the stored PEM chain is what can't be processed. The composed
|
||||
// Error() string still carries the "certificate data cannot be parsed as
|
||||
// X.509"/"no certificates found in PEM chain" wording so server-side
|
||||
// slog.Error capture and any future 422 body propagation stay readable.
|
||||
// Parse PEM chain into x509.Certificate objects
|
||||
certs, err := parsePEMCertificates(version.PEMChain)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: certificate data cannot be parsed as X.509: %v", ErrUnprocessable, err)
|
||||
return nil, fmt.Errorf("certificate data cannot be parsed as X.509: %w", err)
|
||||
}
|
||||
|
||||
if len(certs) == 0 {
|
||||
return nil, fmt.Errorf("%w: no certificates found in PEM chain", ErrUnprocessable)
|
||||
return nil, fmt.Errorf("no certificates found in PEM chain")
|
||||
}
|
||||
|
||||
// Build PKCS#12 bundle: leaf cert + CA chain (no private key)
|
||||
|
||||
@@ -262,18 +262,10 @@ func (s *IssuerService) Delete(ctx context.Context, id string, actor string) err
|
||||
|
||||
// TestConnection tests the connection to an issuer by instantiating a throwaway
|
||||
// connector and calling ValidateConfig. Records the result in the database.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 wrap was `"issuer not found: %w"` on every
|
||||
// issuerRepo.Get error — which coupled to handler substring classifiers on
|
||||
// "not found" and could have demoted transient DB failures to 404. Now the
|
||||
// repo wraps only the genuine sql.ErrNoRows path with repository.ErrNotFound
|
||||
// (postgres/issuer.go Get), so errors.Is walks the wrap chain correctly:
|
||||
// truly-missing → 404, everything else → 500. The wrap text is changed from
|
||||
// "issuer not found" to "failed to get issuer" to match the semantic.
|
||||
func (s *IssuerService) TestConnection(ctx context.Context, id string) error {
|
||||
iss, err := s.issuerRepo.Get(ctx, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get issuer: %w", err)
|
||||
return fmt.Errorf("issuer not found: %w", err)
|
||||
}
|
||||
|
||||
// Get the decrypted config
|
||||
|
||||
+5
-24
@@ -3,6 +3,7 @@ package service
|
||||
import (
|
||||
"time"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
@@ -15,16 +16,8 @@ import (
|
||||
// approve a renewal job is the same person listed as the owner of the
|
||||
// underlying certificate. M-003 enforces separation of duties: the owner who
|
||||
// requested (or benefits from) the renewal must not be the same identity that
|
||||
// approves it.
|
||||
//
|
||||
// M-1 (P2): wraps [ErrForbidden] via fmt.Errorf("%w", ...) so the single
|
||||
// handler-layer errToStatus choke point resolves HTTP 403 via
|
||||
// errors.Is(err, ErrForbidden). Call sites can continue to
|
||||
// errors.Is(err, ErrSelfApproval) for domain-specific logging — both succeed
|
||||
// because %w builds a wrap chain. Pre-M-1, handler/jobs.go classified the
|
||||
// failure by substring-matching "cannot approve" in the message; a reword
|
||||
// would have silently demoted 403 to 500 with no observable signal.
|
||||
var ErrSelfApproval = fmt.Errorf("%w: self-approval forbidden: actor is the owner of the certificate", ErrForbidden)
|
||||
// approves it. Handlers map this sentinel to HTTP 403 Forbidden.
|
||||
var ErrSelfApproval = errors.New("self-approval forbidden: actor is the owner of the certificate")
|
||||
|
||||
// JobService manages job processing and status tracking.
|
||||
// It coordinates between the scheduler and various job-specific services.
|
||||
@@ -410,17 +403,9 @@ func (s *JobService) GetJob(ctx context.Context, id string) (*domain.Job, error)
|
||||
// duties. Callers must pass a non-empty actor; empty actor is treated as an
|
||||
// anonymous system caller and permitted (internal/system paths).
|
||||
func (s *JobService) ApproveJob(ctx context.Context, id, actor string) error {
|
||||
// M-1 (P2): the pre-M-1 wrap was `"job not found: %w"` on every jobRepo.Get
|
||||
// error — which coupled to the handler's strings.Contains substring classifier
|
||||
// on "not found" and gave false positives on transient DB failures, demoting a
|
||||
// 500 to a 404. Now the repo wraps only the genuine sql.ErrNoRows path with
|
||||
// repository.ErrNotFound (postgres/job.go Get), so errors.Is walks the wrap
|
||||
// chain correctly: truly-missing → 404, everything else → 500. The wrap text
|
||||
// is changed from "job not found" to "failed to get job" to match the
|
||||
// semantic.
|
||||
job, err := s.jobRepo.Get(ctx, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get job: %w", err)
|
||||
return fmt.Errorf("job not found: %w", err)
|
||||
}
|
||||
|
||||
if job.Status != domain.JobStatusAwaitingApproval {
|
||||
@@ -450,13 +435,9 @@ func (s *JobService) ApproveJob(ctx context.Context, id, actor string) error {
|
||||
// owner is permitted to cancel their own pending renewal. actor is recorded
|
||||
// on the log line for audit attribution.
|
||||
func (s *JobService) RejectJob(ctx context.Context, id, reason, actor string) error {
|
||||
// M-1 (P2): wrap-text correction — see ApproveJob for rationale. Same
|
||||
// repo.Get sql.ErrNoRows → repository.ErrNotFound propagation; text changed
|
||||
// from "job not found" to "failed to get job" so transient DB errors stay
|
||||
// 500 while genuine 404s are picked up via errors.Is at errToStatus.
|
||||
job, err := s.jobRepo.Get(ctx, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get job: %w", err)
|
||||
return fmt.Errorf("job not found: %w", err)
|
||||
}
|
||||
|
||||
if job.Status != domain.JobStatusAwaitingApproval {
|
||||
|
||||
@@ -383,15 +383,6 @@ func (s *NotificationService) ListNotifications(ctx context.Context, page, perPa
|
||||
}
|
||||
|
||||
// GetNotification returns a single notification (handler interface method).
|
||||
//
|
||||
// M-1 (P2): the not-found branch now wraps service.ErrNotFound via %w so the
|
||||
// handler's errToStatus choke point dispatches to 404 via errors.Is. Pre-M-1
|
||||
// this returned a raw fmt.Errorf("notification not found") and the handler
|
||||
// relied on a blanket "any error from this service call → 404" shortcut —
|
||||
// which silently demoted transient DB failures (the List() error wrap above)
|
||||
// to 404. Post-M-1 the handler uses errToStatus; the List() error wrap still
|
||||
// surfaces as 500 (no sentinel attached), and only this explicit not-found
|
||||
// branch maps to 404.
|
||||
func (s *NotificationService) GetNotification(ctx context.Context, id string) (*domain.NotificationEvent, error) {
|
||||
filter := &repository.NotificationFilter{
|
||||
PerPage: 1,
|
||||
@@ -409,7 +400,7 @@ func (s *NotificationService) GetNotification(ctx context.Context, id string) (*
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("%w: notification %s", ErrNotFound, id)
|
||||
return nil, fmt.Errorf("notification not found")
|
||||
}
|
||||
|
||||
// MarkAsRead marks a notification as read (handler interface method).
|
||||
@@ -577,13 +568,6 @@ func (s *NotificationService) RetryFailedNotifications(ctx context.Context) erro
|
||||
// "pg: deadlock detected" surfaces in the handler response and the
|
||||
// operator UI. The service has no fallback — a silent "success" that
|
||||
// didn't actually mutate the row would be worse than a loud error.
|
||||
//
|
||||
// M-1 (P2): the repo's Requeue (postgres/notification.go) now wraps
|
||||
// repository.ErrNotFound on zero-rows-affected. Because this method wraps
|
||||
// the repo error with %w, errors.Is(err, repository.ErrNotFound) walks
|
||||
// cleanly through both wrap layers — so the handler's errToStatus choke
|
||||
// point dispatches a genuine missing-row Requeue to 404 and a transient
|
||||
// "pg: deadlock detected" to 500 without substring matching.
|
||||
func (s *NotificationService) RequeueNotification(ctx context.Context, id string) error {
|
||||
if err := s.notifRepo.Requeue(ctx, id); err != nil {
|
||||
return fmt.Errorf("failed to requeue notification: %w", err)
|
||||
|
||||
@@ -139,51 +139,42 @@ func (s *ProfileService) Get(ctx context.Context, id string) (*domain.Certificat
|
||||
}
|
||||
|
||||
// validateProfile checks that a profile's configuration is valid.
|
||||
//
|
||||
// M-1: every return here wraps ErrValidation via %w so the handler's
|
||||
// errToStatus choke point dispatches these to HTTP 400 via errors.Is without
|
||||
// the substring-matching on "invalid"/"required"/"must be"/"cannot" that the
|
||||
// pre-M-1 profile handler relied on. The composed Error() string still
|
||||
// contains the original human-readable text (e.g. "RSA minimum key size must
|
||||
// be at least 2048"), so the handler safely passes err.Error() through to the
|
||||
// response body on the 400 arm. Substring assertions in profile_test.go
|
||||
// continue to match for the same reason.
|
||||
func validateProfile(p *domain.CertificateProfile) error {
|
||||
if p.Name == "" {
|
||||
return fmt.Errorf("%w: profile name is required", ErrValidation)
|
||||
return fmt.Errorf("profile name is required")
|
||||
}
|
||||
if len(p.Name) > 255 {
|
||||
return fmt.Errorf("%w: profile name exceeds 255 characters", ErrValidation)
|
||||
return fmt.Errorf("profile name exceeds 255 characters")
|
||||
}
|
||||
|
||||
// Validate key algorithms
|
||||
for _, alg := range p.AllowedKeyAlgorithms {
|
||||
if !domain.ValidKeyAlgorithms[alg.Algorithm] {
|
||||
return fmt.Errorf("%w: invalid key algorithm: %s (allowed: RSA, ECDSA, Ed25519)", ErrValidation, alg.Algorithm)
|
||||
return fmt.Errorf("invalid key algorithm: %s (allowed: RSA, ECDSA, Ed25519)", alg.Algorithm)
|
||||
}
|
||||
if alg.Algorithm == domain.KeyAlgorithmRSA && alg.MinSize < 2048 {
|
||||
return fmt.Errorf("%w: RSA minimum key size must be at least 2048, got %d", ErrValidation, alg.MinSize)
|
||||
return fmt.Errorf("RSA minimum key size must be at least 2048, got %d", alg.MinSize)
|
||||
}
|
||||
if alg.Algorithm == domain.KeyAlgorithmECDSA && alg.MinSize < 256 {
|
||||
return fmt.Errorf("%w: ECDSA minimum key size must be at least 256, got %d", ErrValidation, alg.MinSize)
|
||||
return fmt.Errorf("ECDSA minimum key size must be at least 256, got %d", alg.MinSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Validate EKUs
|
||||
for _, eku := range p.AllowedEKUs {
|
||||
if !domain.ValidEKUs[eku] {
|
||||
return fmt.Errorf("%w: invalid EKU: %s", ErrValidation, eku)
|
||||
return fmt.Errorf("invalid EKU: %s", eku)
|
||||
}
|
||||
}
|
||||
|
||||
// Validate max TTL
|
||||
if p.MaxTTLSeconds < 0 {
|
||||
return fmt.Errorf("%w: max_ttl_seconds cannot be negative", ErrValidation)
|
||||
return fmt.Errorf("max_ttl_seconds cannot be negative")
|
||||
}
|
||||
|
||||
// Validate short-lived consistency
|
||||
if p.AllowShortLived && p.MaxTTLSeconds >= 3600 {
|
||||
return fmt.Errorf("%w: allow_short_lived is true but max_ttl_seconds (%d) is >= 3600; short-lived certs must have TTL under 1 hour", ErrValidation, p.MaxTTLSeconds)
|
||||
return fmt.Errorf("allow_short_lived is true but max_ttl_seconds (%d) is >= 3600; short-lived certs must have TTL under 1 hour", p.MaxTTLSeconds)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -84,22 +84,10 @@ func (s *SCEPService) PKCSReq(ctx context.Context, csrPEM string, challengePassw
|
||||
// this branch also protects future call sites (tests, library reuse, a
|
||||
// future REST-over-HTTPS wrapper) from silently accepting unauthenticated
|
||||
// CSRs.
|
||||
//
|
||||
// M-1 (P2): both failure modes now wrap service.ErrUnauthenticated via %w so
|
||||
// the handler's errToStatus choke point dispatches them to HTTP 401 via
|
||||
// errors.Is instead of relying on a `strings.Contains(err.Error(), "challenge
|
||||
// password")` substring branch at handler/scep.go:174. This is a deliberate
|
||||
// semantic correction: the pre-M-1 substring branch returned 403 Forbidden,
|
||||
// but SCEP challenge-password failure is an authentication failure (the
|
||||
// caller has no valid credential at the application layer), not an
|
||||
// authorization denial (the caller has a credential but is not permitted),
|
||||
// and 401 Unauthorized is the correct RFC 7235 status. The errToStatus doc
|
||||
// block explicitly cites this site as the canonical ErrUnauthenticated use
|
||||
// case.
|
||||
if s.challengePassword == "" {
|
||||
s.logger.Warn("SCEP enrollment rejected: server has no challenge password configured",
|
||||
"transaction_id", transactionID)
|
||||
return nil, fmt.Errorf("%w: SCEP challenge password not configured on server", ErrUnauthenticated)
|
||||
return nil, fmt.Errorf("SCEP challenge password not configured on server")
|
||||
}
|
||||
// Constant-time compare avoids leaking the configured secret through
|
||||
// response-time variance. ConstantTimeCompare returns 1 only when both
|
||||
@@ -108,7 +96,7 @@ func (s *SCEPService) PKCSReq(ctx context.Context, csrPEM string, challengePassw
|
||||
if subtle.ConstantTimeCompare([]byte(challengePassword), []byte(s.challengePassword)) != 1 {
|
||||
s.logger.Warn("SCEP enrollment rejected: invalid challenge password",
|
||||
"transaction_id", transactionID)
|
||||
return nil, fmt.Errorf("%w: invalid challenge password", ErrUnauthenticated)
|
||||
return nil, fmt.Errorf("invalid challenge password")
|
||||
}
|
||||
|
||||
return s.processEnrollment(ctx, csrPEM, transactionID, "scep_pkcsreq")
|
||||
|
||||
@@ -148,13 +148,6 @@ func TestSCEPService_PKCSReq_ChallengePassword_Invalid(t *testing.T) {
|
||||
if err == nil {
|
||||
t.Fatal("expected error for invalid challenge password")
|
||||
}
|
||||
// M-1 (P2): pin the sentinel wrap so the handler's errToStatus dispatch
|
||||
// (errors.Is → 401 Unauthorized) stays contract-stable. The parallel
|
||||
// substring check is retained because it also pins the user-facing reason
|
||||
// that service.ErrUnauthenticated composes with via fmt.Errorf("%w: ...").
|
||||
if !errors.Is(err, ErrUnauthenticated) {
|
||||
t.Errorf("expected errors.Is(err, ErrUnauthenticated), got: %v", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "challenge password") {
|
||||
t.Errorf("expected 'challenge password' in error, got: %v", err)
|
||||
}
|
||||
@@ -178,12 +171,6 @@ func TestSCEPService_PKCSReq_ChallengePassword_EmptyServerConfigRejected(t *test
|
||||
if err == nil {
|
||||
t.Fatalf("expected rejection when server challenge password is empty (client=%q)", clientPassword)
|
||||
}
|
||||
// M-1 (P2): pin the sentinel wrap on the "no shared secret configured"
|
||||
// branch so both H-2 failure modes route to the same 401 arm at the
|
||||
// handler. Preserves the explanatory substring assertion alongside.
|
||||
if !errors.Is(err, ErrUnauthenticated) {
|
||||
t.Errorf("expected errors.Is(err, ErrUnauthenticated) for client=%q, got: %v", clientPassword, err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "not configured") {
|
||||
t.Errorf("expected 'not configured' in error, got: %v", err)
|
||||
}
|
||||
@@ -206,12 +193,6 @@ func TestSCEPService_PKCSReq_ChallengePassword_ConstantTimeLengthIndependence(t
|
||||
if err == nil {
|
||||
t.Fatalf("expected rejection for bad password %q", bad)
|
||||
}
|
||||
// M-1 (P2): pin sentinel-wrap on every bad-password input so both the
|
||||
// length-mismatch and content-mismatch ConstantTimeCompare paths map to
|
||||
// the same 401 arm.
|
||||
if !errors.Is(err, ErrUnauthenticated) {
|
||||
t.Errorf("expected errors.Is(err, ErrUnauthenticated) for %q, got: %v", bad, err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "invalid challenge password") {
|
||||
t.Errorf("expected 'invalid challenge password' for %q, got: %v", bad, err)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package service
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
@@ -17,15 +18,7 @@ import (
|
||||
// agent. The handler layer maps this to HTTP 400 via [errors.Is]. See C-002 in
|
||||
// cowork/certctl-coverage-gap-audit.md — this sentinel replaces a silent
|
||||
// Postgres FK violation (23503 → HTTP 500) with a deterministic 400.
|
||||
//
|
||||
// M-1 (P2): wraps [ErrValidation] via fmt.Errorf("%w", ...) so the single
|
||||
// handler-layer errToStatus choke point resolves HTTP 400 via
|
||||
// errors.Is(err, ErrValidation). "Referenced FK does not exist in POST body"
|
||||
// is invalid input — not a missing-resource lookup — which is why this wraps
|
||||
// ErrValidation (400) rather than ErrNotFound (404). Pre-M-1, target handlers
|
||||
// classified this via strings.Contains(err.Error(), "not found"); a message
|
||||
// reword would have silently promoted 400 to 500 with no observable signal.
|
||||
var ErrAgentNotFound = fmt.Errorf("%w: referenced agent does not exist", ErrValidation)
|
||||
var ErrAgentNotFound = errors.New("referenced agent does not exist")
|
||||
|
||||
// validTargetTypes is the set of allowed target types for validation.
|
||||
var validTargetTypes = map[domain.TargetType]bool{
|
||||
@@ -222,20 +215,10 @@ func (s *TargetService) Delete(ctx context.Context, id string, actor string) err
|
||||
// TestConnection tests a target's connectivity by checking the assigned agent's heartbeat status.
|
||||
// Target connectors run on agents, not on the server, so we can't instantiate a connector here.
|
||||
// Instead, we verify the agent is online and reachable.
|
||||
//
|
||||
// M-1 (P2): the pre-M-1 wraps were `"target not found: %w"` and
|
||||
// `"assigned agent not found: %w"` on every targetRepo.Get / agentRepo.Get error
|
||||
// — which coupled to pre-M-1 handler strings.Contains substring classifiers on
|
||||
// "not found" and gave false positives on transient DB failures. Now both
|
||||
// repos wrap only the genuine sql.ErrNoRows path with repository.ErrNotFound,
|
||||
// so errors.Is walks the chain correctly: truly-missing → 404, everything else
|
||||
// → 500. The wrap text is changed to "failed to get target" / "failed to get
|
||||
// agent" to match the semantic (agent-misconfigured stays 500 — "assigned
|
||||
// agent" is a data-integrity issue, not a consumer 404).
|
||||
func (s *TargetService) TestConnection(ctx context.Context, id string) error {
|
||||
target, err := s.targetRepo.Get(ctx, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get target: %w", err)
|
||||
return fmt.Errorf("target not found: %w", err)
|
||||
}
|
||||
|
||||
if target.AgentID == "" {
|
||||
@@ -246,7 +229,7 @@ func (s *TargetService) TestConnection(ctx context.Context, id string) error {
|
||||
agent, err := s.agentRepo.Get(ctx, target.AgentID)
|
||||
if err != nil {
|
||||
s.updateTestStatus(ctx, target, "failed")
|
||||
return fmt.Errorf("failed to get assigned agent: %w", err)
|
||||
return fmt.Errorf("assigned agent not found: %w", err)
|
||||
}
|
||||
|
||||
// I-004: AgentRepository.Get intentionally surfaces retired rows (the banner
|
||||
|
||||
@@ -51,6 +51,15 @@ export interface CertificateVersion {
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
// G-2 (P1): `api_key_hash` is intentionally absent from this interface.
|
||||
// The server-side struct (internal/domain/connector.go::Agent) carries
|
||||
// the field for the auth-lookup path but redacts it via a custom
|
||||
// MarshalJSON so it never reaches the JSON wire. Adding `api_key_hash`
|
||||
// here would not magically populate it on the wire — and would mislead
|
||||
// future contributors into thinking the field is part of the public
|
||||
// API contract. See docs/architecture.md ER-diagram note and
|
||||
// coverage-gap-audit-2026-04-24-v5/unified-audit.md cat-s5-apikey_leak
|
||||
// for the closure rationale.
|
||||
export interface Agent {
|
||||
id: string;
|
||||
name: string;
|
||||
|
||||
Reference in New Issue
Block a user