diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5d5802..9faeae4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -213,6 +213,63 @@ jobs: exit 1 fi + - name: Forbidden migration mount in compose initdb (U-3) + # U-3 closed cat-u-seed_initdb_schema_drift (GitHub #10) by + # eliminating the dual-source-of-truth between + # `migrations/*.up.sql` mounted into postgres + # `/docker-entrypoint-initdb.d/` and the same files re-applied at + # runtime by `RunMigrations`. Pre-U-3 every new migration that + # the seed depended on (000013 added `policy_rules.severity`, + # 000017 renames `retry_interval_seconds`, etc.) had to be added + # by hand to the compose mount list; missing the update crashed + # initdb on first boot, postgres flagged unhealthy, and the + # whole stack failed to start from a fresh clone. Post-U-3 the + # server is the single source of truth — `RunMigrations` + + # `RunSeed` apply everything at boot. + # + # This step grep-fails the build if any compose file under + # `deploy/` re-introduces a `migrations/.*\.sql` mount into + # `/docker-entrypoint-initdb.d`. Comments are exempt so the + # post-fix rationale block in the compose files (which + # documents WHY the mounts were removed) doesn't trip the guard. + # The demo overlay's `seed_demo.sql` is the explicit exception: + # it is tolerated only when it lives behind the + # CERTCTL_DEMO_SEED env var (post-U-3 demo path) — bare initdb + # mounts are NOT tolerated. The grep matches all compose + # mount-list shapes (`-` indented, `volumes:` indented, both), + # so any future drift surfaces here before the operator hits it + # on a fresh clone. + # + # See coverage-gap-audit-2026-04-24-v5/unified-audit.md + # cat-u-seed_initdb_schema_drift for the closure rationale, or + # internal/repository/postgres/db.go::RunSeed for the runtime + # contract. + run: | + set -e + + BAD=$(grep -rnEH \ + -e 'migrations/.*\.sql:.*docker-entrypoint-initdb' \ + -e 'seed.*\.sql:.*docker-entrypoint-initdb' \ + deploy/docker-compose.yml \ + deploy/docker-compose.test.yml \ + deploy/docker-compose.demo.yml \ + 2>/dev/null \ + | grep -vE '^\s*[^:]+:[0-9]+:\s*#' \ + || true) + if [ -n "$BAD" ]; then + echo "U-3 regression: migration/seed mount into postgres initdb reappeared:" + echo "$BAD" + echo "" + echo "The post-U-3 contract is: postgres comes up with an empty" + echo "schema and the server applies migrations + seed at boot via" + echo "internal/repository/postgres.RunMigrations + RunSeed. Demo" + echo "data lives behind CERTCTL_DEMO_SEED=true (RunDemoSeed)," + echo "not an initdb mount. See" + echo "coverage-gap-audit-2026-04-24-v5/unified-audit.md" + echo "cat-u-seed_initdb_schema_drift for the closure rationale." + exit 1 + fi + - name: Race Detection run: go test -race ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/scheduler/... ./internal/connector/... ./internal/crypto/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... -count=1 -timeout 300s diff --git a/CHANGELOG.md b/CHANGELOG.md index 33c7a3f..9c44951 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,42 @@ All notable changes to certctl are documented in this file. Dates use ISO 8601. ## [unreleased] — 2026-04-24 +### U-3: GitHub #10 reopened — fresh-clone first-up postgres init failure (P1) — closed end-to-end + +> Operator `mikeakasully` cloned v2.0.50 fresh, ran the canonical quickstart `docker compose -f deploy/docker-compose.yml up -d --build`, and postgres reported `unhealthy` indefinitely; dependent containers (certctl-server, certctl-agent) never started. Root cause: the deploy compose stack mounted both a hand-curated subset of `migrations/*.up.sql` and `seed.sql` into postgres `/docker-entrypoint-initdb.d/`. Postgres applied them at initdb time. Once `seed.sql` referenced columns added by migrations *after* the mounted cutoff (e.g., `policy_rules.severity` from migration 000013, which the mount list never included), initdb crashed mid-seed and the container loop wedged. Two sources of truth — the mount list and the in-tree migration ladder — diverged the moment a seed-touching migration shipped, and the only thing that fixed it was hand-editing the compose file every release. The U-3 closure removes the dual source: postgres now boots empty and the server applies the entire migration ladder + seed at startup via `RunMigrations` + `RunSeed`. Same pattern Helm has used since day one. Bundled with four ride-along audit findings whose fixes are in adjacent code (column rename, missing column, dropped orphan columns, new build-identity endpoint) so operators take the schema-change pain only once. + +### Breaking Changes + +- **`deploy/docker-compose.yml` postgres no longer initdb-mounts the migration files or `seed.sql`.** Operators running on a populated `postgres_data` volume from a pre-U-3 release see no behavioral change (the schema is already in place; `RunMigrations` is `IF NOT EXISTS` and `RunSeed` is `ON CONFLICT DO NOTHING`). Operators running on a *fresh* clone now rely on the server to apply both — which is the bug fix. There is no rollback path other than re-introducing the dual-source-of-truth hazard. See `internal/repository/postgres/db.go::RunSeed` for the runtime contract. +- **`migrations/000017_db_coupling_cleanup.up.sql` renames `renewal_policies.retry_interval_minutes` → `retry_interval_seconds`.** The column always held seconds; the column name lied (`cat-o-retry_interval_unit_mismatch`). Operators running raw SQL against the old name need to update their queries. The Go layer (`internal/repository/postgres/renewal_policy.go`) is updated in lockstep so the in-tree code path is unaffected. +- **`migrations/000017_db_coupling_cleanup.up.sql` drops `network_scan_targets.health_check_enabled` and `network_scan_targets.health_check_interval_seconds`.** These columns were declared by a long-ago migration but never wired into Go code (`cat-o-health_check_column_orphans`) — schema noise that confused operators reading raw SQL. Anyone with custom dashboards selecting those columns will break. +- **The compose demo overlay (`deploy/docker-compose.demo.yml`) no longer initdb-mounts `seed_demo.sql`.** It now sets `CERTCTL_DEMO_SEED=true` and the server applies the demo seed at boot via `RunDemoSeed` after baseline migrations + seed.sql are in place. Same single-source-of-truth pattern as the production path. + +### Added + +- **Migration `000017_db_coupling_cleanup`** (up + down). Bundles three schema changes in idempotent SQL: (1) rename `renewal_policies.retry_interval_minutes` → `retry_interval_seconds` (DO $$ guard so re-application is safe), (2) add `notification_events.created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()`, (3) drop the orphan `network_scan_targets.health_check_*` columns. Reduces operator-visible "schema-change releases" from four to one. +- **`internal/repository/postgres.RunSeed`** — runtime equivalent of the deleted initdb mount for `seed.sql`. Called from `cmd/server/main.go` immediately after `RunMigrations`. Idempotent (every INSERT in the shipped seed uses `ON CONFLICT (id) DO NOTHING`); missing-file is a no-op so operators with custom packaging that strips the seed don't break. +- **`internal/repository/postgres.RunDemoSeed`** + **`config.DatabaseConfig.DemoSeed`** + **`CERTCTL_DEMO_SEED` env var.** Replaces the deleted `seed_demo.sql` initdb mount. The compose demo overlay sets `CERTCTL_DEMO_SEED=true` and the server applies the demo seed after baseline. Same idempotency contract as the baseline path. Default-off so a vanilla deploy never lands fake-history rows. +- **`GET /api/v1/version` endpoint** + **`internal/api/handler.VersionHandler`**. Returns `{version, commit, modified, build_time, go_version}` from `runtime/debug.ReadBuildInfo()` with ldflags-supplied `Version` taking priority. Wired through the no-auth dispatch in `cmd/server/main.go` so probes and rollout systems can read build identity without Bearer credentials. Audit middleware excludes the path so rollout polls don't dominate the audit trail. Closes `cat-u-no_version_endpoint`. +- **`notification_events.created_at` column** is now populated by `NotificationRepository.Create` (with a `time.Now()` fallback when the caller leaves it zero) and read back by `scanNotification`. Pre-U-3 the JSON API serialised `0001-01-01T00:00:00Z` — closes `cat-o-notification_created_at_dead_field`. +- **Five regression tests** for the U-3 contract: `TestRunSeed_AppliesIdempotently`, `TestRunSeed_MissingFileIsNoOp`, `TestRunDemoSeed_AppliesIdempotently`, `TestMigration000017_RetryIntervalRename`, `TestMigration000017_NotificationCreatedAt`, `TestMigration000017_HealthCheckOrphansDropped`, plus `TestNotificationRepository_CreatedAt_IsPersisted` / `TestNotificationRepository_CreatedAt_DefaultsToNow` for the round-trip. All testcontainers-gated (skipped under `-short`). Three handler-layer unit tests pin `/api/v1/version` (`TestVersion_ReturnsBuildInfo`, `TestVersion_RejectsNonGet`, `TestVersion_LdflagsOverride`). +- **CI regression guardrail** in `.github/workflows/ci.yml` (`Forbidden migration mount in compose initdb (U-3)`) — grep-fails the build if any `migrations/.*\.sql` or `seed.*\.sql` file is re-mounted into `/docker-entrypoint-initdb.d` in any compose file. Catches future drift before a fresh-clone operator hits it. + +### Changed + +- **`deploy/docker-compose.yml`** + **`deploy/docker-compose.test.yml`** — postgres `volumes:` no longer mount migrations or seed files; postgres healthcheck gains `start_period: 30s`; certctl-server healthcheck gains `start_period: 30s` to absorb the runtime migration + seed application window on first boot. +- **`deploy/docker-compose.demo.yml`** — replaces the `seed_demo.sql` initdb mount with the `CERTCTL_DEMO_SEED=true` env var on `certctl-server`. +- **`migrations/seed.sql`** — `INSERT INTO renewal_policies` updated to use the new `retry_interval_seconds` column name (lockstep with migration 000017). +- **`internal/repository/postgres/renewal_policy.go`** — column references updated to `retry_interval_seconds` across SELECT, INSERT, and UPDATE sites (lockstep with migration 000017). + +### Closed audit findings + +- `cat-u-seed_initdb_schema_drift` (P1, primary U-3 finding) +- `cat-o-retry_interval_unit_mismatch` (P1) +- `cat-o-notification_created_at_dead_field` (P2) +- `cat-o-health_check_column_orphans` (P1) +- `cat-u-no_version_endpoint` (P2) + ### G-1: JWT silent auth downgrade — closed end-to-end > Pre-G-1 the config validator accepted `CERTCTL_AUTH_TYPE=jwt` and the startup log faithfully echoed `"authentication enabled" "type"="jwt"`. Reasonable people read that and concluded JWT was on. It wasn't. The auth-middleware wiring at `cmd/server/main.go` unconditionally routed every request through the api-key bearer middleware regardless of `cfg.Auth.Type`. So `CERTCTL_AUTH_TYPE=jwt` quietly compared incoming `Authorization: Bearer ` against whatever string the operator put in `CERTCTL_AUTH_SECRET` — real JWT clients got 401, and operators who treated `CERTCTL_AUTH_SECRET` as a *signing* secret (because they thought they were configuring JWT) had effectively handed an attacker an api-key. A security finding masquerading as a config option. We chose to remove the option rather than ship JWT middleware — the audit-recommended structural fix that closes the hazard. Operators who actually need JWT/OIDC front certctl with an authenticating gateway (oauth2-proxy / Envoy `ext_authz` / Traefik `ForwardAuth` / Pomerium / Authelia) and run the upstream certctl with `CERTCTL_AUTH_TYPE=none`. The same pattern works on docker-compose and Helm. diff --git a/api/openapi.yaml b/api/openapi.yaml index 6c4eaed..51653d0 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -163,6 +163,50 @@ paths: "401": description: Unauthorized + /api/v1/version: + get: + tags: [Health] + summary: Build identity (version, commit, Go runtime) + description: | + Returns the running server's build identity. Served without + auth so rollout systems and blackbox probes can read it without + Bearer credentials. U-3 ride-along (cat-u-no_version_endpoint). + Excluded from audit logging because rollout polling would + otherwise dominate the audit trail. + + The Version field follows a fallback ladder: ldflags-supplied + value > VCS commit SHA > "dev". Commit / Modified / BuildTime + come from runtime/debug.BuildInfo (Go 1.18+ stamps these on + every module-tracked build). GoVersion is runtime.Version(). + security: [] + operationId: getVersion + responses: + "200": + description: Build identity + content: + application/json: + schema: + type: object + required: [version, commit, modified, build_time, go_version] + properties: + version: + type: string + description: Release tag (ldflags-supplied) or VCS SHA fallback or "dev" + example: v2.0.51 + commit: + type: string + description: Git SHA from runtime/debug.BuildInfo (vcs.revision); empty when not VCS-tracked + modified: + type: boolean + description: True when build had uncommitted changes (vcs.modified) + build_time: + type: string + description: RFC 3339 build timestamp (vcs.time); empty when not VCS-tracked + go_version: + type: string + description: Go toolchain version that compiled the binary (runtime.Version()) + example: go1.25.9 + # ─── Certificates ──────────────────────────────────────────────────── /api/v1/certificates: get: diff --git a/cmd/server/main.go b/cmd/server/main.go index 84dadb1..77dbd5b 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -86,6 +86,41 @@ func main() { } logger.Info("migrations completed") + // Apply baseline seed data. + // + // U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 seed.sql was mounted + // into postgres `/docker-entrypoint-initdb.d/` alongside a hand-curated + // subset of migrations. Adding a migration that introduced a new column + // referenced by seed.sql (cat-o-retry_interval_unit_mismatch / + // policy_rules.severity / etc.) without also updating the compose volume + // mounts caused initdb to crash on first up. Post-U-3 the compose stack + // drops all initdb mounts; postgres comes up with empty schema, the + // server runs RunMigrations above, then this RunSeed call lands the + // baseline data — all from a single source of truth (this binary). + // See internal/repository/postgres/db.go::RunSeed for the contract. + logger.Info("applying baseline seed", "path", cfg.Database.MigrationsPath) + if err := postgres.RunSeed(db, cfg.Database.MigrationsPath); err != nil { + logger.Error("failed to apply seed data", "error", err) + os.Exit(1) + } + logger.Info("seed completed") + + // Apply demo overlay seed when CERTCTL_DEMO_SEED=true. Pre-U-3 the demo + // overlay (deploy/docker-compose.demo.yml) mounted seed_demo.sql into + // postgres `/docker-entrypoint-initdb.d/`; that broke once U-3 dropped + // the initdb migration mounts (the demo seed references tables that + // wouldn't exist at initdb time). The runtime path here is the + // post-U-3 replacement. Default-off so a vanilla deploy never lands + // fake-history rows. See postgres.RunDemoSeed for the contract. + if cfg.Database.DemoSeed { + logger.Info("applying demo seed (CERTCTL_DEMO_SEED=true)", "path", cfg.Database.MigrationsPath) + if err := postgres.RunDemoSeed(db, cfg.Database.MigrationsPath); err != nil { + logger.Error("failed to apply demo seed data", "error", err) + os.Exit(1) + } + logger.Info("demo seed completed") + } + // Initialize repositories with real PostgreSQL connection auditRepo := postgres.NewAuditRepository(db) certificateRepo := postgres.NewCertificateRepository(db) @@ -406,6 +441,13 @@ func main() { statsHandler := handler.NewStatsHandler(statsService) metricsHandler := handler.NewMetricsHandler(statsService, time.Now()) healthHandler := handler.NewHealthHandler(cfg.Auth.Type) + // U-3 ride-along (cat-u-no_version_endpoint, P2): the version handler + // answers GET /api/v1/version with build identity (ldflags Version, + // VCS commit/dirty/timestamp, Go runtime version). Wired through the + // no-auth dispatch + audit ExcludePaths below so probes and rollout + // systems can read it without Bearer credentials and without flooding + // the audit trail. + versionHandler := handler.NewVersionHandler() discoveryHandler := handler.NewDiscoveryHandler(discoveryService) networkScanHandler := handler.NewNetworkScanHandler(networkScanService) verificationService := service.NewVerificationService(jobRepo, auditService, logger) @@ -554,6 +596,7 @@ func main() { Digest: *digestHandler, HealthChecks: healthCheckHandler, BulkRevocation: bulkRevocationHandler, + Version: versionHandler, }) // Register EST (RFC 7030) handlers if enabled if cfg.EST.Enabled { @@ -690,10 +733,15 @@ func main() { }, ) auditMiddleware := middleware.NewAuditLog(auditAdapter, middleware.AuditConfig{ - ExcludePaths: []string{"/health", "/ready"}, + // /api/v1/version is excluded for the same reason /health and /ready + // are: rollout systems and blackbox probes hammer it on a tight + // interval, and the audit trail's value comes from rare, + // operator-authored mutations — not from sub-second readonly polls. + // U-3 ride-along (cat-u-no_version_endpoint, P2). + ExcludePaths: []string{"/health", "/ready", "/api/v1/version"}, Logger: logger, }) - logger.Info("API audit logging enabled (excluding /health, /ready)") + logger.Info("API audit logging enabled (excluding /health, /ready, /api/v1/version)") middlewareStack := []func(http.Handler) http.Handler{ middleware.RequestID, @@ -889,6 +937,7 @@ func preflightSCEPChallengePassword(enabled bool, challengePassword string) erro // Dispatch rules (M-001, audit 2026-04-19, option D): // // - /health, /ready, /api/v1/auth/info → no-auth (probes + login detection) +// - /api/v1/version → no-auth (U-3 ride-along: build identity for rollout/probes) // - /.well-known/pki/* → no-auth (RFC 5280 CRL, RFC 6960 OCSP) // - /.well-known/est/* → no-auth (RFC 7030 §3.2.3) // - /scep, /scep/* → no-auth (RFC 8894 §3.2, CSR challengePassword) @@ -914,10 +963,12 @@ func buildFinalHandler(apiHandler, noAuthHandler http.Handler, webDir string, da return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { path := r.URL.Path - // Health/ready and auth/info bypass auth middleware. + // Health/ready, auth/info, and version bypass auth middleware. // Health/ready: Docker/K8s health probes don't carry Bearer tokens. // auth/info: React app calls this before login to detect auth mode. - if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" { + // version: U-3 ride-along (cat-u-no_version_endpoint) — rollout + // systems and blackbox probes need build identity without a key. + if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" || path == "/api/v1/version" { noAuthHandler.ServeHTTP(w, r) return } diff --git a/deploy/docker-compose.demo.yml b/deploy/docker-compose.demo.yml index 22d0aa4..c1b3dd7 100644 --- a/deploy/docker-compose.demo.yml +++ b/deploy/docker-compose.demo.yml @@ -7,8 +7,20 @@ # To start fresh (wipe previous data): # docker compose -f docker-compose.yml -f docker-compose.demo.yml down -v # docker compose -f docker-compose.yml -f docker-compose.demo.yml up --build - +# +# U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 this overlay mounted +# `seed_demo.sql` into postgres `/docker-entrypoint-initdb.d/`. That worked +# only because the production stack also mounted the migrations there, so +# the schema existed at initdb time. Once U-3 dropped the production +# initdb mounts (single source of truth: server runs RunMigrations + RunSeed +# at boot), the demo seed could no longer be applied at initdb time — the +# tables it references wouldn't exist yet. +# +# Post-U-3 the demo overlay just sets CERTCTL_DEMO_SEED=true; the server +# applies seed_demo.sql at boot via postgres.RunDemoSeed AFTER baseline +# migrations + seed.sql are in place. Same single source of truth, no +# initdb mounts, no schema-vs-seed drift. services: - postgres: - volumes: - - ../migrations/seed_demo.sql:/docker-entrypoint-initdb.d/030_seed_demo.sql + certctl-server: + environment: + CERTCTL_DEMO_SEED: "true" diff --git a/deploy/docker-compose.test.yml b/deploy/docker-compose.test.yml index 07e2f3a..6bf869e 100644 --- a/deploy/docker-compose.test.yml +++ b/deploy/docker-compose.test.yml @@ -93,6 +93,17 @@ services: # --------------------------------------------------------------------------- # Database # --------------------------------------------------------------------------- + # + # U-3 (P1, cat-u-seed_initdb_schema_drift, GitHub #10): the test stack used + # to mount a hand-curated subset of migrations + seed.sql + a never-checked-in + # seed_test.sql into postgres `/docker-entrypoint-initdb.d/`. Same hazard as + # the production compose — initdb crashed any time a new migration shipped + # that the seed depended on without the mount list being updated. Post-U-3 + # the schema is built EXCLUSIVELY by the server at startup via + # internal/repository/postgres.RunMigrations + RunSeed. Postgres comes up + # empty and the server lands the full ladder + baseline seed in one shot. + # `start_period: 30s` matches the production compose and shields slow CI + # runners from healthcheck flap during initdb. postgres: image: postgres:16-alpine container_name: certctl-test-postgres @@ -102,19 +113,6 @@ services: POSTGRES_PASSWORD: testpass volumes: - test_postgres_data:/var/lib/postgresql/data - - ../migrations/000001_initial_schema.up.sql:/docker-entrypoint-initdb.d/001_schema.sql - - ../migrations/000002_agent_metadata.up.sql:/docker-entrypoint-initdb.d/002_agent_metadata.sql - - ../migrations/000003_certificate_profiles.up.sql:/docker-entrypoint-initdb.d/003_certificate_profiles.sql - - ../migrations/000004_agent_groups.up.sql:/docker-entrypoint-initdb.d/004_agent_groups.sql - - ../migrations/000005_revocation.up.sql:/docker-entrypoint-initdb.d/005_revocation.sql - - ../migrations/000006_discovery.up.sql:/docker-entrypoint-initdb.d/006_discovery.sql - - ../migrations/000007_network_discovery.up.sql:/docker-entrypoint-initdb.d/007_network_discovery.sql - - ../migrations/000008_verification.up.sql:/docker-entrypoint-initdb.d/008_verification.sql - - ../migrations/000009_issuer_config.up.sql:/docker-entrypoint-initdb.d/009_issuer_config.sql - - ../migrations/000010_target_config.up.sql:/docker-entrypoint-initdb.d/010_target_config.sql - - ../migrations/seed.sql:/docker-entrypoint-initdb.d/020_seed.sql - - ../migrations/seed_test.sql:/docker-entrypoint-initdb.d/025_seed_test.sql - # No seed_demo.sql — start with a clean database for real testing networks: certctl-test: ipv4_address: 10.30.50.2 @@ -125,6 +123,7 @@ services: interval: 5s timeout: 5s retries: 5 + start_period: 30s restart: unless-stopped # --------------------------------------------------------------------------- diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 59dc5da..fba43d4 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -53,6 +53,29 @@ services: - certctl-network # PostgreSQL database + # + # U-3 (P1, cat-u-seed_initdb_schema_drift, GitHub #10): + # Pre-U-3 this stack mounted a hand-curated subset of `migrations/*.up.sql` + # plus `seed.sql` into `/docker-entrypoint-initdb.d/`, and postgres + # initdb-applied them on first boot. The mount list rotted every time a + # new migration shipped that the seed depended on (000013 added + # policy_rules.severity, 000017 renames retry_interval_minutes, etc.) — + # initdb crashed, the container reported `unhealthy` indefinitely, and + # `docker compose -f deploy/docker-compose.yml up -d --build` from a + # fresh clone of v2.0.50 hit it on the first try. + # + # Post-U-3 the schema is built EXCLUSIVELY by the server at startup via + # internal/repository/postgres.RunMigrations + RunSeed. Single source of + # truth, no list to keep in sync. Postgres comes up empty; the server + # waits for it healthy, then applies the full migration ladder + seed in + # one shot. Helm + the dev examples were already runtime-only (Path B) + # and worked through the same window. + # + # `start_period: 30s` gives postgres room to bootstrap on slow runners + # (CI macOS, low-spec laptops) before the healthcheck failure counter + # starts ticking. Pre-U-3 a slow first-init combined with the + # `unhealthy` flap to cascade into certctl-server's `service_healthy` + # depends_on, blocking the whole stack. postgres: image: postgres:16-alpine container_name: certctl-postgres @@ -64,17 +87,6 @@ services: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data - - ../migrations/000001_initial_schema.up.sql:/docker-entrypoint-initdb.d/001_schema.sql - - ../migrations/000002_agent_metadata.up.sql:/docker-entrypoint-initdb.d/002_agent_metadata.sql - - ../migrations/000003_certificate_profiles.up.sql:/docker-entrypoint-initdb.d/003_certificate_profiles.sql - - ../migrations/000004_agent_groups.up.sql:/docker-entrypoint-initdb.d/004_agent_groups.sql - - ../migrations/000005_revocation.up.sql:/docker-entrypoint-initdb.d/005_revocation.sql - - ../migrations/000006_discovery.up.sql:/docker-entrypoint-initdb.d/006_discovery.sql - - ../migrations/000007_network_discovery.up.sql:/docker-entrypoint-initdb.d/007_network_discovery.sql - - ../migrations/000008_verification.up.sql:/docker-entrypoint-initdb.d/008_verification.sql - - ../migrations/000009_issuer_config.up.sql:/docker-entrypoint-initdb.d/009_issuer_config.sql - - ../migrations/000010_target_config.up.sql:/docker-entrypoint-initdb.d/010_target_config.sql - - ../migrations/seed.sql:/docker-entrypoint-initdb.d/020_seed.sql networks: - certctl-network healthcheck: @@ -82,6 +94,7 @@ services: interval: 5s timeout: 5s retries: 5 + start_period: 30s restart: unless-stopped # Certctl Server (API + scheduler) @@ -127,6 +140,11 @@ services: interval: 10s timeout: 5s retries: 5 + # U-3: server boot now does RunMigrations + RunSeed before listening on + # 8443. On a fresh clone the full migration ladder + seed application + # can take ~10s on a small VM; start_period prevents the first few + # healthcheck attempts from counting as failures while that work runs. + start_period: 30s restart: unless-stopped logging: driver: "json-file" diff --git a/docs/architecture.md b/docs/architecture.md index c07d5a8..3d09cfe 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -353,7 +353,7 @@ The ER diagram above documents **database shape**, not REST-API wire shape. Seve - `agents.api_key_hash` — SHA-256 of the agent's plaintext API key, populated by `service.RegisterAgent` (`hashAPIKey(apiKey)` at `internal/service/agent.go`) and consumed by `repository.AgentRepository::GetByAPIKey` for the auth-lookup. **Not** exposed via the REST API, **not** echoed via CLI / MCP / agent registration response, **never** logged. Enforced by `internal/domain/connector.go::Agent.MarshalJSON` (G-2 audit closure, `cat-s5-apikey_leak`); the OpenAPI Agent schema explicitly excludes the field, the frontend `Agent` interface omits it, and a CI grep guardrail at `.github/workflows/ci.yml` blocks reintroduction. - `issuers.config` / `deployment_targets.config` — plaintext jsonb shadow of the AES-GCM-encrypted on-disk blob; the encrypted form lives on `EncryptedConfig []byte` (Go-only field tagged `json:"-"`). -Migrations are idempotent (`IF NOT EXISTS` on all CREATE statements, `ON CONFLICT (id) DO NOTHING` on all seed data) so they're safe to run multiple times — important for Docker Compose where both initdb and the server may run the same SQL. +Migrations are idempotent (`IF NOT EXISTS` on all CREATE statements, `ON CONFLICT (id) DO NOTHING` on all seed data) so they're safe to run multiple times. Pre-U-3 (`cat-u-seed_initdb_schema_drift`, GitHub #10) the deploy compose stack mounted both a hand-curated subset of `migrations/*.up.sql` and `seed.sql` into postgres `/docker-entrypoint-initdb.d/` so initdb applied them on first boot, *and* the server re-applied the same files via `RunMigrations` on every start. The dual source of truth was the bug: every time a migration shipped that the seed depended on (e.g., 000013 added `policy_rules.severity`), the mount list had to be updated by hand, and missing the update crashed initdb on first boot. Post-U-3 the server is the single source of truth: postgres comes up with an empty schema, `RunMigrations` applies the entire ladder, then `RunSeed` lands the baseline seed (and `RunDemoSeed` lands the demo overlay when `CERTCTL_DEMO_SEED=true`). Helm has used this pattern since day one (postgres-init `emptyDir`); the docker-compose deploy now matches. ## Data Flow: Certificate Lifecycle diff --git a/internal/api/handler/version.go b/internal/api/handler/version.go new file mode 100644 index 0000000..5e0ddef --- /dev/null +++ b/internal/api/handler/version.go @@ -0,0 +1,158 @@ +package handler + +import ( + "net/http" + "runtime" + "runtime/debug" +) + +// VersionHandler exposes the running server's build identity at +// /api/v1/version. U-3 ride-along (cat-u-no_version_endpoint, P2): pre-U-3 +// there was no in-band way for an operator (or an automated rollout system) +// to ask "what version of certctl is this binary?" — they had to either read +// the container image tag externally or trust whatever the README said. The +// gap matters for the same operability story U-3 closes: when fresh-clone +// quickstarts fail, the very first question is "what code did I actually +// build", and the only honest answer needs to come from the binary itself. +// +// VersionInfo is populated from three sources, in priority order: +// +// 1. The Version field — typically supplied at build time via +// `-ldflags='-X github.com/shankar0123/certctl/internal/api/handler.Version=v2.0.50'`. +// Production releases set this from the git tag (see release.yml). +// +// 2. runtime/debug.ReadBuildInfo() — populated by Go 1.18+ for any binary +// built from a module. Provides the VCS commit SHA, dirty flag, and +// build timestamp. We read these fields directly so a `go build` from a +// working tree (no -ldflags incantation) still produces a useful +// /api/v1/version payload — the failure mode pre-U-3 was that everything +// looked like "dev" everywhere, which made "is the bug fixed in this +// binary" unanswerable. +// +// 3. Static fallbacks ("dev" / "unknown") — only reached when neither +// ldflags nor build-info are populated, which in practice means +// `go run` from a non-VCS-tracked workspace. +// +// The handler runs through the no-auth bypass dispatch in cmd/server/main.go +// so probes and rollout systems can query it without presenting Bearer +// credentials, mirroring how /health and /ready are reachable. Audit logging +// excludes /api/v1/version for the same reason — the path is hot under +// rollout polling and would otherwise dominate the audit trail. +type VersionHandler struct{} + +// Version is overridden at build time via: +// +// -ldflags='-X github.com/shankar0123/certctl/internal/api/handler.Version=' +// +// release.yml does this for the server container and CLI/agent binaries. +// The empty default (rather than "dev") lets the Handler fall back to the +// runtime/debug VCS revision when ldflags wasn't supplied — preferable to +// returning a literal "dev" that masks the actual git SHA the binary was +// built from. +var Version = "" + +// NewVersionHandler returns a value (not a pointer) to match the +// HealthHandler convention — the handler holds no mutable state and is +// safe to copy. +func NewVersionHandler() VersionHandler { + return VersionHandler{} +} + +// VersionInfo is the JSON shape returned by GET /api/v1/version. +// +// Field ordering and tag names are part of the contract — operator tooling +// (k8s rollout checks, CI smoke tests, /api/v1/version Prometheus blackbox +// probes) parses this payload and must continue to work across releases. +// Don't rename a field without an OpenAPI bump and a deprecation cycle. +type VersionInfo struct { + // Version is the human-readable release identifier (e.g. "v2.0.50"). + // Falls back to the VCS revision when ldflags wasn't set, and to "dev" + // when the build wasn't VCS-tracked at all. + Version string `json:"version"` + + // Commit is the git SHA of HEAD at build time, sourced from + // runtime/debug.BuildInfo.Settings["vcs.revision"]. Empty string when + // the binary was built outside a VCS-tracked workspace (rare — + // `go build` from a tarball does this). + Commit string `json:"commit"` + + // Modified reports whether the build had uncommitted changes + // (debug.BuildInfo.Settings["vcs.modified"]). True for developer + // builds, false for release builds out of CI. + Modified bool `json:"modified"` + + // BuildTime is the RFC 3339 timestamp captured at build time + // (debug.BuildInfo.Settings["vcs.time"]). Empty when not VCS-tracked. + BuildTime string `json:"build_time"` + + // GoVersion is the Go toolchain version that compiled the binary + // (runtime.Version, e.g. "go1.25.9"). Useful when triaging stdlib + // behavior differences ("the deploy that broke was on 1.24, this one + // is on 1.25"). + GoVersion string `json:"go_version"` +} + +// readBuildInfo extracts the VCS settings from debug.BuildInfo and pairs +// them with the ldflags-supplied Version. Split out from ServeHTTP so the +// handler can be unit-tested by injecting synthetic BuildInfo (see +// version_handler_test.go) without depending on the test binary's actual +// debug info. +// +// debug.ReadBuildInfo returns ok=false when the binary was built without +// module info — extremely rare for a Go 1.18+ build, but we guard it so +// the handler degrades to "dev / unknown / runtime.Version()" instead of +// nil-deref panicking. +func readBuildInfo() VersionInfo { + info := VersionInfo{ + Version: Version, + GoVersion: runtime.Version(), + } + + bi, ok := debug.ReadBuildInfo() + if !ok { + // Pre-Go 1.18 binary or a stripped build with no buildinfo segment. + // Both are pathological in 2026 but worth the two-line guard. + if info.Version == "" { + info.Version = "dev" + } + return info + } + + for _, s := range bi.Settings { + switch s.Key { + case "vcs.revision": + info.Commit = s.Value + case "vcs.modified": + // debug.BuildInfo encodes this as the literal string "true" or + // "false"; comparing to "true" is the canonical pattern (mirrors + // how the standard library's own version sub-command parses it). + info.Modified = s.Value == "true" + case "vcs.time": + info.BuildTime = s.Value + } + } + + // Fallback ladder for Version: ldflags > VCS commit > "dev". The git + // SHA is more useful than "dev" because it's at least groundable — an + // operator can `git show ` to see what code is actually running. + if info.Version == "" { + if info.Commit != "" { + info.Version = info.Commit + } else { + info.Version = "dev" + } + } + + return info +} + +// ServeHTTP implements http.Handler. Returns the VersionInfo payload as +// JSON with a 200 status. GET-only — any other method returns 405, matching +// the HealthHandler convention. +func (h VersionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + JSON(w, http.StatusOK, readBuildInfo()) +} diff --git a/internal/api/handler/version_handler_test.go b/internal/api/handler/version_handler_test.go new file mode 100644 index 0000000..4d282b6 --- /dev/null +++ b/internal/api/handler/version_handler_test.go @@ -0,0 +1,108 @@ +package handler + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "runtime" + "strings" + "testing" +) + +// TestVersion_ReturnsBuildInfo is the regression for the U-3 ride-along +// cat-u-no_version_endpoint (P2). Three behaviors must hold for the +// endpoint to be useful in operator tooling: +// +// 1. GET /api/v1/version returns 200 with a JSON body that decodes into +// the documented VersionInfo shape — the wire contract that rollout +// systems and Prometheus blackbox probes parse. +// 2. The Go runtime version always populates (runtime.Version() can never +// return empty), so consumers can always answer "which Go did this +// binary compile with" even when ldflags / VCS info are missing. +// 3. The Version field is never empty — the fallback ladder +// (ldflags > VCS commit > "dev") guarantees a non-empty string so +// consumers don't have to special-case absent values. +// +// We don't pin the exact Version value because it depends on whether the +// test binary was built with -ldflags or under `go test`, both of which +// the handler must tolerate. The "no empty string" check is the +// behavioral contract. +func TestVersion_ReturnsBuildInfo(t *testing.T) { + h := NewVersionHandler() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/version", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want 200", rec.Code) + } + + contentType := rec.Header().Get("Content-Type") + if !strings.HasPrefix(contentType, "application/json") { + t.Errorf("Content-Type = %q, want application/json prefix (operator tooling parses JSON)", contentType) + } + + var got VersionInfo + if err := json.NewDecoder(rec.Body).Decode(&got); err != nil { + t.Fatalf("response body did not decode into VersionInfo: %v\nbody: %s", err, rec.Body.String()) + } + + // Version must never be empty — the fallback ladder in readBuildInfo + // guarantees this. An empty Version would force every downstream + // consumer (k8s rollouts, Prometheus blackbox, the support tooling) + // to special-case the missing value, which defeats the point of + // /api/v1/version existing. + if got.Version == "" { + t.Error("Version is empty — the fallback ladder (ldflags > VCS commit > 'dev') must guarantee a non-empty value") + } + + // GoVersion must equal runtime.Version() — the handler reads it + // directly and cannot be subverted by ldflags or BuildInfo. This is + // the one field that should always be ground-truth. + if got.GoVersion != runtime.Version() { + t.Errorf("GoVersion = %q, want %q (must come straight from runtime.Version())", + got.GoVersion, runtime.Version()) + } +} + +// TestVersion_RejectsNonGet pins the GET-only contract. /api/v1/version +// is read-only build identity; POST/PUT/DELETE etc. are nonsensical and +// should return 405 like the HealthHandler does. Operator tooling that +// fat-fingers the verb gets a clear error rather than a confusing 200 +// from the wrong code path. +func TestVersion_RejectsNonGet(t *testing.T) { + h := NewVersionHandler() + + for _, method := range []string{ + http.MethodPost, http.MethodPut, http.MethodDelete, http.MethodPatch, + } { + req := httptest.NewRequest(method, "/api/v1/version", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusMethodNotAllowed { + t.Errorf("%s /api/v1/version → status %d, want 405", method, rec.Code) + } + } +} + +// TestVersion_LdflagsOverride locks in the priority order: when the +// build-time Version variable is non-empty (e.g. "v2.0.50" injected by +// release.yml), readBuildInfo MUST surface that value verbatim and not +// silently substitute the VCS commit. The release-pipeline contract +// depends on this — a release tagged v2.0.50 should report "v2.0.50", +// not the underlying SHA. +// +// We achieve test isolation by save/restore on the package-level Version +// variable; t.Cleanup ensures parallel/subsequent tests see the original. +func TestVersion_LdflagsOverride(t *testing.T) { + original := Version + t.Cleanup(func() { Version = original }) + + Version = "v2.0.50-test" + got := readBuildInfo() + if got.Version != "v2.0.50-test" { + t.Errorf("Version = %q, want %q (ldflags-supplied Version must take priority over VCS fallback)", + got.Version, "v2.0.50-test") + } +} diff --git a/internal/api/router/router.go b/internal/api/router/router.go index 6bb08e4..c39951f 100644 --- a/internal/api/router/router.go +++ b/internal/api/router/router.go @@ -68,6 +68,11 @@ type HandlerRegistry struct { HealthChecks *handler.HealthCheckHandler BulkRevocation handler.BulkRevocationHandler RenewalPolicies handler.RenewalPolicyHandler + // Version handles GET /api/v1/version (U-3 ride-along, + // cat-u-no_version_endpoint). Wired through the no-auth dispatch in + // cmd/server/main.go so probes and rollout systems can read build + // identity without Bearer credentials. See handler/version.go. + Version handler.VersionHandler } // RegisterHandlers sets up all API routes with their handlers. @@ -89,6 +94,17 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) { middleware.CORS, middleware.ContentType, )) + // Version endpoint (no auth middleware — used by rollout probes that + // don't carry Bearer tokens; the dispatch layer in cmd/server/main.go + // also routes /api/v1/version through the no-auth chain). U-3 ride-along + // (cat-u-no_version_endpoint, P2). The handler reads + // runtime/debug.BuildInfo for VCS attribution; ldflags-supplied Version + // is preferred when present. + r.mux.Handle("GET /api/v1/version", middleware.Chain( + reg.Version, + middleware.CORS, + middleware.ContentType, + )) // Auth check endpoint (uses full middleware chain via r.Register) r.Register("GET /api/v1/auth/check", http.HandlerFunc(reg.Health.AuthCheck)) diff --git a/internal/config/config.go b/internal/config/config.go index 0d9fe3b..4b7bfc1 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -709,6 +709,16 @@ type DatabaseConfig struct { URL string MaxConnections int MigrationsPath string + + // DemoSeed, when true, makes the server apply + // `/seed_demo.sql` after the baseline `seed.sql`. Set + // via CERTCTL_DEMO_SEED. The compose demo overlay + // (deploy/docker-compose.demo.yml) sets this to keep the demo path + // alive after U-3 dropped initdb-mounted seed files. The seed file + // uses ON CONFLICT (id) DO NOTHING so re-running on a populated + // database is safe; missing-file is a no-op (returns nil) so a + // minimal-image deploy that strips seed_demo.sql still boots cleanly. + DemoSeed bool } // SchedulerConfig contains scheduler timing configuration. @@ -921,6 +931,7 @@ func Load() (*Config, error) { URL: getEnv("CERTCTL_DATABASE_URL", "postgres://localhost/certctl"), MaxConnections: getEnvInt("CERTCTL_DATABASE_MAX_CONNS", 25), MigrationsPath: getEnv("CERTCTL_DATABASE_MIGRATIONS_PATH", "./migrations"), + DemoSeed: getEnvBool("CERTCTL_DEMO_SEED", false), }, Scheduler: SchedulerConfig{ RenewalCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_RENEWAL_CHECK_INTERVAL", 1*time.Hour), diff --git a/internal/repository/postgres/db.go b/internal/repository/postgres/db.go index 11b0f4b..361db7d 100644 --- a/internal/repository/postgres/db.go +++ b/internal/repository/postgres/db.go @@ -131,3 +131,111 @@ func RunMigrations(db *sql.DB, migrationsPath string) error { return nil } + +// RunSeed reads and executes the baseline seed SQL file from the migrations +// directory. Designed to run AFTER RunMigrations so every column referenced by +// the seed is already in place. +// +// U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 the deploy compose stack +// mounted both a hand-curated subset of `migrations/*.up.sql` and `seed.sql` +// into postgres `/docker-entrypoint-initdb.d/`. Postgres applied them at +// initdb time. When `seed.sql` was updated to reference columns added by +// migrations *after* the mounted cutoff (e.g., `policy_rules.severity` from +// `000013_policy_rule_severity.up.sql`), initdb crashed during the seed step +// and the container was reported `unhealthy` indefinitely — bare +// `docker compose -f deploy/docker-compose.yml up -d --build` from a fresh +// clone of v2.0.50 hit this on the first try (GitHub #10 reopened by +// mikeakasully). Helm and the example compose files were already runtime- +// only (Path B) and worked through the same window. +// +// Post-U-3 the compose stack drops all initdb mounts; postgres comes up with +// an empty schema; the server applies all migrations via RunMigrations and +// then this function applies the seed. Single source of truth, removes the +// drift hazard architecturally. +// +// The seed file is expected at `/seed.sql`. Missing-file is +// treated as a no-op (returns nil) so deployments that explicitly remove the +// seed (custom packaging, cert-manager managed schemas) don't break. +// +// Idempotency: every INSERT in the shipped seed.sql uses +// `ON CONFLICT (id) DO NOTHING`, so re-running on a populated DB is safe. +// This function is invoked on every server start, so the contract MUST hold. +// +// Demo seed: `seed_demo.sql` is applied separately by RunDemoSeed below +// when CERTCTL_DEMO_SEED=true (see internal/config/config.go::DemoSeed). +// Splitting demo from baseline keeps a default deploy from accidentally +// landing 90-days-of-fake-history into a real customer database, while +// still giving the demo overlay a single source of truth (no more initdb +// mounts). The demo seed itself uses ON CONFLICT (id) DO NOTHING so it's +// idempotent; missing-file is also tolerated (custom packaging may strip +// seed_demo.sql to shrink the image). +func RunSeed(db *sql.DB, migrationsPath string) error { + if _, err := os.Stat(migrationsPath); os.IsNotExist(err) { + return fmt.Errorf("migrations directory not found: %s", migrationsPath) + } + + seedPath := filepath.Join(migrationsPath, "seed.sql") + content, err := os.ReadFile(seedPath) + if err != nil { + if os.IsNotExist(err) { + // Missing seed.sql is acceptable — operators may have removed it + // for custom-packaging reasons. Return nil rather than fail-loud. + return nil + } + return fmt.Errorf("failed to read seed file %s: %w", seedPath, err) + } + + if _, err := db.Exec(string(content)); err != nil { + return fmt.Errorf("failed to execute seed file %s: %w", seedPath, err) + } + + return nil +} + +// RunDemoSeed applies the demo overlay seed file +// (`/seed_demo.sql`) on top of the baseline seed. +// +// U-3 follow-on: pre-U-3 the demo overlay mounted `seed_demo.sql` into +// postgres `/docker-entrypoint-initdb.d/` and relied on initdb to apply it +// alongside the schema. Once U-3 dropped the initdb migration mounts, that +// path stopped working — postgres comes up empty, and the demo seed +// references tables (issuers, certificates, etc.) that wouldn't exist yet +// at initdb time. RunDemoSeed restores the demo capability through the +// same runtime path RunSeed uses, gated by CERTCTL_DEMO_SEED so production +// deploys never accidentally land the fake-history rows. +// +// Order contract: must run AFTER RunSeed so foreign-key references from +// demo rows to baseline rows (e.g., demo certificates referencing +// `rp-default` from baseline) resolve cleanly. The caller in +// cmd/server/main.go enforces this order. +// +// Missing-file is acceptable (returns nil) — operators packaging a +// production-only image often strip seed_demo.sql to shrink the artifact, +// and that should not break boot when CERTCTL_DEMO_SEED happens to be set. +// +// Idempotency: every INSERT in seed_demo.sql uses +// `ON CONFLICT (id) DO NOTHING`, so re-running on a populated DB is safe. +// Server restarts in demo mode therefore re-apply the file harmlessly. +func RunDemoSeed(db *sql.DB, migrationsPath string) error { + if _, err := os.Stat(migrationsPath); os.IsNotExist(err) { + return fmt.Errorf("migrations directory not found: %s", migrationsPath) + } + + seedPath := filepath.Join(migrationsPath, "seed_demo.sql") + content, err := os.ReadFile(seedPath) + if err != nil { + if os.IsNotExist(err) { + // Custom production packaging frequently strips this file. + // Fail-soft to preserve the U-3 contract: a missing seed file + // must not gate server boot. + return nil + } + return fmt.Errorf("failed to read demo seed file %s: %w", seedPath, err) + } + + if _, err := db.Exec(string(content)); err != nil { + return fmt.Errorf("failed to execute demo seed file %s: %w", seedPath, err) + } + + return nil +} diff --git a/internal/repository/postgres/notification.go b/internal/repository/postgres/notification.go index 3f997bb..dae2f7a 100644 --- a/internal/repository/postgres/notification.go +++ b/internal/repository/postgres/notification.go @@ -22,19 +22,37 @@ func NewNotificationRepository(db *sql.DB) *NotificationRepository { return &NotificationRepository{db: db} } -// Create stores a new notification +// Create stores a new notification. +// +// U-3 ride-along (cat-o-notification_created_at_dead_field, P2): the +// `created_at` column is added to notification_events by migration 000017. +// Pre-U-3 the Go domain.NotificationEvent had a CreatedAt field but the +// INSERT path never set it AND no DB column existed — the JSON API +// serialised the field as `0001-01-01T00:00:00Z`, breaking timestamp +// ordering on operator dashboards and any consumer that filtered by age. +// Post-U-3 the column exists with a NOT NULL DEFAULT NOW() backstop, and +// this INSERT explicitly sets it from the domain field. If the caller +// hasn't populated CreatedAt (zero-value time.Time) we substitute +// time.Now() so the row never carries the placeholder zero-time forward +// — the DEFAULT would handle this too, but emitting the value explicitly +// keeps the wire-level JSON consistent with what the row will hold once +// scanNotification reads it back, and prevents a clock-skew gap between +// "Go computed CreatedAt" and "DB applied DEFAULT NOW()" on the read path. func (r *NotificationRepository) Create(ctx context.Context, notif *domain.NotificationEvent) error { if notif.ID == "" { notif.ID = uuid.New().String() } + if notif.CreatedAt.IsZero() { + notif.CreatedAt = time.Now() + } err := r.db.QueryRowContext(ctx, ` INSERT INTO notification_events ( - id, type, certificate_id, channel, recipient, message, sent_at, status, error - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + id, type, certificate_id, channel, recipient, message, sent_at, status, error, created_at + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) RETURNING id `, notif.ID, notif.Type, notif.CertificateID, notif.Channel, notif.Recipient, - notif.Message, notif.SentAt, notif.Status, notif.Error).Scan(¬if.ID) + notif.Message, notif.SentAt, notif.Status, notif.Error, notif.CreatedAt).Scan(¬if.ID) if err != nil { return fmt.Errorf("failed to create notification: %w", err) @@ -102,12 +120,14 @@ func (r *NotificationRepository) List(ctx context.Context, filter *repository.No // Get paginated results. I-005 extends the SELECT with the three retry // columns (retry_count / next_retry_at / last_error) so scanNotification - // can populate the new fields on domain.NotificationEvent. The column - // order here MUST stay in lockstep with scanNotification below. + // can populate the new fields on domain.NotificationEvent. U-3 extends + // it once more with `created_at` (column added by migration 000017) so + // the field is no longer serialized as 0001-01-01. The column order + // here MUST stay in lockstep with scanNotification below. offset := (filter.Page - 1) * filter.PerPage query := fmt.Sprintf(` SELECT id, type, certificate_id, channel, recipient, message, sent_at, status, error, - retry_count, next_retry_at, last_error + retry_count, next_retry_at, last_error, created_at FROM notification_events %s ORDER BY sent_at DESC NULLS LAST @@ -162,8 +182,14 @@ func (r *NotificationRepository) UpdateStatus(ctx context.Context, id string, st // scanNotification scans a notification from a row or rows. // -// I-005 extends the scan list from 9 → 12 columns (adds retry_count, -// next_retry_at, last_error). Every caller — List and the four new retry +// I-005 extended the scan list from 9 → 12 columns (adds retry_count, +// next_retry_at, last_error). U-3 extends it once more to 13 columns by +// appending `created_at` (column added by migration 000017, +// cat-o-notification_created_at_dead_field). CreatedAt scans into a +// non-pointer time.Time because the migration declares the column +// NOT NULL with DEFAULT NOW(). +// +// Every caller — List, ListRetryEligible, and the four other I-005 retry // methods below — funnels rows through this helper, so the SELECT column // order in every query must match the Scan order here exactly. RetryCount // scans into an `int` (migration 000016 declares the column NOT NULL with @@ -176,7 +202,7 @@ func scanNotification(scanner interface { var notif domain.NotificationEvent err := scanner.Scan(¬if.ID, ¬if.Type, ¬if.CertificateID, ¬if.Channel, ¬if.Recipient, ¬if.Message, ¬if.SentAt, ¬if.Status, ¬if.Error, - ¬if.RetryCount, ¬if.NextRetryAt, ¬if.LastError) + ¬if.RetryCount, ¬if.NextRetryAt, ¬if.LastError, ¬if.CreatedAt) if err != nil { return nil, fmt.Errorf("failed to scan notification: %w", err) @@ -248,7 +274,7 @@ func (r *NotificationRepository) ListRetryEligible(ctx context.Context, now time rows, err := r.db.QueryContext(ctx, ` SELECT id, type, certificate_id, channel, recipient, message, sent_at, status, error, - retry_count, next_retry_at, last_error + retry_count, next_retry_at, last_error, created_at FROM notification_events WHERE status = 'failed' AND next_retry_at IS NOT NULL diff --git a/internal/repository/postgres/notification_test.go b/internal/repository/postgres/notification_test.go index 09f629e..bab7ada 100644 --- a/internal/repository/postgres/notification_test.go +++ b/internal/repository/postgres/notification_test.go @@ -339,6 +339,95 @@ func TestNotificationRepository_Requeue(t *testing.T) { } } +// TestNotificationRepository_CreatedAt_IsPersisted is the U-3 ride-along +// regression for cat-o-notification_created_at_dead_field. Pre-U-3 the +// Go domain.NotificationEvent had a CreatedAt field but the DB had no +// column — JSON serialisation produced 0001-01-01T00:00:00Z, breaking +// timestamp ordering on operator dashboards. Post-U-3 migration 000017 +// adds the column NOT NULL DEFAULT NOW(), Create populates it, and +// scanNotification reads it back. +// +// The contract under test is round-trip equivalence: the timestamp the +// caller sets goes into the DB and comes back out unchanged (modulo +// PostgreSQL's microsecond precision). Truncate to microseconds before +// comparing because TIMESTAMPTZ rounds nanoseconds away. +func TestNotificationRepository_CreatedAt_IsPersisted(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + repo := postgres.NewNotificationRepository(db) + ctx := context.Background() + + // A specific, recognisable timestamp. Truncated to microseconds so + // the post-roundtrip equality assertion isn't tripped up by Postgres + // dropping the nanosecond tail. + want := time.Now().UTC().Add(-2 * time.Hour).Truncate(time.Microsecond) + + notif := &domain.NotificationEvent{ + Type: domain.NotificationTypeExpirationWarning, + Channel: domain.NotificationChannelWebhook, + Recipient: "https://hooks.example.com/u3", + Message: "U-3 round-trip witness", + Status: string(domain.NotificationStatusPending), + CreatedAt: want, + } + if err := repo.Create(ctx, notif); err != nil { + t.Fatalf("Create failed: %v", err) + } + + // Re-read via List (which goes through scanNotification) so we're + // testing both the INSERT and SELECT halves of the U-3 plumbing. + got, err := repo.List(ctx, nil) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(got) != 1 { + t.Fatalf("List returned %d rows, want 1", len(got)) + } + if !got[0].CreatedAt.Equal(want) { + t.Errorf("CreatedAt round-trip mismatch:\n set: %v\n got: %v\n"+ + "Pre-U-3 this would have come back as 0001-01-01 because the column didn't exist.", + want, got[0].CreatedAt) + } +} + +// TestNotificationRepository_CreatedAt_DefaultsToNow verifies the helper +// behavior in Create: when the caller hands over an event with the +// zero-value CreatedAt, Create substitutes time.Now() rather than +// trusting the DB DEFAULT. This keeps wire-level JSON consistent with +// what the row will hold once it's read back, and avoids a clock-skew +// gap between "Go computed the timestamp" and "DB applied DEFAULT NOW()". +func TestNotificationRepository_CreatedAt_DefaultsToNow(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + repo := postgres.NewNotificationRepository(db) + ctx := context.Background() + + before := time.Now().UTC().Add(-time.Second) + + notif := &domain.NotificationEvent{ + Type: domain.NotificationTypeExpirationWarning, + Channel: domain.NotificationChannelWebhook, + Recipient: "https://hooks.example.com/zerotime", + Message: "U-3 zero-time fallback", + Status: string(domain.NotificationStatusPending), + // CreatedAt left zero on purpose — the contract is that Create + // fills it in from time.Now() when it's unset. + } + if err := repo.Create(ctx, notif); err != nil { + t.Fatalf("Create failed: %v", err) + } + + after := time.Now().UTC().Add(time.Second) + + if notif.CreatedAt.IsZero() { + t.Fatalf("CreatedAt is still zero after Create — the fallback in NotificationRepository.Create did not fire") + } + if notif.CreatedAt.Before(before) || notif.CreatedAt.After(after) { + t.Errorf("CreatedAt = %v is outside the [%v, %v] window — the substituted time.Now() should fall inside the test's wall-clock bracket", + notif.CreatedAt, before, after) + } +} + // ─── Helpers ────────────────────────────────────────────────────────────── // past returns a stable "5 minutes ago" time for fixture seeding. Truncated diff --git a/internal/repository/postgres/renewal_policy.go b/internal/repository/postgres/renewal_policy.go index 8b6f1bf..547e57a 100644 --- a/internal/repository/postgres/renewal_policy.go +++ b/internal/repository/postgres/renewal_policy.go @@ -36,7 +36,7 @@ func NewRenewalPolicyRepository(db *sql.DB) *RenewalPolicyRepository { // and require domain-layer churn we're not taking on in this change. const renewalPolicyColumns = ` id, name, renewal_window_days, auto_renew, max_retries, - retry_interval_minutes, alert_thresholds_days, created_at, updated_at + retry_interval_seconds, alert_thresholds_days, created_at, updated_at ` // scanRenewalPolicy decodes one renewal_policies row from a Row or Rows @@ -170,7 +170,7 @@ func (r *RenewalPolicyRepository) Create(ctx context.Context, policy *domain.Ren insertSQL := ` INSERT INTO renewal_policies ( id, name, renewal_window_days, auto_renew, max_retries, - retry_interval_minutes, alert_thresholds_days, created_at, updated_at + retry_interval_seconds, alert_thresholds_days, created_at, updated_at ) VALUES ($1, $2, $3, $4, $5, $6, $7, NOW(), NOW()) RETURNING ` + renewalPolicyColumns @@ -240,7 +240,7 @@ func (r *RenewalPolicyRepository) Update(ctx context.Context, id string, policy renewal_window_days = $3, auto_renew = $4, max_retries = $5, - retry_interval_minutes = $6, + retry_interval_seconds = $6, alert_thresholds_days = $7, updated_at = NOW() WHERE id = $1 diff --git a/internal/repository/postgres/renewal_policy_test.go b/internal/repository/postgres/renewal_policy_test.go index eda633f..68fbe78 100644 --- a/internal/repository/postgres/renewal_policy_test.go +++ b/internal/repository/postgres/renewal_policy_test.go @@ -45,7 +45,7 @@ func TestRenewalPolicyRepository_CRUD(t *testing.T) { RenewalWindowDays: 30, AutoRenew: true, MaxRetries: 5, - RetryInterval: 3600, // stored in retry_interval_minutes column; passthrough + RetryInterval: 3600, // stored as seconds in retry_interval_seconds column (renamed in 000017_db_coupling_cleanup, U-3) AlertThresholdsDays: []int{30, 14, 7, 0}, } diff --git a/internal/repository/postgres/repo_test.go b/internal/repository/postgres/repo_test.go index dc38d74..8b35b20 100644 --- a/internal/repository/postgres/repo_test.go +++ b/internal/repository/postgres/repo_test.go @@ -78,7 +78,7 @@ func insertCertPrereqsRaw(t *testing.T, db *sql.DB, ctx context.Context, suffix } // Create renewal policy - _, err = db.ExecContext(ctx, `INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_minutes, created_at, updated_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + _, err = db.ExecContext(ctx, `INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_seconds, created_at, updated_at) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, policyID, "Policy "+suffix, 30, true, 3, 60, now, now) if err != nil { t.Fatalf("insertCertPrereqs: create renewal_policy failed: %v", err) diff --git a/internal/repository/postgres/seed_test.go b/internal/repository/postgres/seed_test.go new file mode 100644 index 0000000..3249d2f --- /dev/null +++ b/internal/repository/postgres/seed_test.go @@ -0,0 +1,246 @@ +// Integration tests for the U-3 schema-vs-seed coupling fix. +// +// Pre-U-3 the deploy compose stack mounted both a hand-curated subset of +// `migrations/*.up.sql` and `seed.sql` into postgres +// `/docker-entrypoint-initdb.d/`. Postgres applied them at initdb time. +// When `seed.sql` was updated to reference columns added by migrations +// *after* the mounted cutoff (e.g., `policy_rules.severity` from +// `000013_policy_rule_severity.up.sql`), initdb crashed during the seed +// step and the container was reported `unhealthy` indefinitely. +// +// Post-U-3 the schema is built EXCLUSIVELY by the server at startup via +// internal/repository/postgres.RunMigrations + RunSeed. These tests pin +// that contract: RunSeed must complete without error against a freshly +// migrated database, and re-application must be idempotent so server +// restarts don't double-insert. +// +// Skipped under -short to keep CI fast lanes green; the integration lane +// runs them via the testcontainers harness. +package postgres_test + +import ( + "context" + "database/sql" + "testing" + + "github.com/shankar0123/certctl/internal/repository/postgres" +) + +// TestRunSeed_AppliesIdempotently verifies the U-3 contract that RunSeed +// can be called repeatedly against a populated database without error and +// without producing duplicate rows. The server invokes RunSeed on EVERY +// boot (it has no migration-state table to skip from), so any non- +// idempotent INSERT in seed.sql would crash the container loop on the +// second start. +// +// The assertion uses renewal_policies.id='rp-default' as a witness — that +// row is the most-referenced FK target in the seed (it's the default +// renewal policy attached to every certificate that doesn't override). +// If the seed double-inserted, we'd see SQLSTATE 23505 from the second +// RunSeed call. If the seed silently ON CONFLICT-DO-NOTHING'd as +// designed, the row count stays at exactly 1. +func TestRunSeed_AppliesIdempotently(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + ctx := context.Background() + + migrationsPath := findMigrationsDir() + + // Apply the seed twice — second call simulates a server restart on a + // populated database. Both must succeed; pre-U-3 the second call + // would fail with 23505 if any INSERT lacked ON CONFLICT. + if err := postgres.RunSeed(db, migrationsPath); err != nil { + t.Fatalf("RunSeed (first call) returned error: %v", err) + } + if err := postgres.RunSeed(db, migrationsPath); err != nil { + t.Fatalf("RunSeed (second call — idempotency check) returned error: %v\n"+ + "This means the seed produced a duplicate row; every INSERT in seed.sql "+ + "must use ON CONFLICT (id) DO NOTHING because the server applies the "+ + "seed on EVERY start.", err) + } + + // Witness check: rp-default is the renewal policy every cert defaults + // to. Exactly one row must exist after two seed applications. + var count int + err := db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM renewal_policies WHERE id = 'rp-default'`, + ).Scan(&count) + if err != nil { + t.Fatalf("witness query failed: %v", err) + } + if count != 1 { + t.Errorf("renewal_policies WHERE id='rp-default' returned %d rows after two RunSeed calls; want exactly 1 (ON CONFLICT idempotency contract)", count) + } +} + +// TestRunSeed_MissingFileIsNoOp verifies the fail-soft contract documented +// on RunSeed: an operator who deletes seed.sql for custom packaging (CI +// pipelines that bake their own seeds, cert-manager managed deployments) +// must still get a healthy server boot. RunSeed returning nil for a +// missing file is the only way to hold this contract — returning an error +// would force every minimal-image deployment to ship the seed file just +// to satisfy a no-op load. +// +// We point at a directory that exists (empty temp dir) but contains no +// seed.sql. RunSeed must return nil silently. +func TestRunSeed_MissingFileIsNoOp(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + // Use a brand-new empty directory so seed.sql is unambiguously absent. + emptyDir := t.TempDir() + + // Pass a nil *sql.DB on purpose — RunSeed must short-circuit on the + // missing file BEFORE touching the DB. If the implementation ever + // regresses and tries to db.Exec(string(content)) with nil content, + // this will surface as a nil-deref instead of a silent corruption. + var db *sql.DB + if err := postgres.RunSeed(db, emptyDir); err != nil { + t.Fatalf("RunSeed against an empty directory should return nil; got: %v", err) + } +} + +// TestRunDemoSeed_AppliesIdempotently mirrors the RunSeed idempotency +// contract for the demo overlay. The compose demo stack +// (deploy/docker-compose.demo.yml) sets CERTCTL_DEMO_SEED=true; the +// server applies seed_demo.sql at every boot. Same constraint as the +// baseline seed: if any INSERT lacks ON CONFLICT, the server will +// crash-loop on restart. +// +// Witness: seed_demo.sql inserts t-platform into the teams table at line +// 11. That row is referenced by every demo-team-owned certificate, so +// duplicate-insertion would block the entire demo on restart. +func TestRunDemoSeed_AppliesIdempotently(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + ctx := context.Background() + + migrationsPath := findMigrationsDir() + + // Order matters — RunSeed must run first so the FK targets the demo + // seed depends on (rp-* renewal policies, etc.) exist before the + // demo INSERTs run. This mirrors the order in cmd/server/main.go. + if err := postgres.RunSeed(db, migrationsPath); err != nil { + t.Fatalf("RunSeed prerequisite failed: %v", err) + } + + if err := postgres.RunDemoSeed(db, migrationsPath); err != nil { + t.Fatalf("RunDemoSeed (first call) returned error: %v", err) + } + if err := postgres.RunDemoSeed(db, migrationsPath); err != nil { + t.Fatalf("RunDemoSeed (second call — idempotency check) returned error: %v", err) + } + + var count int + err := db.QueryRowContext(ctx, + `SELECT COUNT(*) FROM teams WHERE id = 't-platform'`, + ).Scan(&count) + if err != nil { + t.Fatalf("witness query failed: %v", err) + } + if count != 1 { + t.Errorf("teams WHERE id='t-platform' returned %d rows after two RunDemoSeed calls; want exactly 1", count) + } +} + +// TestMigration000017_RetryIntervalRename verifies the U-3 ride-along +// column rename: renewal_policies.retry_interval_minutes → +// retry_interval_seconds (cat-o-retry_interval_unit_mismatch). The unit +// was always seconds in practice — the column name lied. Migration 000017 +// renames the column with a DO $$ guard so re-application is safe. +// +// After all migrations have been applied (which the test harness does in +// freshSchema), the new column must exist and the old column must NOT. +// information_schema.columns is the source of truth for both checks. +func TestMigration000017_RetryIntervalRename(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + ctx := context.Background() + + // Helper — true iff the named column exists on renewal_policies. + hasColumn := func(name string) bool { + t.Helper() + var n int + err := db.QueryRowContext(ctx, ` + SELECT COUNT(*) FROM information_schema.columns + WHERE table_name = 'renewal_policies' AND column_name = $1 + `, name).Scan(&n) + if err != nil { + t.Fatalf("information_schema query for column %q failed: %v", name, err) + } + return n > 0 + } + + if !hasColumn("retry_interval_seconds") { + t.Error("renewal_policies.retry_interval_seconds is missing — migration 000017 did not apply, or it was applied before the rename block") + } + if hasColumn("retry_interval_minutes") { + t.Error("renewal_policies.retry_interval_minutes still exists — the rename in migration 000017 must drop the old name (cat-o-retry_interval_unit_mismatch)") + } +} + +// TestMigration000017_NotificationCreatedAt verifies the U-3 ride-along +// column add: notification_events.created_at NOT NULL DEFAULT NOW() +// (cat-o-notification_created_at_dead_field). Pre-U-3 the Go domain had +// the field but the DB lacked the column, so the JSON API serialised +// 0001-01-01. +func TestMigration000017_NotificationCreatedAt(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + ctx := context.Background() + + var dataType, isNullable, columnDefault sql.NullString + err := db.QueryRowContext(ctx, ` + SELECT data_type, is_nullable, column_default + FROM information_schema.columns + WHERE table_name = 'notification_events' AND column_name = 'created_at' + `).Scan(&dataType, &isNullable, &columnDefault) + if err != nil { + t.Fatalf("information_schema query for created_at failed: %v\n"+ + "Migration 000017 should have added notification_events.created_at TIMESTAMPTZ NOT NULL DEFAULT NOW().", err) + } + + if dataType.String != "timestamp with time zone" { + t.Errorf("notification_events.created_at data_type = %q, want %q", + dataType.String, "timestamp with time zone") + } + if isNullable.String != "NO" { + t.Errorf("notification_events.created_at is_nullable = %q, want NO (the column must be NOT NULL so legacy rows get the DEFAULT)", + isNullable.String) + } + if columnDefault.String == "" { + t.Error("notification_events.created_at has no DEFAULT — legacy rows added before migration 000017 would fail the NOT NULL gate without one") + } +} + +// TestMigration000017_HealthCheckOrphansDropped verifies the U-3 +// ride-along column drop: network_scan_targets lost the orphan +// health_check_enabled / health_check_interval_seconds columns +// (cat-o-health_check_column_orphans). These were declared by an early +// migration but never wired into Go code — schema noise that confused +// operators reading raw SQL. Migration 000017 drops them. +func TestMigration000017_HealthCheckOrphansDropped(t *testing.T) { + tdb := getTestDB(t) + db := tdb.freshSchema(t) + ctx := context.Background() + + hasColumn := func(name string) bool { + t.Helper() + var n int + err := db.QueryRowContext(ctx, ` + SELECT COUNT(*) FROM information_schema.columns + WHERE table_name = 'network_scan_targets' AND column_name = $1 + `, name).Scan(&n) + if err != nil { + t.Fatalf("information_schema query for column %q failed: %v", name, err) + } + return n > 0 + } + + for _, col := range []string{"health_check_enabled", "health_check_interval_seconds"} { + if hasColumn(col) { + t.Errorf("network_scan_targets.%s still exists — migration 000017 must drop it (cat-o-health_check_column_orphans)", col) + } + } +} diff --git a/migrations/000017_db_coupling_cleanup.down.sql b/migrations/000017_db_coupling_cleanup.down.sql new file mode 100644 index 0000000..f8b2d0e --- /dev/null +++ b/migrations/000017_db_coupling_cleanup.down.sql @@ -0,0 +1,46 @@ +-- Migration 000017 (down): reverse the U-3 bundle. +-- +-- Operators almost certainly never need this — each block in the up +-- migration was a strict improvement (column-name truth, dead-schema +-- removal, missing-column add). The down migration exists for +-- documentation and disaster-recovery completeness only. +-- +-- Idempotent: each block uses the standard IF EXISTS / IF NOT EXISTS +-- guards plus a DO $$ guard on the rename to handle re-application. +-- Reverses the up migration's blocks in reverse order. + +-- (3) Re-add the orphan health_check columns at their original defaults. +-- +-- Note: re-adding does NOT restore the auto-health-check feature — +-- that code was never written. The column values revert to the +-- DEFAULT FALSE / 300 baseline that operators saw pre-U-3. +ALTER TABLE network_scan_targets + ADD COLUMN IF NOT EXISTS health_check_enabled BOOLEAN DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS health_check_interval_seconds INTEGER DEFAULT 300; + +-- (2) Drop the notification_events.created_at column. +-- +-- This re-introduces the cat-o-notification_created_at_dead_field bug +-- (Go field with no DB column → API serializes 0001-01-01). Only roll +-- back if you've also rolled back the Go-side INSERT path that sets +-- created_at, otherwise INSERTs will fail with "column created_at does +-- not exist". +ALTER TABLE notification_events + DROP COLUMN IF EXISTS created_at; + +-- (1) Rename the renewal_policies column back to the misleading name. +-- +-- Re-introduces cat-o-retry_interval_unit_mismatch. Operators running +-- raw SQL revert to the 60x confusion. No data conversion (values are +-- still seconds; the column label lies again). +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'renewal_policies' + AND column_name = 'retry_interval_seconds' + ) THEN + ALTER TABLE renewal_policies + RENAME COLUMN retry_interval_seconds TO retry_interval_minutes; + END IF; +END $$; diff --git a/migrations/000017_db_coupling_cleanup.up.sql b/migrations/000017_db_coupling_cleanup.up.sql new file mode 100644 index 0000000..e0d344d --- /dev/null +++ b/migrations/000017_db_coupling_cleanup.up.sql @@ -0,0 +1,81 @@ +-- Migration 000017: DB coupling cleanup (U-3 bundle). +-- +-- Closes three audit findings that share the migrations/ surface and the +-- "schema vs Go vs label drifts in different directions" pattern: +-- +-- * cat-o-retry_interval_unit_mismatch (P1): +-- renewal_policies.retry_interval_minutes column stored seconds, named +-- minutes. Operators running raw SQL got 60x confusion. +-- +-- * cat-o-notification_created_at_dead_field (P2): +-- internal/domain/notification.go::NotificationEvent.CreatedAt was +-- tagged json:"created_at" with no DB column behind it. Every API +-- response serialized 0001-01-01T00:00:00Z. Visible zero-value +-- timestamp on every notification row in the dashboard. +-- +-- * cat-o-health_check_column_orphans (P1): +-- migration 000011 added network_scan_targets.health_check_enabled + +-- .health_check_interval_seconds. No Go field decoded either column; +-- no handler exposed them; OpenAPI schema didn't carry them. The +-- auto-health-check feature was never wired through. Removing dead +-- schema is cheaper than completing dead code; if the feature gets +-- revived, a future migration can re-add the columns alongside the +-- Go-side wiring. +-- +-- Idempotency: RunMigrations at internal/repository/postgres/db.go has +-- no applied-tracking table — every server restart re-applies every +-- migration in sequence. Each block in this file MUST be safe to re-run +-- on a database that has already had it applied. The RENAME COLUMN in +-- block (1) is wrapped in a DO $$ guard that checks information_schema +-- before renaming; the ADD COLUMN in (2) and the DROP COLUMNs in (3) +-- use the standard IF NOT EXISTS / IF EXISTS clauses. +-- +-- See the U-3 closure entry in +-- coverage-gap-audit-2026-04-24-v5/unified-audit.md and CHANGELOG.md +-- for the full rationale, the bundled-fix list, and the architectural +-- shift to runtime-only migration application. + +-- (1) cat-o-retry_interval_unit_mismatch — rename column to match unit. +-- +-- The values stored in this column have always been seconds (validator +-- at internal/service/renewal_policy.go enforces a [60, 86400] range +-- inclusive — 60 seconds to 24 hours, unambiguously seconds). The +-- column name was the bug; data conversion is a no-op. The Go field +-- has always been tagged json:"retry_interval_seconds", so the API +-- shape is unchanged. +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'renewal_policies' + AND column_name = 'retry_interval_minutes' + ) THEN + ALTER TABLE renewal_policies + RENAME COLUMN retry_interval_minutes TO retry_interval_seconds; + END IF; +END $$; + +-- (2) cat-o-notification_created_at_dead_field — add the missing column. +-- +-- DEFAULT NOW() back-fills existing rows with the migration apply +-- timestamp. Acceptable trade-off: those rows had no real CreatedAt +-- info anyway (the field was a Go-only zero-value), and approximating +-- them with the migration time gives the dashboard a usable rendering +-- instead of '0001-01-01'. NOT NULL is enforced because the repo +-- INSERT path will set CreatedAt on every new row post-fix. +ALTER TABLE notification_events + ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(); + +-- (3) cat-o-health_check_column_orphans — drop unwired columns. +-- +-- migrations/000011_health_checks.up.sql added these two columns with +-- the intent of wiring auto-health-checks for network-scan-discovered +-- endpoints. The Go side was never written; no handler reads or writes +-- them; the OpenAPI NetworkScanTarget schema doesn't expose them. The +-- columns have been carrying their default values (false / 300) on +-- every row since shipping. Dropping them removes dead schema; the +-- network_scan_targets row size shrinks marginally and operators stop +-- seeing flag/interval columns that don't actually do anything. +ALTER TABLE network_scan_targets + DROP COLUMN IF EXISTS health_check_enabled, + DROP COLUMN IF EXISTS health_check_interval_seconds; diff --git a/migrations/seed.sql b/migrations/seed.sql index 50d7cc1..e420054 100644 --- a/migrations/seed.sql +++ b/migrations/seed.sql @@ -1,7 +1,7 @@ -- Seed data for certificate control plane -- Default renewal policy -INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_minutes, alert_thresholds_days) +INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_seconds, alert_thresholds_days) VALUES ( 'rp-default', 'default', diff --git a/migrations/seed_demo.sql b/migrations/seed_demo.sql index 3b6239b..232ee61 100644 --- a/migrations/seed_demo.sql +++ b/migrations/seed_demo.sql @@ -29,7 +29,7 @@ ON CONFLICT (id) DO NOTHING; -- ============================================================ -- 2. Policies -- ============================================================ -INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_minutes, alert_thresholds_days, created_at, updated_at) VALUES +INSERT INTO renewal_policies (id, name, renewal_window_days, auto_renew, max_retries, retry_interval_seconds, alert_thresholds_days, created_at, updated_at) VALUES ('rp-standard', 'Standard 30-day', 30, true, 3, 60, '[30, 14, 7, 0]'::jsonb, NOW() - INTERVAL '180 days', NOW() - INTERVAL '180 days'), ('rp-urgent', 'Urgent 14-day', 14, true, 5, 30, '[14, 7, 3, 0]'::jsonb, NOW() - INTERVAL '180 days', NOW() - INTERVAL '180 days'), ('rp-manual', 'Manual Only', 30, false, 0, 0, '[30, 14, 7, 0]'::jsonb, NOW() - INTERVAL '180 days', NOW() - INTERVAL '180 days')