diff --git a/deploy/ENVIRONMENTS.md b/deploy/ENVIRONMENTS.md index e42da27..d240199 100644 --- a/deploy/ENVIRONMENTS.md +++ b/deploy/ENVIRONMENTS.md @@ -122,6 +122,8 @@ The `volumes` section mounts 10 migration files into PostgreSQL's init directory **Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe. +**Stateful volume — first-boot password binding (U-1).** The same "first boot only" semantics that govern migration scripts also govern `POSTGRES_PASSWORD`. The official `postgres` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that pass is the only time `POSTGRES_PASSWORD` is written into `pg_authid`. On every subsequent boot, the postgres container ignores the env var and authenticates against whatever password was baked into the data directory on the original `up`. Editing `POSTGRES_PASSWORD` in `.env` after a successful first boot therefore only updates the **certctl-server** container's `CERTCTL_DATABASE_URL` — postgres still expects the previous password, and the server fails to ping with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). The certctl-server container surfaces this case explicitly: when SQLSTATE 28P01 fires at startup, the wrap text in `internal/repository/postgres/db.go::wrapPingError` points operators at the two remediation paths — destructive volume teardown via `docker compose -f deploy/docker-compose.yml down -v && up -d --build`, or non-destructive in-place rotation via `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '';"` followed by a server restart with the matching `POSTGRES_PASSWORD`. Use the destructive path on the demo / first-time setup; use the non-destructive path on any environment that holds data you want to keep. + #### certctl Server ```yaml diff --git a/docs/quickstart.md b/docs/quickstart.md index 5ac4a30..83dce9e 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -60,6 +60,8 @@ cp deploy/.env.example deploy/.env docker compose -f deploy/docker-compose.yml up -d --build ``` +> **Warning:** Edit `POSTGRES_PASSWORD` *before* the very first `docker compose up`. Postgres seeds the password into its data directory only on first boot of an empty volume — after that, the password is baked into `pg_authid` and the env var is ignored. If you boot once with the default and later change `POSTGRES_PASSWORD` in `.env`, the certctl-server container picks up the new value but postgres still authenticates against the old one, and the server logs `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). Two ways out: tear down the volume with `docker compose -f deploy/docker-compose.yml down -v` (this **deletes all data**) and bring up fresh, or rotate non-destructively with `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '';"` and then restart certctl-server with the matching `POSTGRES_PASSWORD`. + ### Docker Compose Environments The `deploy/` directory contains four compose files for different use cases: diff --git a/internal/repository/postgres/db.go b/internal/repository/postgres/db.go index be16e6d..11b0f4b 100644 --- a/internal/repository/postgres/db.go +++ b/internal/repository/postgres/db.go @@ -2,14 +2,26 @@ package postgres import ( "database/sql" + "errors" "fmt" "os" "path/filepath" "strings" - _ "github.com/lib/pq" + "github.com/lib/pq" ) +// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 — +// invalid_password — emitted by PostgreSQL when the client presents +// credentials that don't match pg_authid. Defined locally because the +// lib/pq package does not export named constants for SQLSTATE codes (it +// only exposes the typed string alias pq.ErrorCode and a name-lookup map +// at runtime). Pinned as a string constant rather than a pq.ErrorCode +// literal so the contract is grep-able from operator-facing log lines. +// +// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html +const pgErrInvalidPassword = "28P01" + // NewDB opens a PostgreSQL database connection and sets up connection pooling. func NewDB(connStr string) (*sql.DB, error) { db, err := sql.Open("postgres", connStr) @@ -23,12 +35,65 @@ func NewDB(connStr string) (*sql.DB, error) { // Ping to verify connection if err := db.Ping(); err != nil { - return nil, fmt.Errorf("failed to ping database: %w", err) + return nil, wrapPingError(err) } return db, nil } +// wrapPingError converts a db.Ping() failure into an operator-friendly +// diagnostic. The default wrap is the original opaque +// `"failed to ping database: "` shape. The exception is SQLSTATE 28P01 +// (invalid_password): when postgres rejects the server's credentials we emit +// extended guidance that names the most common operator misstep — editing +// POSTGRES_PASSWORD in `.env` after the postgres named volume has already +// been initialized — and lists both the destructive (`docker compose down -v`) +// and non-destructive (`ALTER ROLE`) remediations. +// +// U-1 (P1, GitHub #10): closes the audit-flagged +// cat-u-quickstart_postgres_password_volume_trap finding. The postgres +// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty; +// on subsequent boots the password baked into pg_authid on first boot wins +// over whatever the env var carries, so the env-vs-pg_authid divergence is +// intrinsic to how the postgres image bootstraps and cannot be fixed by us +// upstream of pg_authid. The ergonomic answer is to surface a clear +// diagnostic at the failure site so operators don't waste an hour on +// "is my password right" before discovering the volume needs to be torn +// down (or the role's password rotated in-place). +// +// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using +// errors.As(err, &*pq.Error) on the returned value continue to work; this +// matches the audit's "no substring matching on err.Error()" requirement +// from the M-1 sentinel-error migration. +// +// Returns nil when err is nil so callers can defensively pipe through this +// helper without an extra branch. +func wrapPingError(err error) error { + if err == nil { + return nil + } + + var pqErr *pq.Error + if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword { + return fmt.Errorf( + "failed to ping database: postgres rejected the configured credentials "+ + "(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+ + "on a docker-compose deploy, the postgres container's data volume still "+ + "holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+ + "only on first boot of a fresh data dir, so editing the env var after that "+ + "point updates only the certctl-server container. Reset destructively with "+ + "`docker compose -f deploy/docker-compose.yml down -v && "+ + "docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+ + "all data in the postgres volume), or non-destructively with "+ + "`docker compose -f deploy/docker-compose.yml exec postgres "+ + "psql -U certctl -c \"ALTER ROLE certctl PASSWORD '';\"` "+ + "and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w", + pgErrInvalidPassword, err) + } + + return fmt.Errorf("failed to ping database: %w", err) +} + // RunMigrations reads and executes SQL migration files from a directory. func RunMigrations(db *sql.DB, migrationsPath string) error { // Check if migrations directory exists diff --git a/internal/repository/postgres/db_test.go b/internal/repository/postgres/db_test.go new file mode 100644 index 0000000..887d462 --- /dev/null +++ b/internal/repository/postgres/db_test.go @@ -0,0 +1,168 @@ +// Internal-package tests for db.go — covers the diagnostic dispatch in +// wrapPingError. Lives in `package postgres` (not `postgres_test`) so it can +// call the unexported helper directly without exposing it on the API surface. +// +// Sibling integration tests in this directory live in `package postgres_test` +// (testcontainers-driven, schema-per-test). They exercise the live-DB +// happy path; this file owns the unit-level diagnostic dispatch and runs in +// `-short` mode without spinning up postgres. +// +// U-1 (P1, GitHub #10): closes the audit-flagged +// cat-u-quickstart_postgres_password_volume_trap finding by pinning the +// post-fix wrap-text contract for `db.Ping()` failures. Pre-U-1 every Ping +// error was wrapped with the same opaque `"failed to ping database: %w"`, +// so an operator who edited POSTGRES_PASSWORD after first-boot saw only +// `pq: password authentication failed for user "certctl"` in the server +// log with no pointer to the actual cause (postgres data dir retains the +// initial password from first-boot initdb; subsequent boots ignore the env +// var). Post-U-1 the SQLSTATE-28P01 path emits a multi-line diagnostic +// pointing at the down -v / ALTER ROLE remediation; non-auth failures +// retain the original wrap shape so verbose noise does not bleed into +// transient connection-refused / timeout paths. +package postgres + +import ( + "errors" + "strings" + "testing" + + "github.com/lib/pq" +) + +// TestWrapPingError_AuthFailureGuidance asserts the diagnostic wrap fires on +// SQLSTATE 28P01 (invalid_password) and contains all three contract elements: +// the SQLSTATE code (so operators can grep), the down-v destructive +// remediation, and the ALTER ROLE non-destructive remediation. Also asserts +// the wrap chain still satisfies errors.As(err, &*pq.Error) so callers that +// programmatically inspect the underlying postgres error code keep working. +func TestWrapPingError_AuthFailureGuidance(t *testing.T) { + t.Parallel() + + original := &pq.Error{ + Code: pq.ErrorCode("28P01"), + Message: `password authentication failed for user "certctl"`, + } + + wrapped := wrapPingError(original) + if wrapped == nil { + t.Fatal("wrapPingError returned nil for a non-nil input") + } + + got := wrapped.Error() + + // Contract elements — the operator-facing string is what we ship. + wantSubstrings := []string{ + "SQLSTATE 28P01", // operators grep on this + "POSTGRES_PASSWORD", // names the variable that traps + "first boot", // the mechanism in plain language + "down -v", // destructive remediation + "ALTER ROLE", // non-destructive remediation + } + for _, s := range wantSubstrings { + if !strings.Contains(got, s) { + t.Errorf("wrap text missing %q\ngot: %s", s, got) + } + } + + // Wrap chain must still expose the underlying *pq.Error for callers + // that want to inspect Code / Detail / Constraint fields. Pre-fix + // callers used errors.As(err, &pqErr) on the unwrapped Ping result; + // the new wrap is fmt.Errorf("...%w", err) so errors.As must walk it. + var pqErr *pq.Error + if !errors.As(wrapped, &pqErr) { + t.Fatalf("errors.As did not extract *pq.Error from wrapped chain: %v", wrapped) + } + if pqErr.Code != "28P01" { + t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "28P01") + } +} + +// TestWrapPingError_NonAuthErrorPreservesOriginalWrap guards against the +// guidance text bleeding into unrelated failure modes. SQLSTATE 08006 +// (connection_failure) is the canonical non-auth case — server unreachable, +// TLS handshake failure, network drop. The wrap should be the original +// shape so transient-error log noise does not include the (now lengthy) +// volume-state remediation paragraph. +func TestWrapPingError_NonAuthErrorPreservesOriginalWrap(t *testing.T) { + t.Parallel() + + original := &pq.Error{ + Code: pq.ErrorCode("08006"), + Message: "connection refused", + } + + wrapped := wrapPingError(original) + if wrapped == nil { + t.Fatal("wrapPingError returned nil for a non-nil input") + } + + got := wrapped.Error() + + // Original-wrap shape: prefix only, no guidance text. + const wantPrefix = "failed to ping database: " + if !strings.HasPrefix(got, wantPrefix) { + t.Errorf("expected prefix %q, got: %s", wantPrefix, got) + } + + // Negative assertions: guidance text MUST NOT appear on non-auth paths. + mustNotContain := []string{ + "SQLSTATE 08006", // we only call out 28P01 specifically + "POSTGRES_PASSWORD", + "down -v", + "ALTER ROLE", + } + for _, s := range mustNotContain { + if strings.Contains(got, s) { + t.Errorf("non-auth wrap leaked guidance substring %q\ngot: %s", s, got) + } + } + + // Wrap chain still walks for errors.As — same contract as auth path. + var pqErr *pq.Error + if !errors.As(wrapped, &pqErr) { + t.Fatalf("errors.As did not extract *pq.Error from non-auth wrapped chain: %v", wrapped) + } + if pqErr.Code != "08006" { + t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "08006") + } +} + +// TestWrapPingError_NonPqErrorPreservesOriginalWrap guards the network-level +// case: a pre-handshake failure (TCP refused, DNS, TLS) returns a +// non-*pq.Error from db.Ping(). errors.As must return false, the helper +// must fall through to the generic wrap, and the chain must remain walkable. +func TestWrapPingError_NonPqErrorPreservesOriginalWrap(t *testing.T) { + t.Parallel() + + original := errors.New("dial tcp 127.0.0.1:5432: connect: connection refused") + + wrapped := wrapPingError(original) + if wrapped == nil { + t.Fatal("wrapPingError returned nil for a non-nil input") + } + + got := wrapped.Error() + + const wantPrefix = "failed to ping database: " + if !strings.HasPrefix(got, wantPrefix) { + t.Errorf("expected prefix %q, got: %s", wantPrefix, got) + } + if strings.Contains(got, "SQLSTATE") || strings.Contains(got, "POSTGRES_PASSWORD") { + t.Errorf("network-level wrap leaked SQLSTATE/postgres guidance\ngot: %s", got) + } + if !errors.Is(wrapped, original) { + t.Errorf("errors.Is did not walk to original sentinel: %v", wrapped) + } +} + +// TestWrapPingError_NilReturnsNil — defensive contract: if Ping returned nil +// (no failure), the helper must not synthesize a fake error. This isn't on +// the documented call path (NewDB only invokes wrapPingError inside the +// `if err != nil` branch), but pinning it prevents a future refactor from +// regressing the contract silently. +func TestWrapPingError_NilReturnsNil(t *testing.T) { + t.Parallel() + if got := wrapPingError(nil); got != nil { + t.Errorf("wrapPingError(nil) = %v, want nil", got) + } +}