Merge branch 'fix/u1-postgres-password-volume-trap-diagnostic'

This commit is contained in:
shankar0123
2026-04-24 23:21:33 +00:00
4 changed files with 239 additions and 2 deletions
+2
View File
@@ -122,6 +122,8 @@ The `volumes` section mounts 10 migration files into PostgreSQL's init directory
**Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe. **Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe.
**Stateful volume — first-boot password binding (U-1).** The same "first boot only" semantics that govern migration scripts also govern `POSTGRES_PASSWORD`. The official `postgres` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that pass is the only time `POSTGRES_PASSWORD` is written into `pg_authid`. On every subsequent boot, the postgres container ignores the env var and authenticates against whatever password was baked into the data directory on the original `up`. Editing `POSTGRES_PASSWORD` in `.env` after a successful first boot therefore only updates the **certctl-server** container's `CERTCTL_DATABASE_URL` — postgres still expects the previous password, and the server fails to ping with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). The certctl-server container surfaces this case explicitly: when SQLSTATE 28P01 fires at startup, the wrap text in `internal/repository/postgres/db.go::wrapPingError` points operators at the two remediation paths — destructive volume teardown via `docker compose -f deploy/docker-compose.yml down -v && up -d --build`, or non-destructive in-place rotation via `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` followed by a server restart with the matching `POSTGRES_PASSWORD`. Use the destructive path on the demo / first-time setup; use the non-destructive path on any environment that holds data you want to keep.
#### certctl Server #### certctl Server
```yaml ```yaml
+2
View File
@@ -60,6 +60,8 @@ cp deploy/.env.example deploy/.env
docker compose -f deploy/docker-compose.yml up -d --build docker compose -f deploy/docker-compose.yml up -d --build
``` ```
> **Warning:** Edit `POSTGRES_PASSWORD` *before* the very first `docker compose up`. Postgres seeds the password into its data directory only on first boot of an empty volume — after that, the password is baked into `pg_authid` and the env var is ignored. If you boot once with the default and later change `POSTGRES_PASSWORD` in `.env`, the certctl-server container picks up the new value but postgres still authenticates against the old one, and the server logs `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). Two ways out: tear down the volume with `docker compose -f deploy/docker-compose.yml down -v` (this **deletes all data**) and bring up fresh, or rotate non-destructively with `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` and then restart certctl-server with the matching `POSTGRES_PASSWORD`.
### Docker Compose Environments ### Docker Compose Environments
The `deploy/` directory contains four compose files for different use cases: The `deploy/` directory contains four compose files for different use cases:
+67 -2
View File
@@ -2,14 +2,26 @@ package postgres
import ( import (
"database/sql" "database/sql"
"errors"
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
_ "github.com/lib/pq" "github.com/lib/pq"
) )
// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 —
// invalid_password — emitted by PostgreSQL when the client presents
// credentials that don't match pg_authid. Defined locally because the
// lib/pq package does not export named constants for SQLSTATE codes (it
// only exposes the typed string alias pq.ErrorCode and a name-lookup map
// at runtime). Pinned as a string constant rather than a pq.ErrorCode
// literal so the contract is grep-able from operator-facing log lines.
//
// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html
const pgErrInvalidPassword = "28P01"
// NewDB opens a PostgreSQL database connection and sets up connection pooling. // NewDB opens a PostgreSQL database connection and sets up connection pooling.
func NewDB(connStr string) (*sql.DB, error) { func NewDB(connStr string) (*sql.DB, error) {
db, err := sql.Open("postgres", connStr) db, err := sql.Open("postgres", connStr)
@@ -23,12 +35,65 @@ func NewDB(connStr string) (*sql.DB, error) {
// Ping to verify connection // Ping to verify connection
if err := db.Ping(); err != nil { if err := db.Ping(); err != nil {
return nil, fmt.Errorf("failed to ping database: %w", err) return nil, wrapPingError(err)
} }
return db, nil return db, nil
} }
// wrapPingError converts a db.Ping() failure into an operator-friendly
// diagnostic. The default wrap is the original opaque
// `"failed to ping database: <inner>"` shape. The exception is SQLSTATE 28P01
// (invalid_password): when postgres rejects the server's credentials we emit
// extended guidance that names the most common operator misstep — editing
// POSTGRES_PASSWORD in `.env` after the postgres named volume has already
// been initialized — and lists both the destructive (`docker compose down -v`)
// and non-destructive (`ALTER ROLE`) remediations.
//
// U-1 (P1, GitHub #10): closes the audit-flagged
// cat-u-quickstart_postgres_password_volume_trap finding. The postgres
// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty;
// on subsequent boots the password baked into pg_authid on first boot wins
// over whatever the env var carries, so the env-vs-pg_authid divergence is
// intrinsic to how the postgres image bootstraps and cannot be fixed by us
// upstream of pg_authid. The ergonomic answer is to surface a clear
// diagnostic at the failure site so operators don't waste an hour on
// "is my password right" before discovering the volume needs to be torn
// down (or the role's password rotated in-place).
//
// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using
// errors.As(err, &*pq.Error) on the returned value continue to work; this
// matches the audit's "no substring matching on err.Error()" requirement
// from the M-1 sentinel-error migration.
//
// Returns nil when err is nil so callers can defensively pipe through this
// helper without an extra branch.
func wrapPingError(err error) error {
if err == nil {
return nil
}
var pqErr *pq.Error
if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword {
return fmt.Errorf(
"failed to ping database: postgres rejected the configured credentials "+
"(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+
"on a docker-compose deploy, the postgres container's data volume still "+
"holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+
"only on first boot of a fresh data dir, so editing the env var after that "+
"point updates only the certctl-server container. Reset destructively with "+
"`docker compose -f deploy/docker-compose.yml down -v && "+
"docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+
"all data in the postgres volume), or non-destructively with "+
"`docker compose -f deploy/docker-compose.yml exec postgres "+
"psql -U certctl -c \"ALTER ROLE certctl PASSWORD '<new-password>';\"` "+
"and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w",
pgErrInvalidPassword, err)
}
return fmt.Errorf("failed to ping database: %w", err)
}
// RunMigrations reads and executes SQL migration files from a directory. // RunMigrations reads and executes SQL migration files from a directory.
func RunMigrations(db *sql.DB, migrationsPath string) error { func RunMigrations(db *sql.DB, migrationsPath string) error {
// Check if migrations directory exists // Check if migrations directory exists
+168
View File
@@ -0,0 +1,168 @@
// Internal-package tests for db.go — covers the diagnostic dispatch in
// wrapPingError. Lives in `package postgres` (not `postgres_test`) so it can
// call the unexported helper directly without exposing it on the API surface.
//
// Sibling integration tests in this directory live in `package postgres_test`
// (testcontainers-driven, schema-per-test). They exercise the live-DB
// happy path; this file owns the unit-level diagnostic dispatch and runs in
// `-short` mode without spinning up postgres.
//
// U-1 (P1, GitHub #10): closes the audit-flagged
// cat-u-quickstart_postgres_password_volume_trap finding by pinning the
// post-fix wrap-text contract for `db.Ping()` failures. Pre-U-1 every Ping
// error was wrapped with the same opaque `"failed to ping database: %w"`,
// so an operator who edited POSTGRES_PASSWORD after first-boot saw only
// `pq: password authentication failed for user "certctl"` in the server
// log with no pointer to the actual cause (postgres data dir retains the
// initial password from first-boot initdb; subsequent boots ignore the env
// var). Post-U-1 the SQLSTATE-28P01 path emits a multi-line diagnostic
// pointing at the down -v / ALTER ROLE remediation; non-auth failures
// retain the original wrap shape so verbose noise does not bleed into
// transient connection-refused / timeout paths.
package postgres
import (
"errors"
"strings"
"testing"
"github.com/lib/pq"
)
// TestWrapPingError_AuthFailureGuidance asserts the diagnostic wrap fires on
// SQLSTATE 28P01 (invalid_password) and contains all three contract elements:
// the SQLSTATE code (so operators can grep), the down-v destructive
// remediation, and the ALTER ROLE non-destructive remediation. Also asserts
// the wrap chain still satisfies errors.As(err, &*pq.Error) so callers that
// programmatically inspect the underlying postgres error code keep working.
func TestWrapPingError_AuthFailureGuidance(t *testing.T) {
t.Parallel()
original := &pq.Error{
Code: pq.ErrorCode("28P01"),
Message: `password authentication failed for user "certctl"`,
}
wrapped := wrapPingError(original)
if wrapped == nil {
t.Fatal("wrapPingError returned nil for a non-nil input")
}
got := wrapped.Error()
// Contract elements — the operator-facing string is what we ship.
wantSubstrings := []string{
"SQLSTATE 28P01", // operators grep on this
"POSTGRES_PASSWORD", // names the variable that traps
"first boot", // the mechanism in plain language
"down -v", // destructive remediation
"ALTER ROLE", // non-destructive remediation
}
for _, s := range wantSubstrings {
if !strings.Contains(got, s) {
t.Errorf("wrap text missing %q\ngot: %s", s, got)
}
}
// Wrap chain must still expose the underlying *pq.Error for callers
// that want to inspect Code / Detail / Constraint fields. Pre-fix
// callers used errors.As(err, &pqErr) on the unwrapped Ping result;
// the new wrap is fmt.Errorf("...%w", err) so errors.As must walk it.
var pqErr *pq.Error
if !errors.As(wrapped, &pqErr) {
t.Fatalf("errors.As did not extract *pq.Error from wrapped chain: %v", wrapped)
}
if pqErr.Code != "28P01" {
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "28P01")
}
}
// TestWrapPingError_NonAuthErrorPreservesOriginalWrap guards against the
// guidance text bleeding into unrelated failure modes. SQLSTATE 08006
// (connection_failure) is the canonical non-auth case — server unreachable,
// TLS handshake failure, network drop. The wrap should be the original
// shape so transient-error log noise does not include the (now lengthy)
// volume-state remediation paragraph.
func TestWrapPingError_NonAuthErrorPreservesOriginalWrap(t *testing.T) {
t.Parallel()
original := &pq.Error{
Code: pq.ErrorCode("08006"),
Message: "connection refused",
}
wrapped := wrapPingError(original)
if wrapped == nil {
t.Fatal("wrapPingError returned nil for a non-nil input")
}
got := wrapped.Error()
// Original-wrap shape: prefix only, no guidance text.
const wantPrefix = "failed to ping database: "
if !strings.HasPrefix(got, wantPrefix) {
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
}
// Negative assertions: guidance text MUST NOT appear on non-auth paths.
mustNotContain := []string{
"SQLSTATE 08006", // we only call out 28P01 specifically
"POSTGRES_PASSWORD",
"down -v",
"ALTER ROLE",
}
for _, s := range mustNotContain {
if strings.Contains(got, s) {
t.Errorf("non-auth wrap leaked guidance substring %q\ngot: %s", s, got)
}
}
// Wrap chain still walks for errors.As — same contract as auth path.
var pqErr *pq.Error
if !errors.As(wrapped, &pqErr) {
t.Fatalf("errors.As did not extract *pq.Error from non-auth wrapped chain: %v", wrapped)
}
if pqErr.Code != "08006" {
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "08006")
}
}
// TestWrapPingError_NonPqErrorPreservesOriginalWrap guards the network-level
// case: a pre-handshake failure (TCP refused, DNS, TLS) returns a
// non-*pq.Error from db.Ping(). errors.As must return false, the helper
// must fall through to the generic wrap, and the chain must remain walkable.
func TestWrapPingError_NonPqErrorPreservesOriginalWrap(t *testing.T) {
t.Parallel()
original := errors.New("dial tcp 127.0.0.1:5432: connect: connection refused")
wrapped := wrapPingError(original)
if wrapped == nil {
t.Fatal("wrapPingError returned nil for a non-nil input")
}
got := wrapped.Error()
const wantPrefix = "failed to ping database: "
if !strings.HasPrefix(got, wantPrefix) {
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
}
if strings.Contains(got, "SQLSTATE") || strings.Contains(got, "POSTGRES_PASSWORD") {
t.Errorf("network-level wrap leaked SQLSTATE/postgres guidance\ngot: %s", got)
}
if !errors.Is(wrapped, original) {
t.Errorf("errors.Is did not walk to original sentinel: %v", wrapped)
}
}
// TestWrapPingError_NilReturnsNil — defensive contract: if Ping returned nil
// (no failure), the helper must not synthesize a fake error. This isn't on
// the documented call path (NewDB only invokes wrapPingError inside the
// `if err != nil` branch), but pinning it prevents a future refactor from
// regressing the contract silently.
func TestWrapPingError_NilReturnsNil(t *testing.T) {
t.Parallel()
if got := wrapPingError(nil); got != nil {
t.Errorf("wrapPingError(nil) = %v, want nil", got)
}
}