mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 21:31:34 +00:00
Merge branch 'fix/u1-postgres-password-volume-trap-diagnostic'
This commit is contained in:
@@ -122,6 +122,8 @@ The `volumes` section mounts 10 migration files into PostgreSQL's init directory
|
||||
|
||||
**Expert note:** The numbered prefix pattern (`001_`, `002_`, ..., `020_`) ensures deterministic execution order. All migrations use `IF NOT EXISTS` and `ON CONFLICT DO NOTHING` for idempotency, so re-running them against an existing database is safe.
|
||||
|
||||
**Stateful volume — first-boot password binding (U-1).** The same "first boot only" semantics that govern migration scripts also govern `POSTGRES_PASSWORD`. The official `postgres` image runs `initdb` exactly once — when `/var/lib/postgresql/data` is empty — and that pass is the only time `POSTGRES_PASSWORD` is written into `pg_authid`. On every subsequent boot, the postgres container ignores the env var and authenticates against whatever password was baked into the data directory on the original `up`. Editing `POSTGRES_PASSWORD` in `.env` after a successful first boot therefore only updates the **certctl-server** container's `CERTCTL_DATABASE_URL` — postgres still expects the previous password, and the server fails to ping with `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). The certctl-server container surfaces this case explicitly: when SQLSTATE 28P01 fires at startup, the wrap text in `internal/repository/postgres/db.go::wrapPingError` points operators at the two remediation paths — destructive volume teardown via `docker compose -f deploy/docker-compose.yml down -v && up -d --build`, or non-destructive in-place rotation via `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` followed by a server restart with the matching `POSTGRES_PASSWORD`. Use the destructive path on the demo / first-time setup; use the non-destructive path on any environment that holds data you want to keep.
|
||||
|
||||
#### certctl Server
|
||||
|
||||
```yaml
|
||||
|
||||
@@ -60,6 +60,8 @@ cp deploy/.env.example deploy/.env
|
||||
docker compose -f deploy/docker-compose.yml up -d --build
|
||||
```
|
||||
|
||||
> **Warning:** Edit `POSTGRES_PASSWORD` *before* the very first `docker compose up`. Postgres seeds the password into its data directory only on first boot of an empty volume — after that, the password is baked into `pg_authid` and the env var is ignored. If you boot once with the default and later change `POSTGRES_PASSWORD` in `.env`, the certctl-server container picks up the new value but postgres still authenticates against the old one, and the server logs `pq: password authentication failed for user "certctl"` (SQLSTATE 28P01). Two ways out: tear down the volume with `docker compose -f deploy/docker-compose.yml down -v` (this **deletes all data**) and bring up fresh, or rotate non-destructively with `docker compose -f deploy/docker-compose.yml exec postgres psql -U certctl -c "ALTER ROLE certctl PASSWORD '<new>';"` and then restart certctl-server with the matching `POSTGRES_PASSWORD`.
|
||||
|
||||
### Docker Compose Environments
|
||||
|
||||
The `deploy/` directory contains four compose files for different use cases:
|
||||
|
||||
@@ -2,14 +2,26 @@ package postgres
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 —
|
||||
// invalid_password — emitted by PostgreSQL when the client presents
|
||||
// credentials that don't match pg_authid. Defined locally because the
|
||||
// lib/pq package does not export named constants for SQLSTATE codes (it
|
||||
// only exposes the typed string alias pq.ErrorCode and a name-lookup map
|
||||
// at runtime). Pinned as a string constant rather than a pq.ErrorCode
|
||||
// literal so the contract is grep-able from operator-facing log lines.
|
||||
//
|
||||
// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html
|
||||
const pgErrInvalidPassword = "28P01"
|
||||
|
||||
// NewDB opens a PostgreSQL database connection and sets up connection pooling.
|
||||
func NewDB(connStr string) (*sql.DB, error) {
|
||||
db, err := sql.Open("postgres", connStr)
|
||||
@@ -23,12 +35,65 @@ func NewDB(connStr string) (*sql.DB, error) {
|
||||
|
||||
// Ping to verify connection
|
||||
if err := db.Ping(); err != nil {
|
||||
return nil, fmt.Errorf("failed to ping database: %w", err)
|
||||
return nil, wrapPingError(err)
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// wrapPingError converts a db.Ping() failure into an operator-friendly
|
||||
// diagnostic. The default wrap is the original opaque
|
||||
// `"failed to ping database: <inner>"` shape. The exception is SQLSTATE 28P01
|
||||
// (invalid_password): when postgres rejects the server's credentials we emit
|
||||
// extended guidance that names the most common operator misstep — editing
|
||||
// POSTGRES_PASSWORD in `.env` after the postgres named volume has already
|
||||
// been initialized — and lists both the destructive (`docker compose down -v`)
|
||||
// and non-destructive (`ALTER ROLE`) remediations.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding. The postgres
|
||||
// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty;
|
||||
// on subsequent boots the password baked into pg_authid on first boot wins
|
||||
// over whatever the env var carries, so the env-vs-pg_authid divergence is
|
||||
// intrinsic to how the postgres image bootstraps and cannot be fixed by us
|
||||
// upstream of pg_authid. The ergonomic answer is to surface a clear
|
||||
// diagnostic at the failure site so operators don't waste an hour on
|
||||
// "is my password right" before discovering the volume needs to be torn
|
||||
// down (or the role's password rotated in-place).
|
||||
//
|
||||
// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using
|
||||
// errors.As(err, &*pq.Error) on the returned value continue to work; this
|
||||
// matches the audit's "no substring matching on err.Error()" requirement
|
||||
// from the M-1 sentinel-error migration.
|
||||
//
|
||||
// Returns nil when err is nil so callers can defensively pipe through this
|
||||
// helper without an extra branch.
|
||||
func wrapPingError(err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var pqErr *pq.Error
|
||||
if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword {
|
||||
return fmt.Errorf(
|
||||
"failed to ping database: postgres rejected the configured credentials "+
|
||||
"(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+
|
||||
"on a docker-compose deploy, the postgres container's data volume still "+
|
||||
"holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+
|
||||
"only on first boot of a fresh data dir, so editing the env var after that "+
|
||||
"point updates only the certctl-server container. Reset destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml down -v && "+
|
||||
"docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+
|
||||
"all data in the postgres volume), or non-destructively with "+
|
||||
"`docker compose -f deploy/docker-compose.yml exec postgres "+
|
||||
"psql -U certctl -c \"ALTER ROLE certctl PASSWORD '<new-password>';\"` "+
|
||||
"and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w",
|
||||
pgErrInvalidPassword, err)
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to ping database: %w", err)
|
||||
}
|
||||
|
||||
// RunMigrations reads and executes SQL migration files from a directory.
|
||||
func RunMigrations(db *sql.DB, migrationsPath string) error {
|
||||
// Check if migrations directory exists
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
// Internal-package tests for db.go — covers the diagnostic dispatch in
|
||||
// wrapPingError. Lives in `package postgres` (not `postgres_test`) so it can
|
||||
// call the unexported helper directly without exposing it on the API surface.
|
||||
//
|
||||
// Sibling integration tests in this directory live in `package postgres_test`
|
||||
// (testcontainers-driven, schema-per-test). They exercise the live-DB
|
||||
// happy path; this file owns the unit-level diagnostic dispatch and runs in
|
||||
// `-short` mode without spinning up postgres.
|
||||
//
|
||||
// U-1 (P1, GitHub #10): closes the audit-flagged
|
||||
// cat-u-quickstart_postgres_password_volume_trap finding by pinning the
|
||||
// post-fix wrap-text contract for `db.Ping()` failures. Pre-U-1 every Ping
|
||||
// error was wrapped with the same opaque `"failed to ping database: %w"`,
|
||||
// so an operator who edited POSTGRES_PASSWORD after first-boot saw only
|
||||
// `pq: password authentication failed for user "certctl"` in the server
|
||||
// log with no pointer to the actual cause (postgres data dir retains the
|
||||
// initial password from first-boot initdb; subsequent boots ignore the env
|
||||
// var). Post-U-1 the SQLSTATE-28P01 path emits a multi-line diagnostic
|
||||
// pointing at the down -v / ALTER ROLE remediation; non-auth failures
|
||||
// retain the original wrap shape so verbose noise does not bleed into
|
||||
// transient connection-refused / timeout paths.
|
||||
package postgres
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// TestWrapPingError_AuthFailureGuidance asserts the diagnostic wrap fires on
|
||||
// SQLSTATE 28P01 (invalid_password) and contains all three contract elements:
|
||||
// the SQLSTATE code (so operators can grep), the down-v destructive
|
||||
// remediation, and the ALTER ROLE non-destructive remediation. Also asserts
|
||||
// the wrap chain still satisfies errors.As(err, &*pq.Error) so callers that
|
||||
// programmatically inspect the underlying postgres error code keep working.
|
||||
func TestWrapPingError_AuthFailureGuidance(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("28P01"),
|
||||
Message: `password authentication failed for user "certctl"`,
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Contract elements — the operator-facing string is what we ship.
|
||||
wantSubstrings := []string{
|
||||
"SQLSTATE 28P01", // operators grep on this
|
||||
"POSTGRES_PASSWORD", // names the variable that traps
|
||||
"first boot", // the mechanism in plain language
|
||||
"down -v", // destructive remediation
|
||||
"ALTER ROLE", // non-destructive remediation
|
||||
}
|
||||
for _, s := range wantSubstrings {
|
||||
if !strings.Contains(got, s) {
|
||||
t.Errorf("wrap text missing %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain must still expose the underlying *pq.Error for callers
|
||||
// that want to inspect Code / Detail / Constraint fields. Pre-fix
|
||||
// callers used errors.As(err, &pqErr) on the unwrapped Ping result;
|
||||
// the new wrap is fmt.Errorf("...%w", err) so errors.As must walk it.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "28P01" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "28P01")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonAuthErrorPreservesOriginalWrap guards against the
|
||||
// guidance text bleeding into unrelated failure modes. SQLSTATE 08006
|
||||
// (connection_failure) is the canonical non-auth case — server unreachable,
|
||||
// TLS handshake failure, network drop. The wrap should be the original
|
||||
// shape so transient-error log noise does not include the (now lengthy)
|
||||
// volume-state remediation paragraph.
|
||||
func TestWrapPingError_NonAuthErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := &pq.Error{
|
||||
Code: pq.ErrorCode("08006"),
|
||||
Message: "connection refused",
|
||||
}
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
// Original-wrap shape: prefix only, no guidance text.
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
|
||||
// Negative assertions: guidance text MUST NOT appear on non-auth paths.
|
||||
mustNotContain := []string{
|
||||
"SQLSTATE 08006", // we only call out 28P01 specifically
|
||||
"POSTGRES_PASSWORD",
|
||||
"down -v",
|
||||
"ALTER ROLE",
|
||||
}
|
||||
for _, s := range mustNotContain {
|
||||
if strings.Contains(got, s) {
|
||||
t.Errorf("non-auth wrap leaked guidance substring %q\ngot: %s", s, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap chain still walks for errors.As — same contract as auth path.
|
||||
var pqErr *pq.Error
|
||||
if !errors.As(wrapped, &pqErr) {
|
||||
t.Fatalf("errors.As did not extract *pq.Error from non-auth wrapped chain: %v", wrapped)
|
||||
}
|
||||
if pqErr.Code != "08006" {
|
||||
t.Errorf("extracted pq.Error.Code = %q, want %q", pqErr.Code, "08006")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NonPqErrorPreservesOriginalWrap guards the network-level
|
||||
// case: a pre-handshake failure (TCP refused, DNS, TLS) returns a
|
||||
// non-*pq.Error from db.Ping(). errors.As must return false, the helper
|
||||
// must fall through to the generic wrap, and the chain must remain walkable.
|
||||
func TestWrapPingError_NonPqErrorPreservesOriginalWrap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
original := errors.New("dial tcp 127.0.0.1:5432: connect: connection refused")
|
||||
|
||||
wrapped := wrapPingError(original)
|
||||
if wrapped == nil {
|
||||
t.Fatal("wrapPingError returned nil for a non-nil input")
|
||||
}
|
||||
|
||||
got := wrapped.Error()
|
||||
|
||||
const wantPrefix = "failed to ping database: "
|
||||
if !strings.HasPrefix(got, wantPrefix) {
|
||||
t.Errorf("expected prefix %q, got: %s", wantPrefix, got)
|
||||
}
|
||||
if strings.Contains(got, "SQLSTATE") || strings.Contains(got, "POSTGRES_PASSWORD") {
|
||||
t.Errorf("network-level wrap leaked SQLSTATE/postgres guidance\ngot: %s", got)
|
||||
}
|
||||
if !errors.Is(wrapped, original) {
|
||||
t.Errorf("errors.Is did not walk to original sentinel: %v", wrapped)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWrapPingError_NilReturnsNil — defensive contract: if Ping returned nil
|
||||
// (no failure), the helper must not synthesize a fake error. This isn't on
|
||||
// the documented call path (NewDB only invokes wrapPingError inside the
|
||||
// `if err != nil` branch), but pinning it prevents a future refactor from
|
||||
// regressing the contract silently.
|
||||
func TestWrapPingError_NilReturnsNil(t *testing.T) {
|
||||
t.Parallel()
|
||||
if got := wrapPingError(nil); got != nil {
|
||||
t.Errorf("wrapPingError(nil) = %v, want nil", got)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user