mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 21:51:30 +00:00
21aeed4f4e
Phase 0 closure (Path B2, post-rewrite):
addlicense sweep — adds the canonical certctl LLC copyright + BUSL-1.1
SPDX header to every production Go file. Template:
// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1
Coverage: 338 / 338 production Go files (cmd/ + internal/, excluding
*_test.go and **/testdata/**). Pre-sweep coverage was 22 / 338 (6.5%);
post-sweep is 338 / 338 (100%).
Normalized 22 pre-existing legacy headers (`// Copyright (c) certctl`
+ `// SPDX-License-Identifier: BSL-1.1`) and 1 file using a
`Certctl Contributors` attribution. The legacy SPDX ID `BSL-1.1`
is non-standard; the official SPDX identifier for Business Source
License 1.1 is `BUSL-1.1` (capital U). All 338 files now share the
canonical form.
Generated via:
addlicense -c "certctl LLC" -y 2026 \
-f cowork/legal/copyright-header.tpl \
-ignore '**/testdata/**' -ignore '**/*_test.go' \
cmd/ internal/
Verification:
find cmd internal -name '*.go' -not -name '*_test.go' \
-not -path '*/testdata/*' \
-exec grep -L '^// Copyright 2026 certctl LLC' {} \; | wc -l
Returns: 0
gofmt clean. Header additions are comments only, no compile impact.
Closes: cowork/certctl-architecture-diligence-audit.html#fix-RED-4
431 lines
19 KiB
Go
431 lines
19 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package postgres
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/lib/pq"
|
|
)
|
|
|
|
// pgErrInvalidPassword is the SQLSTATE for class 28 / code 28P01 —
|
|
// invalid_password — emitted by PostgreSQL when the client presents
|
|
// credentials that don't match pg_authid. Defined locally because the
|
|
// lib/pq package does not export named constants for SQLSTATE codes (it
|
|
// only exposes the typed string alias pq.ErrorCode and a name-lookup map
|
|
// at runtime). Pinned as a string constant rather than a pq.ErrorCode
|
|
// literal so the contract is grep-able from operator-facing log lines.
|
|
//
|
|
// Reference: https://www.postgresql.org/docs/16/errcodes-appendix.html
|
|
const pgErrInvalidPassword = "28P01"
|
|
|
|
// NewDB opens a PostgreSQL database connection and sets up connection pooling.
|
|
//
|
|
// The pool size is hard-coded here for backward compatibility with call
|
|
// sites that don't pass an explicit limit. The Bundle 3 closure (D12)
|
|
// added NewDBWithMaxConns so the operator-facing CERTCTL_DATABASE_MAX_CONNS
|
|
// env var actually flows through to db.SetMaxOpenConns; cmd/server/main.go
|
|
// uses NewDBWithMaxConns now. Idle conns are kept at MaxOpenConns/5
|
|
// (rounded up to at least 1) so the ratio behavior pre-Bundle-3 (5 idle
|
|
// out of 25 max) generalizes to any operator-supplied limit.
|
|
func NewDB(connStr string) (*sql.DB, error) {
|
|
return NewDBWithMaxConns(connStr, 25)
|
|
}
|
|
|
|
// NewDBWithMaxConns opens a PostgreSQL database connection and sets the
|
|
// connection-pool max-open + max-idle limits.
|
|
//
|
|
// Bundle 3 closure (D12): pre-Bundle-3 the pool was hard-coded to
|
|
// SetMaxOpenConns(25) regardless of the operator's
|
|
// CERTCTL_DATABASE_MAX_CONNS setting. The config field was loaded by
|
|
// internal/config/config.go (validated to be >= 1), wired into values.yaml's
|
|
// `CERTCTL_DATABASE_MAX_CONNS: "25"` example comment, AND surfaced in
|
|
// docs/reference/configuration.md as if it took effect — but the
|
|
// underlying pool ignored it. Operators tuning for higher load found
|
|
// no behavioral change.
|
|
//
|
|
// Post-Bundle-3 the operator-facing knob actually flows here. The
|
|
// idle-pool size scales with maxOpen so the historical 1:5 ratio
|
|
// (5 idle of 25 max) carries forward to any operator-supplied size.
|
|
// Floors: maxOpen guaranteed >= 1 by config.Validate; maxIdle floored
|
|
// to 1 to avoid the 0-idle pathological case where every query opens
|
|
// a fresh connection.
|
|
func NewDBWithMaxConns(connStr string, maxOpen int) (*sql.DB, error) {
|
|
if maxOpen < 1 {
|
|
maxOpen = 1
|
|
}
|
|
db, err := sql.Open("postgres", connStr)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open database: %w", err)
|
|
}
|
|
|
|
// Configure connection pool.
|
|
db.SetMaxOpenConns(maxOpen)
|
|
maxIdle := maxOpen / 5
|
|
if maxIdle < 1 {
|
|
maxIdle = 1
|
|
}
|
|
db.SetMaxIdleConns(maxIdle)
|
|
|
|
// Ping to verify connection.
|
|
if err := db.Ping(); err != nil {
|
|
return nil, wrapPingError(err)
|
|
}
|
|
|
|
return db, nil
|
|
}
|
|
|
|
// wrapPingError converts a db.Ping() failure into an operator-friendly
|
|
// diagnostic. The default wrap is the original opaque
|
|
// `"failed to ping database: <inner>"` shape. The exception is SQLSTATE 28P01
|
|
// (invalid_password): when postgres rejects the server's credentials we emit
|
|
// extended guidance that names the most common operator misstep — editing
|
|
// POSTGRES_PASSWORD in `.env` after the postgres named volume has already
|
|
// been initialized — and lists both the destructive (`docker compose down -v`)
|
|
// and non-destructive (`ALTER ROLE`) remediations.
|
|
//
|
|
// U-1 (P1, GitHub #10): closes the audit-flagged
|
|
// cat-u-quickstart_postgres_password_volume_trap finding. The postgres
|
|
// docker-entrypoint runs initdb only when /var/lib/postgresql/data is empty;
|
|
// on subsequent boots the password baked into pg_authid on first boot wins
|
|
// over whatever the env var carries, so the env-vs-pg_authid divergence is
|
|
// intrinsic to how the postgres image bootstraps and cannot be fixed by us
|
|
// upstream of pg_authid. The ergonomic answer is to surface a clear
|
|
// diagnostic at the failure site so operators don't waste an hour on
|
|
// "is my password right" before discovering the volume needs to be torn
|
|
// down (or the role's password rotated in-place).
|
|
//
|
|
// The wrap chain is preserved via fmt.Errorf("%w", err) so callers using
|
|
// errors.As(err, &*pq.Error) on the returned value continue to work; this
|
|
// matches the audit's "no substring matching on err.Error()" requirement
|
|
// from the M-1 sentinel-error migration.
|
|
//
|
|
// Returns nil when err is nil so callers can defensively pipe through this
|
|
// helper without an extra branch.
|
|
func wrapPingError(err error) error {
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
var pqErr *pq.Error
|
|
if errors.As(err, &pqErr) && string(pqErr.Code) == pgErrInvalidPassword {
|
|
return fmt.Errorf(
|
|
"failed to ping database: postgres rejected the configured credentials "+
|
|
"(SQLSTATE %s — invalid_password). If you recently rotated POSTGRES_PASSWORD "+
|
|
"on a docker-compose deploy, the postgres container's data volume still "+
|
|
"holds the previous password: initdb seeds POSTGRES_PASSWORD into pg_authid "+
|
|
"only on first boot of a fresh data dir, so editing the env var after that "+
|
|
"point updates only the certctl-server container. Reset destructively with "+
|
|
"`docker compose -f deploy/docker-compose.yml down -v && "+
|
|
"docker compose -f deploy/docker-compose.yml up -d --build` (this DESTROYS "+
|
|
"all data in the postgres volume), or non-destructively with "+
|
|
"`docker compose -f deploy/docker-compose.yml exec postgres "+
|
|
"psql -U certctl -c \"ALTER ROLE certctl PASSWORD '<new-password>';\"` "+
|
|
"and then redeploy with the matching POSTGRES_PASSWORD. Underlying error: %w",
|
|
pgErrInvalidPassword, err)
|
|
}
|
|
|
|
return fmt.Errorf("failed to ping database: %w", err)
|
|
}
|
|
|
|
// migrationAdvisoryLockID is the Postgres pg_advisory_lock key that
|
|
// gates concurrent migration execution. Computed as a stable hash of
|
|
// the literal string "certctl-migrations" so the same constant resolves
|
|
// across deployments without colliding with operator-supplied advisory
|
|
// locks for other workloads on the same database.
|
|
//
|
|
// Bundle 4 closure (HIGH-1 + D4 + finding 4, 2026-05-13): pre-Bundle-4
|
|
// `RunMigrations` re-executed every `.up.sql` file on every server
|
|
// boot with no concurrency control. The "idempotency via IF NOT EXISTS
|
|
// / ON CONFLICT" contract in the CLAUDE.md operating rules held for
|
|
// every individual migration, but offered no protection when two
|
|
// server replicas executed RunMigrations against the same database
|
|
// simultaneously (the Helm chart's `server.replicas > 1` HA path):
|
|
// duplicate DDL races could still produce SQLSTATE 42P07 (relation
|
|
// already exists) under specific schema-modification interleavings,
|
|
// and any future migration that drifted from the IF-NOT-EXISTS
|
|
// pattern would race-corrupt without warning.
|
|
//
|
|
// Post-Bundle-4 RunMigrations acquires this advisory lock on a
|
|
// dedicated connection before scanning the directory; runner-up
|
|
// replicas block at pg_advisory_lock() until the migrator finishes,
|
|
// then observe the populated schema_migrations table and skip every
|
|
// already-applied file. Single-replica deploys see no behavior change
|
|
// beyond the schema_migrations audit trail and the lock acquire/
|
|
// release log lines.
|
|
const migrationAdvisoryLockID int64 = 7283164759461502341
|
|
|
|
// migrationsTableDDL creates the audit-trail table that records which
|
|
// migration files have already been applied. Idempotent: subsequent
|
|
// boots see the table, scan its contents, and skip any .up.sql whose
|
|
// version (the filename, e.g. "000001_initial_schema.up.sql") is
|
|
// already present.
|
|
//
|
|
// The version column intentionally stores the FULL filename rather
|
|
// than a parsed version-number prefix. Migrations have always been
|
|
// filename-keyed; storing the raw name keeps the lookup mechanical
|
|
// and survives any future renames (a hash-based fingerprint would
|
|
// re-run every migration on rename, an explicit numeric version
|
|
// breaks if a migration is ever renamed without renumbering).
|
|
const migrationsTableDDL = `
|
|
CREATE TABLE IF NOT EXISTS schema_migrations (
|
|
version TEXT PRIMARY KEY,
|
|
applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
)`
|
|
|
|
// RunMigrations reads and executes SQL migration files from a directory.
|
|
//
|
|
// Bundle 4 closure: now wraps every migration execution in:
|
|
//
|
|
// 1. A `pg_advisory_lock(migrationAdvisoryLockID)` so only one
|
|
// server replica applies migrations at a time. Other replicas
|
|
// block at acquire and proceed once migrations finish.
|
|
// 2. A `schema_migrations` audit table so each subsequent boot
|
|
// skips already-applied files instead of re-executing them.
|
|
// This converts the implicit "idempotency via IF NOT EXISTS"
|
|
// contract into an explicit applied-versions ledger an operator
|
|
// or acquirer can audit with `SELECT * FROM schema_migrations
|
|
// ORDER BY applied_at`.
|
|
//
|
|
// Single-replica deploys see zero behavior change beyond the new
|
|
// audit-table population. Multi-replica deploys (Helm
|
|
// `server.replicas > 1`) get HA-safe schema bootstrap.
|
|
//
|
|
// The advisory lock and audit table are managed through a dedicated
|
|
// *sql.Conn so the lock survives even if the underlying pool rotates
|
|
// connections; the deferred unlock is `LOCAL` to that connection.
|
|
// Failure modes:
|
|
//
|
|
// - Migrations directory missing: returned unchanged from the
|
|
// pre-Bundle-4 error path so existing callers ergonomics hold.
|
|
// - Lock acquire fails (network / shutdown): wrapped with context
|
|
// so the operator-visible error names the wrapping operation.
|
|
// - Individual migration fails: aborts with the same %s wrap as
|
|
// pre-Bundle-4, leaving schema_migrations untouched for that
|
|
// migration so a retry rerun reattempts only the failing file.
|
|
func RunMigrations(db *sql.DB, migrationsPath string) error {
|
|
// Check if migrations directory exists.
|
|
if _, err := os.Stat(migrationsPath); os.IsNotExist(err) {
|
|
return fmt.Errorf("migrations directory not found: %s", migrationsPath)
|
|
}
|
|
|
|
// Read all SQL files from the migrations directory.
|
|
files, err := os.ReadDir(migrationsPath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read migrations directory: %w", err)
|
|
}
|
|
|
|
// Sort and filter to only .up.sql migration files (skip .down.sql
|
|
// rollbacks and seed files). os.ReadDir returns entries in lexical
|
|
// order which matches the migration file naming convention
|
|
// (000001_, 000002_, ..., 000045_).
|
|
var sqlFiles []string
|
|
for _, file := range files {
|
|
if !file.IsDir() && strings.HasSuffix(file.Name(), ".up.sql") {
|
|
sqlFiles = append(sqlFiles, file.Name())
|
|
}
|
|
}
|
|
|
|
// Pin to a single connection for the lock-table-migrations
|
|
// lifecycle. pg_advisory_lock is connection-scoped: releasing it
|
|
// requires the SAME connection that acquired it. Using a pooled
|
|
// db.Exec for the unlock could land on a different connection and
|
|
// silently fail to release.
|
|
ctx := context.Background()
|
|
conn, err := db.Conn(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to acquire migration connection: %w", err)
|
|
}
|
|
defer conn.Close()
|
|
|
|
// Bundle 4: acquire the cross-replica advisory lock. Blocks until
|
|
// acquired; postgres queues the second replica behind the first.
|
|
if _, err := conn.ExecContext(ctx, `SELECT pg_advisory_lock($1)`, migrationAdvisoryLockID); err != nil {
|
|
return fmt.Errorf("failed to acquire migration advisory lock: %w", err)
|
|
}
|
|
defer func() {
|
|
// Best-effort release on the same connection. The connection
|
|
// will close anyway via the defer above, which also releases
|
|
// the lock at the postgres backend.
|
|
_, _ = conn.ExecContext(ctx, `SELECT pg_advisory_unlock($1)`, migrationAdvisoryLockID)
|
|
}()
|
|
|
|
// Bundle 4: ensure the audit table exists before consulting it.
|
|
// IF NOT EXISTS keeps this idempotent on every boot.
|
|
if _, err := conn.ExecContext(ctx, migrationsTableDDL); err != nil {
|
|
return fmt.Errorf("failed to ensure schema_migrations table: %w", err)
|
|
}
|
|
|
|
// Read the set of already-applied versions. We hold the advisory
|
|
// lock through the entire SELECT → apply → INSERT cycle, so no
|
|
// other replica can race-INSERT a duplicate row between this read
|
|
// and our subsequent writes.
|
|
applied := make(map[string]struct{})
|
|
rows, err := conn.QueryContext(ctx, `SELECT version FROM schema_migrations`)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read schema_migrations: %w", err)
|
|
}
|
|
for rows.Next() {
|
|
var v string
|
|
if err := rows.Scan(&v); err != nil {
|
|
rows.Close()
|
|
return fmt.Errorf("failed to scan schema_migrations row: %w", err)
|
|
}
|
|
applied[v] = struct{}{}
|
|
}
|
|
if err := rows.Err(); err != nil {
|
|
return fmt.Errorf("schema_migrations scan error: %w", err)
|
|
}
|
|
rows.Close()
|
|
|
|
// Execute each migration file in order, skipping already-applied
|
|
// versions.
|
|
for _, filename := range sqlFiles {
|
|
if _, ok := applied[filename]; ok {
|
|
// Already applied on a prior boot; skip.
|
|
continue
|
|
}
|
|
filePath := filepath.Join(migrationsPath, filename)
|
|
content, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read migration file %s: %w", filename, err)
|
|
}
|
|
|
|
// Execute the SQL content on the locked connection so any
|
|
// session-scoped state set by the migration (e.g. SET ROLE)
|
|
// stays within the migration's session.
|
|
if _, err := conn.ExecContext(ctx, string(content)); err != nil {
|
|
return fmt.Errorf("failed to execute migration %s: %w", filename, err)
|
|
}
|
|
|
|
// Record the applied version. ON CONFLICT (version) DO NOTHING
|
|
// is defense-in-depth: even though we hold the advisory lock
|
|
// (so no concurrent migrator should be writing here), a stale
|
|
// `schema_migrations` row from a half-rolled-back deploy
|
|
// shouldn't crash the boot.
|
|
if _, err := conn.ExecContext(ctx,
|
|
`INSERT INTO schema_migrations (version) VALUES ($1) ON CONFLICT (version) DO NOTHING`,
|
|
filename,
|
|
); err != nil {
|
|
return fmt.Errorf("failed to record applied migration %s in schema_migrations: %w", filename, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RunSeed reads and executes the baseline seed SQL file from the migrations
|
|
// directory. Designed to run AFTER RunMigrations so every column referenced by
|
|
// the seed is already in place.
|
|
//
|
|
// U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 the deploy compose stack
|
|
// mounted both a hand-curated subset of `migrations/*.up.sql` and `seed.sql`
|
|
// into postgres `/docker-entrypoint-initdb.d/`. Postgres applied them at
|
|
// initdb time. When `seed.sql` was updated to reference columns added by
|
|
// migrations *after* the mounted cutoff (e.g., `policy_rules.severity` from
|
|
// `000013_policy_rule_severity.up.sql`), initdb crashed during the seed step
|
|
// and the container was reported `unhealthy` indefinitely — bare
|
|
// `docker compose -f deploy/docker-compose.yml up -d --build` from a fresh
|
|
// clone of v2.0.50 hit this on the first try (GitHub #10 reopened by
|
|
// mikeakasully). Helm and the example compose files were already runtime-
|
|
// only (Path B) and worked through the same window.
|
|
//
|
|
// Post-U-3 the compose stack drops all initdb mounts; postgres comes up with
|
|
// an empty schema; the server applies all migrations via RunMigrations and
|
|
// then this function applies the seed. Single source of truth, removes the
|
|
// drift hazard architecturally.
|
|
//
|
|
// The seed file is expected at `<migrationsPath>/seed.sql`. Missing-file is
|
|
// treated as a no-op (returns nil) so deployments that explicitly remove the
|
|
// seed (custom packaging, cert-manager managed schemas) don't break.
|
|
//
|
|
// Idempotency: every INSERT in the shipped seed.sql uses
|
|
// `ON CONFLICT (id) DO NOTHING`, so re-running on a populated DB is safe.
|
|
// This function is invoked on every server start, so the contract MUST hold.
|
|
//
|
|
// Demo seed: `seed_demo.sql` is applied separately by RunDemoSeed below
|
|
// when CERTCTL_DEMO_SEED=true (see internal/config/config.go::DemoSeed).
|
|
// Splitting demo from baseline keeps a default deploy from accidentally
|
|
// landing 90-days-of-fake-history into a real customer database, while
|
|
// still giving the demo overlay a single source of truth (no more initdb
|
|
// mounts). The demo seed itself uses ON CONFLICT (id) DO NOTHING so it's
|
|
// idempotent; missing-file is also tolerated (custom packaging may strip
|
|
// seed_demo.sql to shrink the image).
|
|
func RunSeed(db *sql.DB, migrationsPath string) error {
|
|
if _, err := os.Stat(migrationsPath); os.IsNotExist(err) {
|
|
return fmt.Errorf("migrations directory not found: %s", migrationsPath)
|
|
}
|
|
|
|
seedPath := filepath.Join(migrationsPath, "seed.sql")
|
|
content, err := os.ReadFile(seedPath)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Missing seed.sql is acceptable — operators may have removed it
|
|
// for custom-packaging reasons. Return nil rather than fail-loud.
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to read seed file %s: %w", seedPath, err)
|
|
}
|
|
|
|
if _, err := db.Exec(string(content)); err != nil {
|
|
return fmt.Errorf("failed to execute seed file %s: %w", seedPath, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RunDemoSeed applies the demo overlay seed file
|
|
// (`<migrationsPath>/seed_demo.sql`) on top of the baseline seed.
|
|
//
|
|
// U-3 follow-on: pre-U-3 the demo overlay mounted `seed_demo.sql` into
|
|
// postgres `/docker-entrypoint-initdb.d/` and relied on initdb to apply it
|
|
// alongside the schema. Once U-3 dropped the initdb migration mounts, that
|
|
// path stopped working — postgres comes up empty, and the demo seed
|
|
// references tables (issuers, certificates, etc.) that wouldn't exist yet
|
|
// at initdb time. RunDemoSeed restores the demo capability through the
|
|
// same runtime path RunSeed uses, gated by CERTCTL_DEMO_SEED so production
|
|
// deploys never accidentally land the fake-history rows.
|
|
//
|
|
// Order contract: must run AFTER RunSeed so foreign-key references from
|
|
// demo rows to baseline rows (e.g., demo certificates referencing
|
|
// `rp-default` from baseline) resolve cleanly. The caller in
|
|
// cmd/server/main.go enforces this order.
|
|
//
|
|
// Missing-file is acceptable (returns nil) — operators packaging a
|
|
// production-only image often strip seed_demo.sql to shrink the artifact,
|
|
// and that should not break boot when CERTCTL_DEMO_SEED happens to be set.
|
|
//
|
|
// Idempotency: every INSERT in seed_demo.sql uses
|
|
// `ON CONFLICT (id) DO NOTHING`, so re-running on a populated DB is safe.
|
|
// Server restarts in demo mode therefore re-apply the file harmlessly.
|
|
func RunDemoSeed(db *sql.DB, migrationsPath string) error {
|
|
if _, err := os.Stat(migrationsPath); os.IsNotExist(err) {
|
|
return fmt.Errorf("migrations directory not found: %s", migrationsPath)
|
|
}
|
|
|
|
seedPath := filepath.Join(migrationsPath, "seed_demo.sql")
|
|
content, err := os.ReadFile(seedPath)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Custom production packaging frequently strips this file.
|
|
// Fail-soft to preserve the U-3 contract: a missing seed file
|
|
// must not gate server boot.
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to read demo seed file %s: %w", seedPath, err)
|
|
}
|
|
|
|
if _, err := db.Exec(string(content)); err != nil {
|
|
return fmt.Errorf("failed to execute demo seed file %s: %w", seedPath, err)
|
|
}
|
|
|
|
return nil
|
|
}
|