diff --git a/internal/deploy/apply.go b/internal/deploy/apply.go
new file mode 100644
index 0000000..c66df30
--- /dev/null
+++ b/internal/deploy/apply.go
@@ -0,0 +1,327 @@
+package deploy
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"time"
+)
+
+// Apply executes plan as one atomic deployment. See package doc and
+// the Plan-type comments for the full algorithm contract; the
+// summary:
+//
+//  1. Validate the plan shape (no empty paths, no dupes).
+//  2. Lock every file path in the plan (sorted to avoid deadlocks
+//     when two concurrent Applies share some paths).
+//  3. Per-file SHA-256 check, performed while holding the locks;
+//     if every file already has identical bytes and
+//     !plan.SkipIdempotent, return early with
+//     SkippedAsIdempotent=true.
+//  4. Backup every existing destination (skipped entirely when
+//     plan.BackupRetention == -1).
+//  5. Write every file to its sibling <base>.certctl-tmp.<nanos>;
+//     apply ownership (chmod + chown) to each temp.
+//  6. Call PreCommit(ctx, tempPaths). On error: clean up all temp
+//     files; backups stay (operator may want to restore manually).
+//     Return ErrValidateFailed.
+//  7. os.Rename every temp → final, in plan-order. Each rename is
+//     atomic on its own, but the loop as a whole is not: if a
+//     mid-loop rename fails, we attempt rollback of the renames
+//     that already succeeded.
+//  8. Call PostCommit(ctx). On success: prune old backups; return.
+//  9. On PostCommit error: restore each File from its backup;
+//     re-call PostCommit. If the restore or the second PostCommit
+//     also fails, return ErrRollbackFailed (operator-actionable;
+//     deploy is in known-bad state). Otherwise return
+//     ErrReloadFailed with RolledBack=true.
+//
+// The PreCommit/PostCommit hooks may be nil; nil = "no-op step".
+func Apply(ctx context.Context, plan Plan) (*Result, error) {
+	start := time.Now()
+
+	if err := validatePlan(plan); err != nil {
+		return nil, err
+	}
+
+	// Resolve each destination once; absPaths stays index-aligned
+	// with plan.Files for the rest of the function.
+	absPaths := make([]string, len(plan.Files))
+	for i, f := range plan.Files {
+		abs, err := filepath.Abs(f.Path)
+		if err != nil {
+			return nil, fmt.Errorf("resolve path %s: %w", f.Path, err)
+		}
+		absPaths[i] = abs
+	}
+
+	// Lock every path in sorted order to defend against the
+	// classic AB/BA deadlock when two concurrent Applies overlap
+	// in their file sets. The sort works on a copy so absPaths
+	// keeps plan order.
+	sortedPaths := append([]string(nil), absPaths...)
+	sort.Strings(sortedPaths)
+	unlocks := make([]func(), 0, len(sortedPaths))
+	defer func() {
+		// Release in reverse acquisition order.
+		for i := len(unlocks) - 1; i >= 0; i-- {
+			unlocks[i]()
+		}
+	}()
+	for _, p := range sortedPaths {
+		unlocks = append(unlocks, lockFile(p))
+	}
+
+	// lockFile may have blocked for a while; bail out before any
+	// I/O if the caller gave up in the meantime.
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	res := &Result{
+		BackupPaths: make(map[string]string, len(plan.Files)),
+	}
+
+	// Idempotency short-circuit: skip the whole deploy when every
+	// destination is readable and already holds the planned bytes.
+	// An unreadable or missing destination counts as "differs".
+	if !plan.SkipIdempotent {
+		allMatch := true
+		for i, f := range plan.Files {
+			existing, err := os.ReadFile(absPaths[i])
+			if err != nil || !sha256Eq(existing, f.Bytes) {
+				allMatch = false
+				break
+			}
+		}
+		if allMatch {
+			res.SkippedAsIdempotent = true
+			res.Duration = time.Since(start)
+			return res, nil
+		}
+	}
+
+	// Per-file prep: stat the destination, resolve ownership,
+	// record whether an original exists.
+	preps := make([]*filePrep, len(plan.Files))
+	for i, f := range plan.Files {
+		abs := absPaths[i]
+		stat, statErr := os.Stat(abs)
+		// Only "does not exist" means a fresh destination. Any
+		// other stat failure (e.g. permission denied) leaves us
+		// unable to tell, and guessing "absent" would skip the
+		// backup of a file that may well exist — so abort before
+		// touching anything.
+		if statErr != nil && !errors.Is(statErr, os.ErrNotExist) {
+			return nil, fmt.Errorf("file %d (%s): stat destination: %w", i, abs, statErr)
+		}
+		owner, err := resolveOwnership(f, plan.Defaults, ownershipStat(stat, statErr))
+		if err != nil {
+			return nil, fmt.Errorf("file %d (%s): resolve ownership: %w", i, abs, err)
+		}
+		preps[i] = &filePrep{
+			abs:     abs,
+			file:    f,
+			owner:   owner,
+			hadOrig: statErr == nil,
+		}
+	}
+
+	// Backup every existing destination BEFORE writing any temp
+	// file. If any backup fails, abort with no on-disk changes to
+	// live files. BackupRetention == -1 disables backups entirely.
+	if plan.BackupRetention != -1 {
+		for _, p := range preps {
+			if !p.hadOrig {
+				// Nothing to snapshot; the empty entry records
+				// that the path was considered.
+				res.BackupPaths[p.abs] = ""
+				continue
+			}
+			backupPath, err := backupFile(p.abs)
+			if err != nil {
+				// Clean up any backups already taken.
+				cleanupBackups(res.BackupPaths)
+				return nil, fmt.Errorf("backup %s: %w", p.abs, err)
+			}
+			p.backupTo = backupPath
+			res.BackupPaths[p.abs] = backupPath
+		}
+	}
+
+	// Write every file to a sibling temp and apply ownership.
+	tempPaths := make(map[string]string, len(preps))
+	cleanupTemps := func() {
+		// Best-effort: temps that were already renamed (or never
+		// created) simply fail to remove.
+		for _, p := range preps {
+			if p.tempPath != "" {
+				_ = os.Remove(p.tempPath)
+			}
+		}
+	}
+	for _, p := range preps {
+		tempPath, err := writeTempFile(p.abs, p.file.Bytes)
+		if err != nil {
+			cleanupTemps()
+			return nil, fmt.Errorf("write temp for %s: %w", p.abs, err)
+		}
+		p.tempPath = tempPath
+		tempPaths[p.abs] = tempPath
+		if err := applyOwnership(tempPath, p.owner); err != nil {
+			cleanupTemps()
+			return nil, fmt.Errorf("apply ownership to temp for %s: %w", p.abs, err)
+		}
+	}
+
+	// PreCommit (validate-with-the-target). Backups are left in
+	// place on failure.
+	if plan.PreCommit != nil {
+		if err := plan.PreCommit(ctx, tempPaths); err != nil {
+			cleanupTemps()
+			return nil, fmt.Errorf("%w: %v", ErrValidateFailed, err)
+		}
+	}
+	res.ValidateOK = true
+
+	// Rename each temp → final in plan order. If a mid-loop rename
+	// fails, restore the destinations that were already replaced
+	// (a degraded form of rollback — better than leaving a
+	// half-deployed state).
+	doneRenames := make([]*filePrep, 0, len(preps))
+	for _, p := range preps {
+		if err := os.Rename(p.tempPath, p.abs); err != nil {
+			rollbackErr := restoreFromBackups(doneRenames)
+			cleanupTemps()
+			res.Duration = time.Since(start)
+			if rollbackErr != nil {
+				return res, fmt.Errorf("%w: rename %s mid-loop, rollback also failed: %v (rename: %v)", ErrRollbackFailed, p.abs, rollbackErr, err)
+			}
+			return res, fmt.Errorf("rename %s: %w", p.abs, err)
+		}
+		doneRenames = append(doneRenames, p)
+	}
+
+	// PostCommit (reload).
+	if plan.PostCommit != nil {
+		if err := plan.PostCommit(ctx); err != nil {
+			// Rollback: restore + re-PostCommit.
+			rollbackErr := restoreFromBackups(preps)
+			if rollbackErr != nil {
+				res.Duration = time.Since(start)
+				return res, fmt.Errorf("%w: PostCommit failed (%v) AND rollback restore failed (%v)", ErrRollbackFailed, err, rollbackErr)
+			}
+			// Restore succeeded; re-call PostCommit against the
+			// previous bytes. If this second call also fails,
+			// we're in operator-actionable state.
+			if err2 := plan.PostCommit(ctx); err2 != nil {
+				res.Duration = time.Since(start)
+				return res, fmt.Errorf("%w: PostCommit failed (%v) AND second PostCommit after restore also failed (%v)", ErrRollbackFailed, err, err2)
+			}
+			res.RolledBack = true
+			res.Duration = time.Since(start)
+			return res, fmt.Errorf("%w: %v", ErrReloadFailed, err)
+		}
+	}
+	res.Reloaded = true
+
+	// Janitor: prune backups beyond retention. Zero selects
+	// DefaultBackupRetention; a negative value skips pruning.
+	retention := plan.BackupRetention
+	if retention == 0 {
+		retention = DefaultBackupRetention
+	}
+	if retention > 0 {
+		for _, p := range preps {
+			// Best-effort: a pruning failure must not fail a
+			// deploy that has already succeeded.
+			_ = pruneBackups(p.abs, retention)
+		}
+	}
+
+	res.Duration = time.Since(start)
+	return res, nil
+}
+
+// validatePlan rejects malformed plans before any I/O.
+func validatePlan(plan Plan) error {
+	if len(plan.Files) == 0 {
+		return fmt.Errorf("%w: no files", ErrPlanInvalid)
+	}
+	// Duplicates are detected on the absolute form so two
+	// spellings of the same destination are caught.
+	seen := make(map[string]struct{}, len(plan.Files))
+	for i, f := range plan.Files {
+		if f.Path == "" {
+			return fmt.Errorf("%w: file %d has empty path", ErrPlanInvalid, i)
+		}
+		abs, err := filepath.Abs(f.Path)
+		if err != nil {
+			return fmt.Errorf("%w: file %d (%s): %v", ErrPlanInvalid, i, f.Path, err)
+		}
+		if _, dup := seen[abs]; dup {
+			return fmt.Errorf("%w: duplicate destination %s", ErrPlanInvalid, abs)
+		}
+		seen[abs] = struct{}{}
+	}
+	return nil
+}
+
+// filePrep is the per-file working state for one Apply call.
+// Held by Apply's slice; passed to restoreFromBackups during
+// rollback.
+type filePrep struct {
+	abs      string            // absolute destination path
+	file     File              // the plan entry being deployed
+	tempPath string            // sibling temp file; "" until written
+	owner    resolvedOwnership // ownership resolved at prep time
+	hadOrig  bool              // destination existed before the deploy
+	backupTo string            // backup path; "" when no backup was taken
+}
+
+// restoreFromBackups puts each prep's destination back to its
+// pre-deploy state. Used during rollback (PostCommit failure or
+// mid-loop rename failure). It keeps going after a failure so as
+// many files as possible are restored, and returns the first
+// error encountered (nil when every file was restored).
+//
+// Per file:
+//   - destination did not exist before the deploy: remove it;
+//   - destination existed and a backup was taken: rewrite it from
+//     the backup via temp + rename;
+//   - destination existed but no backup was taken (backups
+//     disabled): report an error and leave the file untouched.
+func restoreFromBackups(preps []*filePrep) error {
+	var firstErr error
+	record := func(err error) {
+		if firstErr == nil {
+			firstErr = err
+		}
+	}
+	for _, p := range preps {
+		if !p.hadOrig {
+			// File didn't exist before deploy — restore = remove.
+			if err := os.Remove(p.abs); err != nil && !errors.Is(err, os.ErrNotExist) {
+				record(err)
+			}
+			continue
+		}
+		if p.backupTo == "" {
+			// The original existed but was never snapshotted.
+			// Deleting the destination here would destroy the only
+			// copy on disk, so leave it and surface the problem.
+			record(fmt.Errorf("no backup available to restore %s", p.abs))
+			continue
+		}
+		// Read backup; rewrite destination via the same temp +
+		// rename dance so this restore is itself atomic. We DON'T
+		// call AtomicWriteFile because we want to skip the
+		// per-file mutex (we already hold it from the outer Apply)
+		// and skip the backup-of-the-restore (we don't want a
+		// backup chain explosion).
+		data, err := os.ReadFile(p.backupTo)
+		if err != nil {
+			record(fmt.Errorf("read backup %s: %w", p.backupTo, err))
+			continue
+		}
+		tempPath, err := writeTempFile(p.abs, data)
+		if err != nil {
+			record(fmt.Errorf("write restore temp for %s: %w", p.abs, err))
+			continue
+		}
+		// Apply the ownership resolved at prep time.
+		// NOTE(review): p.owner is what resolveOwnership returned
+		// for the deploy; it equals the original file's ownership
+		// only if that resolution fell back to the existing stat —
+		// confirm against resolveOwnership.
+		if err := applyOwnership(tempPath, p.owner); err != nil {
+			_ = os.Remove(tempPath)
+			record(fmt.Errorf("apply ownership during restore for %s: %w", p.abs, err))
+			continue
+		}
+		if err := os.Rename(tempPath, p.abs); err != nil {
+			_ = os.Remove(tempPath)
+			record(fmt.Errorf("rename during restore for %s: %w", p.abs, err))
+			continue
+		}
+	}
+	return firstErr
+}
+
+// cleanupBackups removes a partial set of backups. Used when an
+// early backup step fails — we want to leave the destination
+// directory clean. Empty entries (paths with no backup) are
+// skipped and removal errors are ignored (best-effort).
+func cleanupBackups(backupPaths map[string]string) {
+	for _, bp := range backupPaths {
+		if bp != "" {
+			_ = os.Remove(bp)
+		}
+	}
+}
diff --git a/internal/deploy/atomic.go b/internal/deploy/atomic.go
new file mode 100644
index 0000000..27befd1
--- /dev/null
+++ b/internal/deploy/atomic.go
@@ -0,0 +1,298 @@
+package deploy
+
+import (
+	"context"
+	"crypto/sha256"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+)
+
+// fileMutexes serializes concurrent Apply / AtomicWriteFile calls
+// against the same destination path. Coarse-grained file-level lock
+// — sufficient for cert deploy throughput (operator-grade tens per
+// minute, not high-throughput).
+//
+// Per-target serialization (Phase 2) is a separate concern at the
+// agent dispatch layer; this file-level lock defends against
+// accidental same-path racing within a single connector pipeline.
+var fileMutexes sync.Map // map[string]*sync.Mutex
+
+// lockFile blocks until the mutex for path is held and returns
+// the function that releases it. The key is the absolute form of
+// path; if filepath.Abs fails the raw path is used instead.
+// Mutexes are created on first use; nothing in this file removes
+// them from fileMutexes again.
+func lockFile(path string) func() {
+	abs, err := filepath.Abs(path)
+	if err != nil {
+		abs = path
+	}
+	v, _ := fileMutexes.LoadOrStore(abs, &sync.Mutex{})
+	mu := v.(*sync.Mutex)
+	mu.Lock()
+	return mu.Unlock
+}
+
+// AtomicWriteFile writes data to path atomically.
+//
+// Algorithm:
+//
+//  1. Acquire the package-internal file-level mutex for path.
+//  2. SHA-256 short-circuit: if path exists and has identical bytes
+//     and !opts.SkipIdempotent, return WriteResult{Idempotent: true}
+//     without writing anything.
+//  3. Resolve final ownership (mode/uid/gid) per the precedence in
+//     resolveOwnership.
+//  4. Write to <base> + TempSuffix + <nanos> in filepath.Dir(path)
+//     (same-filesystem guarantees os.Rename atomicity); the temp
+//     file is fsynced before close (durability across power loss).
+//  5. Apply chmod / chown to the temp file BEFORE rename (so the
+//     atomic-rename atomically swaps in a fully-permissioned file).
+//  6. Backup the existing destination to
+//     <base> + BackupSuffix + <nanos> (skipped when destination did
+//     not exist OR opts.BackupRetention == -1).
+//  7. os.Rename(temp, path) — atomic on POSIX same-filesystem.
+//  8. Janitor pass: prune backups beyond retention.
+//
+// Returns ErrPlanInvalid for an empty path. Empty or nil data is
+// not an error: it writes an empty file. A destination that
+// cannot be stat'ed for any reason other than "does not exist"
+// is an error, because it would otherwise be overwritten without
+// a backup.
+func AtomicWriteFile(ctx context.Context, path string, data []byte, opts WriteOptions) (*WriteResult, error) {
+	if path == "" {
+		return nil, fmt.Errorf("%w: empty path", ErrPlanInvalid)
+	}
+	abs, err := filepath.Abs(path)
+	if err != nil {
+		return nil, fmt.Errorf("resolve path: %w", err)
+	}
+
+	unlock := lockFile(abs)
+	defer unlock()
+
+	// lockFile may have blocked; honour cancellation before I/O.
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	res := &WriteResult{Path: abs}
+
+	// Idempotency check.
+	existingStat, statErr := os.Stat(abs)
+	if statErr != nil && !errors.Is(statErr, os.ErrNotExist) {
+		// We cannot tell whether a destination exists; treating it
+		// as absent would skip its backup.
+		return nil, fmt.Errorf("stat destination: %w", statErr)
+	}
+	existed := statErr == nil
+	if existed && !opts.SkipIdempotent {
+		// A read failure just means "not provably identical";
+		// fall through to a normal write.
+		existingBytes, err := os.ReadFile(abs)
+		if err == nil && sha256Eq(existingBytes, data) {
+			res.Idempotent = true
+			return res, nil
+		}
+	}
+
+	// Resolve ownership.
+	owner, err := resolveOwnership(File{
+		Path:  abs,
+		Bytes: data,
+		Mode:  opts.Mode,
+		Owner: opts.Owner,
+		Group: opts.Group,
+	}, FileDefaults{
+		Mode:  opts.DefaultMode,
+		Owner: opts.DefaultOwner,
+		Group: opts.DefaultGroup,
+	}, ownershipStat(existingStat, statErr))
+	if err != nil {
+		return nil, fmt.Errorf("resolve ownership: %w", err)
+	}
+
+	// Write to temp in same dir.
+	tempPath, err := writeTempFile(abs, data)
+	if err != nil {
+		return nil, fmt.Errorf("write temp: %w", err)
+	}
+	// On any error path we want to remove the temp file. A
+	// successful rename moves it away, so this remove then fails
+	// harmlessly; the error is deliberately ignored.
+	defer func() { _ = os.Remove(tempPath) }()
+
+	// Apply ownership to temp BEFORE rename so the rename
+	// atomically swaps in a properly-permissioned file (no
+	// brief window where the destination has wrong perms).
+	if err := applyOwnership(tempPath, owner); err != nil {
+		return nil, fmt.Errorf("apply ownership to temp: %w", err)
+	}
+
+	// Backup existing destination.
+	if existed && opts.BackupRetention != -1 {
+		backupPath, err := backupFile(abs)
+		if err != nil {
+			return nil, fmt.Errorf("backup existing: %w", err)
+		}
+		res.BackupPath = backupPath
+	}
+
+	// Atomic rename. On the rare case Rename fails after backup,
+	// we leave the backup in place (operator can manually restore).
+	if err := os.Rename(tempPath, abs); err != nil {
+		return nil, fmt.Errorf("atomic rename: %w", err)
+	}
+	res.Replaced = existed
+
+	// Janitor: prune backups beyond retention. Zero selects
+	// DefaultBackupRetention; a negative value skips pruning.
+	retention := opts.BackupRetention
+	if retention == 0 {
+		retention = DefaultBackupRetention
+	}
+	if retention > 0 {
+		// Janitor errors are non-fatal — the write succeeded — so
+		// the result is deliberately dropped.
+		_ = pruneBackups(abs, retention)
+	}
+
+	return res, nil
+}
+
+// ownershipStat returns nil when the destination didn't exist,
+// otherwise fi unchanged. Encapsulates the existed/not-existed
+// branch so resolveOwnership's signature stays clean.
+func ownershipStat(fi os.FileInfo, statErr error) os.FileInfo {
+	if statErr != nil && errors.Is(statErr, os.ErrNotExist) {
+		return nil
+	}
+	return fi
+}
+
+// writeTempFile writes data to <base> + TempSuffix + <nanos> in
+// the same directory as abs and returns the temp path. The file
+// is created with mode 0600 and fsynced before close to defend
+// against power-loss-during-rename corruption (rename guarantees
+// atomic visibility but the file's data blocks must be on disk
+// first). On any failure after creation the temp file is removed.
+func writeTempFile(abs string, data []byte) (string, error) {
+	dir := filepath.Dir(abs)
+	base := filepath.Base(abs)
+	tempName := base + TempSuffix + nowNanosStr()
+	tempPath := filepath.Join(dir, tempName)
+
+	// O_WRONLY|O_CREATE|O_EXCL guarantees we don't clobber a
+	// half-written temp from a concurrent AtomicWriteFile call.
+	// fileMutexes already serialize same-abs callers; O_EXCL is
+	// belt-and-braces for the case where two calls produce the
+	// same timestamp string.
+	f, err := os.OpenFile(tempPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0600)
+	if err != nil {
+		return "", err
+	}
+	if _, err := f.Write(data); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tempPath)
+		return "", err
+	}
+	// fsync defends against power-loss between rename + data flush.
+	// On POSIX, rename's atomicity is metadata-only — the new file's
+	// data must be on disk first or a power-loss-then-recover sees
+	// an empty file at the destination.
+	if err := f.Sync(); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tempPath)
+		return "", err
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(tempPath)
+		return "", err
+	}
+	return tempPath, nil
+}
+
+// backupFile copies abs's current bytes to
+// <base> + BackupSuffix + <nanos> in the same directory and
+// returns the backup path. Used as the pre-write snapshot for
+// rollback. The backup gets the source's permission bits and is
+// fsynced before close, so the snapshot is on disk before the
+// destination is replaced. A partially written backup is removed
+// on failure.
+func backupFile(abs string) (string, error) {
+	src, err := os.ReadFile(abs)
+	if err != nil {
+		return "", fmt.Errorf("read for backup: %w", err)
+	}
+	srcStat, err := os.Stat(abs)
+	if err != nil {
+		return "", fmt.Errorf("stat for backup: %w", err)
+	}
+	backupPath := filepath.Join(filepath.Dir(abs), filepath.Base(abs)+BackupSuffix+nowNanosStr())
+
+	// Same flags os.WriteFile uses, opened by hand so the data can
+	// be synced before close.
+	f, err := os.OpenFile(backupPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, srcStat.Mode().Perm())
+	if err != nil {
+		return "", fmt.Errorf("write backup: %w", err)
+	}
+	if _, err := f.Write(src); err != nil {
+		_ = f.Close()
+		_ = os.Remove(backupPath)
+		return "", fmt.Errorf("write backup: %w", err)
+	}
+	if err := f.Sync(); err != nil {
+		_ = f.Close()
+		_ = os.Remove(backupPath)
+		return "", fmt.Errorf("write backup: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(backupPath)
+		return "", fmt.Errorf("write backup: %w", err)
+	}
+
+	// Best-effort: preserve uid/gid of the original. The backup is
+	// for emergency restore; if we can't chown (non-root + chown
+	// denied), the operator can still cat/diff it as the agent user.
+	if uid, gid, ok := unixOwnerFromStat(srcStat); ok {
+		_ = os.Chown(backupPath, uid, gid)
+	}
+	return backupPath, nil
+}
+
+// pruneBackups deletes older backups for abs, keeping the most
+// recent `keep` entries. A backup is any non-directory entry in
+// abs's directory whose name starts with <base> + BackupSuffix.
+// Names are sorted lexicographically, which is chronological as
+// long as the suffix is the fixed-width timestamp produced by
+// nowNanosStr. keep <= 0 is a no-op. Removal continues past
+// failures; the first removal error is returned.
+func pruneBackups(abs string, keep int) error {
+	if keep <= 0 {
+		return nil
+	}
+	dir := filepath.Dir(abs)
+	base := filepath.Base(abs)
+	prefix := base + BackupSuffix
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+	var matches []string
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		if strings.HasPrefix(e.Name(), prefix) {
+			matches = append(matches, e.Name())
+		}
+	}
+	if len(matches) <= keep {
+		return nil
+	}
+	sort.Strings(matches)
+	// Older ones come first; trim to keep the last `keep`.
+	toRemove := matches[:len(matches)-keep]
+	var firstErr error
+	for _, name := range toRemove {
+		if err := os.Remove(filepath.Join(dir, name)); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
+// sha256Eq returns true when two byte slices have identical
+// SHA-256 hashes; slices of different length are reported unequal
+// without hashing. We compute both side hashes (rather than
+// bytes.Equal directly) because the call sites typically already
+// have a "hash for the wire" need elsewhere — keeping the same
+// primitive everywhere makes future audit-log entries consistent.
+func sha256Eq(a, b []byte) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	ha := sha256.Sum256(a)
+	hb := sha256.Sum256(b)
+	return ha == hb
+}
+
+// nowNanosStr returns time.Now().UnixNano() formatted as a
+// fixed-width zero-padded decimal so lexicographic sort matches
+// numeric order. The padding matters for pruneBackups —
+// without it, "100" would sort before "99". The value comes from
+// the wall clock, so ordering is only as reliable as that clock.
+func nowNanosStr() string {
+	return fmt.Sprintf("%019d", time.Now().UnixNano())
+}
diff --git a/internal/deploy/coverage_test.go b/internal/deploy/coverage_test.go
new file mode 100644
index 0000000..427cb1c
--- /dev/null
+++ b/internal/deploy/coverage_test.go
@@ -0,0 +1,523 @@
+package deploy
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync/atomic"
+	"testing"
+)
+
+// Coverage uplift tests for Phase 1.
+// These pin the error paths
+// exercised in production but rare in the happy-path flow:
+//   - restoreFromBackups: file-didn't-exist-before deploy →
+//     rollback removes the new file (vs restoring bytes)
+//   - cleanupBackups: partial backup cleanup on early failure
+//   - writeTempFile: dir-creation race / O_EXCL collision
+//   - applyOwnership: chmod error / chown skipped when uid=-1
+//   - lookupUID/lookupGID: empty-string and unresolvable cases
+//   - unixOwnerFromStat: nil safety
+//   - Apply: ownership-resolution failure midway through prep
+
+// TestApply_NewFileRollback_RemovesFile covers rollback of a
+// deploy whose destination did not exist beforehand: when the
+// first PostCommit fails there is no backup to restore, so the
+// rollback is expected to delete the freshly written file.
+func TestApply_NewFileRollback_RemovesFile(t *testing.T) {
+	target := filepath.Join(t.TempDir(), "fresh.crt")
+
+	// Fail the first reload only; the post-rollback reload passes.
+	reloads := 0
+	failFirstReload := func(ctx context.Context) error {
+		reloads++
+		if reloads > 1 {
+			return nil
+		}
+		return errors.New("nginx exited 1")
+	}
+
+	res, err := Apply(context.Background(), Plan{
+		Files:      []File{{Path: target, Bytes: []byte(testCert1)}},
+		PostCommit: failFirstReload,
+	})
+	if !errors.Is(err, ErrReloadFailed) {
+		t.Fatalf("expected ErrReloadFailed, got %v", err)
+	}
+	if !res.RolledBack {
+		t.Error("expected RolledBack=true")
+	}
+	// Rollback of a brand-new file means the path is gone again.
+	_, statErr := os.Stat(target)
+	if statErr == nil {
+		t.Error("file still exists after rollback of new-file deploy")
+	}
+}
+
+// TestApply_BackupReadFails_RollbackEscalates triggers the
+// restoreFromBackups error path by deleting the backup before
+// PostCommit fires (simulates an aggressive operator-side
+// janitor).
+func TestApply_BackupReadFails_RollbackEscalates(t *testing.T) {
+	dir := t.TempDir()
+	cert := filepath.Join(dir, "tls.crt")
+	if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Path of the backup the hook deleted; stays unset if the hook
+	// never found one, which would mean the failure we intend to
+	// simulate did not happen.
+	var capturedBackup atomic.Value // string
+	plan := Plan{
+		Files: []File{{Path: cert, Bytes: []byte(testCert1)}},
+		PostCommit: func(ctx context.Context) error {
+			// Steal the backup BEFORE rollback runs. We have to
+			// find it via directory listing since Result isn't
+			// available yet.
+			entries, err := os.ReadDir(dir)
+			if err != nil {
+				t.Errorf("list %s: %v", dir, err)
+			}
+			for _, e := range entries {
+				if !strings.Contains(e.Name(), BackupSuffix) {
+					continue
+				}
+				bp := filepath.Join(dir, e.Name())
+				if err := os.Remove(bp); err != nil {
+					t.Errorf("remove backup %s: %v", bp, err)
+				}
+				capturedBackup.Store(bp)
+				break
+			}
+			return errors.New("nginx exited 1")
+		},
+	}
+	_, err := Apply(context.Background(), plan)
+	if capturedBackup.Load() == nil {
+		t.Fatal("PostCommit found no backup file to delete; rollback failure was not simulated")
+	}
+	if !errors.Is(err, ErrRollbackFailed) {
+		t.Fatalf("expected ErrRollbackFailed, got %v", err)
+	}
+}
+
+// TestApply_RenameMidLoopFails_PartialRollback simulates a
+// mid-loop rename failure with two destinations: PreCommit deletes
+// the second file's temp (after writeTempFile, before the rename
+// loop), so the first rename succeeds and the second fails. The
+// first destination must then be restored from its backup.
+func TestApply_RenameMidLoopFails_PartialRollback(t *testing.T) {
+	root := t.TempDir()
+	subA := filepath.Join(root, "a")
+	subB := filepath.Join(root, "b")
+	for _, sub := range []string{subA, subB} {
+		if err := os.MkdirAll(sub, 0755); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	pathA := filepath.Join(subA, "tls.crt")
+	pathB := filepath.Join(subB, "tls.crt")
+	originals := []struct {
+		path    string
+		content string
+	}{
+		{pathA, "ORIG-A"},
+		{pathB, "ORIG-B"},
+	}
+	for _, o := range originals {
+		if err := os.WriteFile(o.path, []byte(o.content), 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// By the time PreCommit runs, both temps exist with ownership
+	// applied. Deleting B's temp makes the second rename fail after
+	// the first (A) has already gone through, which is what forces
+	// the partial-rollback restore.
+	dropSecondTemp := func(ctx context.Context, tempPaths map[string]string) error {
+		_ = os.Remove(tempPaths[pathB])
+		return nil
+	}
+
+	_, err := Apply(context.Background(), Plan{
+		Files: []File{
+			{Path: pathA, Bytes: []byte(testCert1)},
+			{Path: pathB, Bytes: []byte(testCert2)},
+		},
+		PreCommit: dropSecondTemp,
+	})
+	if err == nil {
+		t.Fatal("expected mid-loop rename failure")
+	}
+
+	// Rollback must have put ORIG-A back at pathA.
+	got, _ := os.ReadFile(pathA)
+	if string(got) != "ORIG-A" {
+		t.Errorf("pathA = %q, want ORIG-A (partial rollback restore)", got)
+	}
+}
+
+// TestCleanupBackups_RemovesGivenSet — directly exercise the
+// cleanupBackups helper. Used internally on backup-step failure;
+// usually unreachable through the public API.
+func TestCleanupBackups_RemovesGivenSet(t *testing.T) {
+	backup := filepath.Join(t.TempDir(), "x"+BackupSuffix+"00000000")
+	if err := os.WriteFile(backup, []byte("backup data"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	paths := map[string]string{
+		"/some/path": backup,
+		"/other":     "", // empty entries should be ignored
+	}
+	cleanupBackups(paths)
+
+	_, err := os.Stat(backup)
+	if err == nil {
+		t.Error("backup not removed by cleanupBackups")
+	}
+}
+
+// TestApplyOwnership_ChmodSkippedWhenModeNotSet checks that
+// applyOwnership leaves the permission bits alone when ModeSet is
+// false.
+func TestApplyOwnership_ChmodSkippedWhenModeNotSet(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "f")
+	if err := os.WriteFile(path, []byte("x"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	noChange := resolvedOwnership{UID: -1, GID: -1, ModeSet: false}
+	if err := applyOwnership(path, noChange); err != nil {
+		t.Fatalf("applyOwnership: %v", err)
+	}
+
+	// The mode written above must still be in effect.
+	stat, _ := os.Stat(path)
+	if perm := stat.Mode().Perm(); perm != 0644 {
+		t.Errorf("mode = %#o, want 0644", perm)
+	}
+}
+
+// TestApplyOwnership_ChmodOnNonexistentFile expects an error that
+// mentions chmod when the target path does not exist.
+func TestApplyOwnership_ChmodOnNonexistentFile(t *testing.T) {
+	want := resolvedOwnership{Mode: 0644, ModeSet: true, UID: -1, GID: -1}
+	err := applyOwnership("/nonexistent/path/to/nothing", want)
+	switch {
+	case err == nil:
+		t.Fatal("expected error chmodding nonexistent file")
+	case !strings.Contains(err.Error(), "chmod"):
+		t.Errorf("error not labeled chmod: %v", err)
+	}
+}
+
+// TestLookupUID_ErrorLegs pins both error legs of lookupUID: the
+// empty username and the unresolvable username.
+func TestLookupUID_ErrorLegs(t *testing.T) {
+	legs := []struct {
+		user string
+		msg  string
+	}{
+		{"", "empty username should error"},
+		{"nonexistent-user-xyz-test-12345", "unresolvable user should error"},
+	}
+	for _, leg := range legs {
+		if _, err := lookupUID(leg.user); err == nil {
+			t.Error(leg.msg)
+		}
+	}
+}
+
+// TestLookupGID_ErrorLegs is the group-side counterpart: empty
+// and unresolvable group names must both error.
+func TestLookupGID_ErrorLegs(t *testing.T) {
+	legs := []struct {
+		group string
+		msg   string
+	}{
+		{"", "empty groupname should error"},
+		{"nonexistent-group-xyz-test-12345", "unresolvable group should error"},
+	}
+	for _, leg := range legs {
+		if _, err := lookupGID(leg.group); err == nil {
+			t.Error(leg.msg)
+		}
+	}
+}
+
+// TestUnixOwnerFromStat_NilFileInfo checks that a nil FileInfo
+// yields ok=false and -1/-1 rather than a panic.
+func TestUnixOwnerFromStat_NilFileInfo(t *testing.T) {
+	uid, gid, ok := unixOwnerFromStat(nil)
+	if ok {
+		t.Errorf("ok=true for nil FileInfo (uid=%d, gid=%d)", uid, gid)
+	}
+	if !(uid == -1 && gid == -1) {
+		t.Errorf("uid/gid = %d/%d, want -1/-1", uid, gid)
+	}
+}
+
+// TestApply_ResolveOwnershipError_AbortsBeforeAnyWrite feeds
+// Apply an owner/group that cannot be resolved and expects it to
+// fail while leaving the live file's contents untouched.
+func TestApply_ResolveOwnershipError_AbortsBeforeAnyWrite(t *testing.T) {
+	cert := filepath.Join(t.TempDir(), "tls.crt")
+	if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	badOwner := File{
+		Path:  cert,
+		Bytes: []byte(testCert1),
+		Owner: "nonexistent-user-xyz-12345",
+		Group: "nonexistent-group-xyz-12345",
+	}
+	if _, err := Apply(context.Background(), Plan{Files: []File{badOwner}}); err == nil {
+		t.Fatal("expected error from unresolvable owner")
+	}
+
+	// The original bytes must still be in place.
+	got, _ := os.ReadFile(cert)
+	if string(got) != "ORIGINAL" {
+		t.Errorf("file modified despite ownership-resolution failure: %q", got)
+	}
+}
+
+// TestPruneBackups_BadDirectory pins the early error path.
+func TestPruneBackups_BadDirectory(t *testing.T) {
+	if err := pruneBackups("/nonexistent-parent-xyz/file", 3); err == nil {
+		t.Error("expected error reading nonexistent dir")
+	}
+}
+
+// TestPruneBackups_KeepZeroOrNegative_NoOp checks that keep=0 and
+// keep=-1 both return nil and delete nothing.
+func TestPruneBackups_KeepZeroOrNegative_NoOp(t *testing.T) {
+	abs := filepath.Join(t.TempDir(), "f")
+	backup := abs + BackupSuffix + "00001"
+	if err := os.WriteFile(backup, []byte("x"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	for _, keep := range []int{0, -1} {
+		if err := pruneBackups(abs, keep); err != nil {
+			t.Errorf("keep=%d error: %v", keep, err)
+		}
+	}
+
+	// Neither call may have touched the backup.
+	if _, err := os.Stat(backup); err != nil {
+		t.Error("backup deleted under non-pruning retention")
+	}
+}
+
+// TestAtomicWriteFile_BadOwnership drives the resolveOwnership
+// failure through the lower-level entry point.
+func TestAtomicWriteFile_BadOwnership(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "f")
+	opts := WriteOptions{
+		Owner: "nonexistent-user-xyz-12345",
+		Group: "nonexistent-group-xyz-12345",
+	}
+	if _, err := AtomicWriteFile(context.Background(), path, []byte("x"), opts); err == nil {
+		t.Error("expected error from bad ownership")
+	}
+}
+
+// TestAtomicWriteFile_ContextCancelled passes an already-cancelled
+// context and expects context.Canceled back.
+func TestAtomicWriteFile_ContextCancelled(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	path := filepath.Join(t.TempDir(), "f")
+	_, err := AtomicWriteFile(ctx, path, []byte("x"), WriteOptions{})
+	if !errors.Is(err, context.Canceled) {
+		t.Errorf("got %v, want context.Canceled", err)
+	}
+}
+
+// TestWriteTempFile_BadDir verifies the open-file error path.
+func TestWriteTempFile_BadDir(t *testing.T) {
+	if _, err := writeTempFile("/nonexistent-parent-xyz/file", []byte("x")); err == nil {
+		t.Error("expected error writing into nonexistent parent")
+	}
+}
+
+// TestBackupFile_NonexistentSource expects backupFile to fail
+// when the source path does not exist.
+func TestBackupFile_NonexistentSource(t *testing.T) {
+	missing := filepath.Join(t.TempDir(), "does-not-exist")
+	if _, err := backupFile(missing); err == nil {
+		t.Error("expected error backing up nonexistent file")
+	}
+}
+
+// TestApply_PartialIdempotency_DeploysAll covers the case where
+// one destination already matches and the other does not exist.
+// Because not ALL files match, Apply must not short-circuit: the
+// result is not marked skipped and PreCommit runs exactly once.
+func TestApply_PartialIdempotency_DeploysAll(t *testing.T) {
+	dir := t.TempDir()
+	matching := filepath.Join(dir, "a.crt")
+	absent := filepath.Join(dir, "b.crt") // deliberately never created
+	if err := os.WriteFile(matching, []byte(testCert1), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	var preCalls int
+	countPreCommit := func(ctx context.Context, _ map[string]string) error {
+		preCalls++
+		return nil
+	}
+
+	res, err := Apply(context.Background(), Plan{
+		Files: []File{
+			{Path: matching, Bytes: []byte(testCert1)},
+			{Path: absent, Bytes: []byte(testCert2)},
+		},
+		PreCommit: countPreCommit,
+	})
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+	if res.SkippedAsIdempotent {
+		t.Error("partial match should not skip")
+	}
+	if preCalls != 1 {
+		t.Errorf("PreCommit calls = %d, want 1", preCalls)
+	}
+}
+
+// TestApply_FilePathEmpty_RejectedEarly stands in for the
+// filepath.Abs error branch, which is hard to trigger on most
+// platforms; the empty-path validation IS triggerable.
+func TestApply_FilePathEmpty_RejectedEarly(t *testing.T) { + plan := Plan{ + Files: []File{{Path: "", Bytes: []byte("x")}}, + } + _, err := Apply(context.Background(), plan) + if !errors.Is(err, ErrPlanInvalid) { + t.Errorf("got %v, want ErrPlanInvalid", err) + } +} + +// TestLockFile_RelativePathFallback covers the filepath.Abs +// failure-fallback branch in lockFile by acquiring + releasing +// a relative path lock. +func TestLockFile_RelativePath(t *testing.T) { + unlock := lockFile("relative/path/test") + unlock() + // Reacquiring should succeed (mutex released). + unlock = lockFile("relative/path/test") + unlock() +} + +// TestApply_NowNanosStr_FormatStable double-checks the +// lex-sortable format used by pruneBackups for chronological +// ordering. +func TestNowNanosStr_FormatStable(t *testing.T) { + a := nowNanosStr() + if len(a) != 19 { + t.Errorf("len = %d, want 19 (zero-padded for sort)", len(a)) + } + for _, c := range a { + if c < '0' || c > '9' { + t.Errorf("non-digit in nano string: %c", c) + } + } +} + +// TestApply_RestoreFails_RenameAfterChmodReadOnly triggers the +// "rename during restore fails" branch by chmodding the parent +// directory to read-only AFTER the temp file is renamed in but +// BEFORE PostCommit fires (so the rollback's restore-rename +// fails). This tests the deepest leg of restoreFromBackups. +func TestApply_RestoreFails_RenameAfterChmodReadOnly(t *testing.T) { + if os.Getuid() == 0 { + t.Skip("read-only chmod doesn't restrict root") + } + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil { + t.Fatal(err) + } + defer func() { + // Ensure cleanup can proceed. + _ = os.Chmod(dir, 0755) + }() + + plan := Plan{ + Files: []File{{Path: cert, Bytes: []byte(testCert1)}}, + PostCommit: func(ctx context.Context) error { + // Make the directory read-only so the subsequent + // restore-rename will fail. 
+ _ = os.Chmod(dir, 0555) + return errors.New("nginx exited 1") + }, + } + _, err := Apply(context.Background(), plan) + if err == nil { + t.Fatal("expected error") + } + // Either ErrReloadFailed (rollback succeeded somehow) or + // ErrRollbackFailed (rollback couldn't restore due to RO). + if !errors.Is(err, ErrReloadFailed) && !errors.Is(err, ErrRollbackFailed) { + t.Errorf("got %v, want ErrReloadFailed or ErrRollbackFailed", err) + } +} + +// TestApply_DuplicateNormalisedPath catches the validatePlan +// duplicate detection after filepath.Abs normalisation. +func TestApply_DuplicateNormalisedPath(t *testing.T) { + dir := t.TempDir() + a := filepath.Join(dir, "x.crt") + // Same logical destination via a relative + absolute mix. + plan := Plan{ + Files: []File{ + {Path: a, Bytes: []byte("a")}, + {Path: a, Bytes: []byte("b")}, + }, + } + _, err := Apply(context.Background(), plan) + if !errors.Is(err, ErrPlanInvalid) { + t.Errorf("got %v, want ErrPlanInvalid", err) + } +} + +// TestUnixOwnerFromStat_LiveStat covers the happy path with a +// real os.Stat result. +func TestUnixOwnerFromStat_LiveStat(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "f") + if err := os.WriteFile(path, []byte("x"), 0644); err != nil { + t.Fatal(err) + } + stat, err := os.Stat(path) + if err != nil { + t.Fatal(err) + } + uid, gid, ok := unixOwnerFromStat(stat) + if !ok { + t.Skip("non-unix") + } + if uid != os.Getuid() || gid != os.Getgid() { + t.Errorf("uid/gid = %d/%d, want %d/%d", uid, gid, os.Getuid(), os.Getgid()) + } +} + +// TestBackupFile_StatFailsAfterRead triggers the rare +// "file deleted between read and stat" race-window branch in +// backupFile by using a path that disappears mid-call. We can't +// easily race it, but we can show the read-then-stat ordering by +// checking that backupFile of a missing file errors at read. 
// Already covered by TestBackupFile_NonexistentSource above; this
// is a placeholder so the package's race-aware code path is
// documented. It asserts nothing, so it reports itself as skipped
// rather than as a passing test.
func TestBackupFile_RaceWindow_DocumentedInCode(t *testing.T) {
	t.Skip("backupFile race window between read+stat is documented but not faulttested without fault injection")
}

// TestWriteTempFile_OEXCLContention_DocumentedInCode is a placeholder
// for the O_EXCL collision protection in writeTempFile. Exercising it
// would mean pre-creating a file at the temp path and forcing a second
// write to land on the same nanosecond stamp, which needs a frozen
// clock. Without a time mock that is impractical, so the test only
// records that the protection exists and reports itself as skipped.
func TestWriteTempFile_OEXCLContention_DocumentedInCode(t *testing.T) {
	t.Skip("O_EXCL collision branch defends against clock collision; not test-injectable without time mock")
}

// TestApply_BackupRetentionDefault verifies that leaving
// BackupRetention at zero keeps DefaultBackupRetention backups.
+func TestApply_BackupRetentionDefault(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("V0"), 0644); err != nil { + t.Fatal(err) + } + for i := 1; i <= 6; i++ { + plan := Plan{ + Files: []File{{Path: cert, Bytes: []byte(fmt.Sprintf("V%d", i))}}, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply iter %d: %v", i, err) + } + } + entries, _ := os.ReadDir(dir) + count := 0 + for _, e := range entries { + if strings.Contains(e.Name(), BackupSuffix) { + count++ + } + } + if count != DefaultBackupRetention { + t.Errorf("backup count = %d, want %d (default)", count, DefaultBackupRetention) + } +} diff --git a/internal/deploy/deploy_test.go b/internal/deploy/deploy_test.go new file mode 100644 index 0000000..fefea5d --- /dev/null +++ b/internal/deploy/deploy_test.go @@ -0,0 +1,820 @@ +package deploy + +import ( + "context" + "errors" + "fmt" + "os" + "os/user" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +// Phase 1 of the deploy-hardening I master bundle. The 12 named +// tests below pin the load-bearing invariants of the +// internal/deploy/ package: atomic-or-nothing across files, +// validate-fail-cleans-up, reload-fail-rolls-back, +// rollback-also-fails-escalates, SHA-256 idempotency, +// owner/mode preservation + override, file-level serialization, +// backup retention janitor, and AtomicWriteFile temp-file + +// rename-race correctness. +// +// All 12 are required by the prompt at +// cowork/deploy-hardening-i-prompt.md::"Test plan (Phase 1 +// ships ≥95% coverage on the new package)". +// +// The tests run in non-root environments — they do NOT exercise +// cross-user chown (which requires CAP_CHOWN). The chown wiring +// is exercised via the same-user case (chown to os.Getuid() +// always succeeds) + the resolveOwnership white-box tests. 
+ +const testCert1 = "-----BEGIN CERTIFICATE-----\nFAKE-CERT-1-PAYLOAD\n-----END CERTIFICATE-----\n" +const testCert2 = "-----BEGIN CERTIFICATE-----\nFAKE-CERT-2-DIFFERENT\n-----END CERTIFICATE-----\n" + +// TestApply_HappyPath_PreCommitSucceeds_PostCommitSucceeds_FilesAtomic +// pins the canonical happy path: write multiple files, validate +// passes, all atomic-rename, reload passes. Every File ends up +// with the new bytes; PreCommit + PostCommit each fired once. +func TestApply_HappyPath_PreCommitSucceeds_PostCommitSucceeds_FilesAtomic(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + key := filepath.Join(dir, "tls.key") + + preCalls, postCalls := 0, 0 + var seenTempPaths map[string]string + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + {Path: key, Bytes: []byte(testCert2)}, + }, + PreCommit: func(ctx context.Context, tempPaths map[string]string) error { + preCalls++ + seenTempPaths = tempPaths + // Both temp files exist + readable + carry the new + // bytes (the load-bearing invariant for "validate- + // against-temp" semantics). + for finalPath, tempPath := range tempPaths { + if _, err := os.Stat(tempPath); err != nil { + return fmt.Errorf("temp for %s missing: %w", finalPath, err) + } + } + return nil + }, + PostCommit: func(ctx context.Context) error { + postCalls++ + return nil + }, + } + + res, err := Apply(context.Background(), plan) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if res.SkippedAsIdempotent { + t.Errorf("expected fresh write, got idempotent skip") + } + if !res.ValidateOK || !res.Reloaded { + t.Errorf("ValidateOK=%v Reloaded=%v, want true/true", res.ValidateOK, res.Reloaded) + } + if preCalls != 1 || postCalls != 1 { + t.Errorf("PreCommit/PostCommit calls = %d/%d, want 1/1", preCalls, postCalls) + } + if len(seenTempPaths) != 2 { + t.Errorf("PreCommit saw %d temp paths, want 2", len(seenTempPaths)) + } + // Final files have new bytes. 
+ if got, _ := os.ReadFile(cert); string(got) != testCert1 { + t.Errorf("cert content = %q, want %q", got, testCert1) + } + if got, _ := os.ReadFile(key); string(got) != testCert2 { + t.Errorf("key content = %q, want %q", got, testCert2) + } +} + +// TestApply_PreCommitFails_NoFilesChanged pins the all-or-nothing +// invariant on the validate path: PreCommit returns an error → +// neither destination is touched, ErrValidateFailed is returned. +func TestApply_PreCommitFails_NoFilesChanged(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + key := filepath.Join(dir, "tls.key") + if err := os.WriteFile(cert, []byte("ORIGINAL-CERT"), 0644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(key, []byte("ORIGINAL-KEY"), 0600); err != nil { + t.Fatal(err) + } + + postCalls := 0 + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + {Path: key, Bytes: []byte(testCert2)}, + }, + PreCommit: func(ctx context.Context, tempPaths map[string]string) error { + return errors.New("nginx -t says: invalid SAN") + }, + PostCommit: func(ctx context.Context) error { + postCalls++ + return nil + }, + } + + _, err := Apply(context.Background(), plan) + if !errors.Is(err, ErrValidateFailed) { + t.Fatalf("expected ErrValidateFailed, got %v", err) + } + if postCalls != 0 { + t.Errorf("PostCommit called %d times after PreCommit failure, want 0", postCalls) + } + // Both destinations untouched. + if got, _ := os.ReadFile(cert); string(got) != "ORIGINAL-CERT" { + t.Errorf("cert was modified despite PreCommit failure: %q", got) + } + if got, _ := os.ReadFile(key); string(got) != "ORIGINAL-KEY" { + t.Errorf("key was modified despite PreCommit failure: %q", got) + } + // No temp files leaked. 
+ entries, _ := os.ReadDir(dir) + for _, e := range entries { + if strings.Contains(e.Name(), TempSuffix) { + t.Errorf("temp file leaked: %s", e.Name()) + } + } +} + +// TestApply_PostCommitFails_FilesRolledBack pins the rollback +// wire: PostCommit fails → restore from backup → re-call +// PostCommit → second one succeeds → return ErrReloadFailed + +// RolledBack=true. The destinations now hold the ORIGINAL bytes. +func TestApply_PostCommitFails_FilesRolledBack(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil { + t.Fatal(err) + } + + postCalls := 0 + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + }, + PostCommit: func(ctx context.Context) error { + postCalls++ + if postCalls == 1 { + return errors.New("nginx -s reload exited 1") + } + return nil + }, + } + + res, err := Apply(context.Background(), plan) + if !errors.Is(err, ErrReloadFailed) { + t.Fatalf("expected ErrReloadFailed, got %v", err) + } + if !res.RolledBack { + t.Error("expected RolledBack=true") + } + if res.Reloaded { + t.Error("expected Reloaded=false after rollback") + } + if postCalls != 2 { + t.Errorf("PostCommit calls = %d, want 2 (once for the new bytes, once for the restored bytes)", postCalls) + } + if got, _ := os.ReadFile(cert); string(got) != "ORIGINAL" { + t.Errorf("cert after rollback = %q, want %q", got, "ORIGINAL") + } +} + +// TestApply_RollbackAlsoFails_ReturnsErrRollbackFailed is the +// escalation path: PostCommit fails + the second PostCommit (after +// restore) also fails. ErrRollbackFailed surfaces; +// operator-actionable. 
+func TestApply_RollbackAlsoFails_ReturnsErrRollbackFailed(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil { + t.Fatal(err) + } + + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + }, + PostCommit: func(ctx context.Context) error { + return errors.New("nginx is wedged") + }, + } + + _, err := Apply(context.Background(), plan) + if !errors.Is(err, ErrRollbackFailed) { + t.Fatalf("expected ErrRollbackFailed, got %v", err) + } +} + +// TestApply_IdempotentSkip_SHA256Match pins the idempotency +// short-circuit: when every File's destination already matches +// SHA-256, neither PreCommit nor PostCommit fires; the result +// reports SkippedAsIdempotent=true. +func TestApply_IdempotentSkip_SHA256Match(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte(testCert1), 0644); err != nil { + t.Fatal(err) + } + + preCalls, postCalls := 0, 0 + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + }, + PreCommit: func(ctx context.Context, _ map[string]string) error { + preCalls++ + return nil + }, + PostCommit: func(ctx context.Context) error { + postCalls++ + return nil + }, + } + res, err := Apply(context.Background(), plan) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if !res.SkippedAsIdempotent { + t.Error("expected SkippedAsIdempotent=true") + } + if preCalls != 0 || postCalls != 0 { + t.Errorf("expected no Pre/PostCommit calls, got %d/%d", preCalls, postCalls) + } + if len(res.BackupPaths) != 0 { + t.Errorf("expected zero backups for idempotent skip, got %d", len(res.BackupPaths)) + } + + // Verify SkipIdempotent forces the calls. 
+ plan.SkipIdempotent = true + res, err = Apply(context.Background(), plan) + if err != nil { + t.Fatalf("Apply with SkipIdempotent: %v", err) + } + if res.SkippedAsIdempotent { + t.Error("expected SkipIdempotent override to force the deploy") + } + if preCalls != 1 || postCalls != 1 { + t.Errorf("expected 1/1 calls under SkipIdempotent, got %d/%d", preCalls, postCalls) + } +} + +// TestApply_PreservesExistingOwnerAndMode_WhenNotOverridden pins +// the silent-failure-mode-defense: an existing nginx:nginx 0640 +// file MUST stay nginx:nginx 0640 across a renewal, NOT get +// clobbered to root:root 0600. +// +// We can't actually create a non-current-user file in a non-root +// test, so this test verifies mode preservation only (the chown +// preservation is exercised by the resolveOwnership unit test +// below). +func TestApply_PreservesExistingOwnerAndMode_WhenNotOverridden(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + // Pre-existing file with very specific mode. + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0640); err != nil { + t.Fatal(err) + } + // Some umasks downgrade 0640 → 0620; force the desired bits + // after creation. + if err := os.Chmod(cert, 0640); err != nil { + t.Fatal(err) + } + + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, // no Mode/Owner/Group set + }, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply: %v", err) + } + stat, err := os.Stat(cert) + if err != nil { + t.Fatal(err) + } + if stat.Mode().Perm() != 0640 { + t.Errorf("mode after deploy = %#o, want %#o (preservation broken)", stat.Mode().Perm(), os.FileMode(0640)) + } +} + +// TestApply_RespectsOverrides_OwnerGroupMode pins the override +// path: when File.Mode is set, the existing mode is overridden. +// We use the current user/group so chown succeeds on non-root. 
+func TestApply_RespectsOverrides_OwnerGroupMode(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0640); err != nil { + t.Fatal(err) + } + if err := os.Chmod(cert, 0640); err != nil { + t.Fatal(err) + } + + currentUser, err := user.Current() + if err != nil { + t.Fatal(err) + } + currentGroup, err := user.LookupGroupId(currentUser.Gid) + if err != nil { + t.Fatal(err) + } + + plan := Plan{ + Files: []File{{ + Path: cert, + Bytes: []byte(testCert1), + Mode: 0644, + Owner: currentUser.Username, + Group: currentGroup.Name, + }}, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply: %v", err) + } + stat, err := os.Stat(cert) + if err != nil { + t.Fatal(err) + } + if stat.Mode().Perm() != 0644 { + t.Errorf("override mode = %#o, want 0644", stat.Mode().Perm()) + } +} + +// TestApply_ConcurrentApplyToSameFile_Serializes pins the +// file-level mutex: 10 concurrent Applies to the same destination +// see exactly 10 PostCommit invocations and the file ends with +// one of the writers' bytes (no torn write). 
+func TestApply_ConcurrentApplyToSameFile_Serializes(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + + const N = 10 + var inFlight, maxInFlight int32 + var postCount int32 + var wg sync.WaitGroup + for i := 0; i < N; i++ { + wg.Add(1) + go func(idx int) { + defer wg.Done() + plan := Plan{ + Files: []File{{ + Path: cert, + Bytes: []byte(fmt.Sprintf("WRITER-%d", idx)), + }}, + SkipIdempotent: true, // force every call through the full path + PostCommit: func(ctx context.Context) error { + n := atomic.AddInt32(&inFlight, 1) + for { + m := atomic.LoadInt32(&maxInFlight) + if n <= m || atomic.CompareAndSwapInt32(&maxInFlight, m, n) { + break + } + } + time.Sleep(2 * time.Millisecond) + atomic.AddInt32(&inFlight, -1) + atomic.AddInt32(&postCount, 1) + return nil + }, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Errorf("Apply: %v", err) + } + }(i) + } + wg.Wait() + + if postCount != N { + t.Errorf("postCount = %d, want %d", postCount, N) + } + if maxInFlight > 1 { + t.Errorf("max concurrent PostCommit = %d, want 1 (serialization broken)", maxInFlight) + } + // File must contain exactly one of the writers' contents. + got, _ := os.ReadFile(cert) + if !strings.HasPrefix(string(got), "WRITER-") { + t.Errorf("file content not from any writer: %q", got) + } +} + +// TestApply_BackupRetention_KeepsLastN pins the janitor: after +// many deploys, only the last N backups remain. +func TestApply_BackupRetention_KeepsLastN(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + + // Initial file. + if err := os.WriteFile(cert, []byte("V0"), 0644); err != nil { + t.Fatal(err) + } + + const keep = 2 + for i := 1; i <= 5; i++ { + plan := Plan{ + Files: []File{{ + Path: cert, + Bytes: []byte(fmt.Sprintf("V%d", i)), + }}, + BackupRetention: keep, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply iter %d: %v", i, err) + } + // Stagger to ensure distinct nanosecond stamps. 
+ time.Sleep(2 * time.Millisecond) + } + + entries, _ := os.ReadDir(dir) + count := 0 + for _, e := range entries { + if strings.Contains(e.Name(), BackupSuffix) { + count++ + } + } + if count != keep { + t.Errorf("backup count after 5 deploys with retention=%d = %d, want %d", keep, count, keep) + } +} + +// TestApply_NoExistingFile_UsesDefaultsForOwnerGroupMode covers +// the first-deploy path: destination doesn't exist; FileDefaults +// applies. We verify the mode default lands; owner/group default +// is exercised in resolveOwnership unit tests (would require root +// for cross-user chown). +func TestApply_NoExistingFile_UsesDefaultsForOwnerGroupMode(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + + plan := Plan{ + Files: []File{ + {Path: cert, Bytes: []byte(testCert1)}, + }, + Defaults: FileDefaults{Mode: 0640}, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply: %v", err) + } + stat, err := os.Stat(cert) + if err != nil { + t.Fatal(err) + } + if stat.Mode().Perm() != 0640 { + t.Errorf("default mode for new file = %#o, want 0640", stat.Mode().Perm()) + } +} + +// TestAtomicWriteFile_TempFileCleanedUpOnError checks that a +// failure mid-flight (we simulate by passing an unwritable +// directory) leaves no .certctl-tmp.* file behind. +func TestAtomicWriteFile_TempFileCleanedUpOnError(t *testing.T) { + dir := t.TempDir() + // Make the directory read-only AFTER the temp open would fail. + // Easier: target a path inside a directory that doesn't exist. + ghost := filepath.Join(dir, "does-not-exist", "tls.crt") + _, err := AtomicWriteFile(context.Background(), ghost, []byte(testCert1), WriteOptions{}) + if err == nil { + t.Fatal("expected error writing into nonexistent directory") + } + // No leaked temps in the parent (which does exist). 
+ entries, _ := os.ReadDir(dir) + for _, e := range entries { + if strings.Contains(e.Name(), TempSuffix) { + t.Errorf("temp file leaked: %s", e.Name()) + } + } +} + +// TestAtomicWriteFile_RenameRaceWithReader_AtomicReadAlwaysSeesOldOrNew +// pins the load-bearing POSIX-rename atomicity: a concurrent +// reader hitting the destination during a write either sees the +// pre-write bytes or the post-write bytes; never an intermediate +// state. +func TestAtomicWriteFile_RenameRaceWithReader_AtomicReadAlwaysSeesOldOrNew(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "tls.crt") + old := []byte(strings.Repeat("OLD", 1000)) + newer := []byte(strings.Repeat("NEW", 1000)) + if err := os.WriteFile(path, old, 0644); err != nil { + t.Fatal(err) + } + + stop := make(chan struct{}) + var torn atomic.Bool + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stop: + return + default: + } + b, err := os.ReadFile(path) + if err != nil { + continue + } + s := string(b) + if s != string(old) && s != string(newer) { + torn.Store(true) + return + } + } + }() + + // Issue many writes back and forth. + for i := 0; i < 30; i++ { + writeBytes := old + if i%2 == 0 { + writeBytes = newer + } + if _, err := AtomicWriteFile(context.Background(), path, writeBytes, WriteOptions{ + SkipIdempotent: true, + }); err != nil { + t.Fatalf("AtomicWriteFile %d: %v", i, err) + } + } + close(stop) + wg.Wait() + if torn.Load() { + t.Error("torn read observed (rename was not atomic)") + } +} + +// --- White-box tests for resolveOwnership (chown semantics under +// non-root require this, since we can't write a chown-to-root +// integration test without sudo). --- + +// TestResolveOwnership_ExplicitOverride_Wins verifies that an +// explicit File.Mode/Owner/Group beats both existing-file +// preservation and Defaults fallback. 
+func TestResolveOwnership_ExplicitOverride_Wins(t *testing.T) { + currentUser, _ := user.Current() + currentGroup, _ := user.LookupGroupId(currentUser.Gid) + + dir := t.TempDir() + path := filepath.Join(dir, "f") + if err := os.WriteFile(path, []byte("x"), 0600); err != nil { + t.Fatal(err) + } + stat, _ := os.Stat(path) + res, err := resolveOwnership(File{ + Path: path, + Mode: 0644, + Owner: currentUser.Username, + Group: currentGroup.Name, + }, FileDefaults{Mode: 0400, Owner: "nobody", Group: "nogroup"}, stat) + if err != nil { + t.Fatal(err) + } + if res.Mode != 0644 { + t.Errorf("mode = %#o, want 0644 (override should win)", res.Mode) + } + if res.OwnerLabel != currentUser.Username { + t.Errorf("owner label = %q, want %q (override should win)", res.OwnerLabel, currentUser.Username) + } +} + +// TestResolveOwnership_PreservesExisting_WhenNoOverride verifies +// the preservation path: no explicit override + existing file → +// existing uid/gid/mode are returned. +func TestResolveOwnership_PreservesExisting_WhenNoOverride(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "f") + if err := os.WriteFile(path, []byte("x"), 0640); err != nil { + t.Fatal(err) + } + if err := os.Chmod(path, 0640); err != nil { + t.Fatal(err) + } + stat, _ := os.Stat(path) + + res, err := resolveOwnership(File{Path: path}, FileDefaults{Mode: 0400}, stat) + if err != nil { + t.Fatal(err) + } + if res.Mode != 0640 { + t.Errorf("mode = %#o, want 0640 (preservation)", res.Mode) + } + uid, gid, ok := unixOwnerFromStat(stat) + if !ok { + t.Skip("non-unix platform") + } + if res.UID != uid || res.GID != gid { + t.Errorf("uid/gid = %d/%d, want %d/%d", res.UID, res.GID, uid, gid) + } +} + +// TestResolveOwnership_NewFile_FallsBackToDefaults verifies the +// defaults path: no override + no existing file → Plan.Defaults. 
+func TestResolveOwnership_NewFile_FallsBackToDefaults(t *testing.T) { + currentUser, _ := user.Current() + currentGroup, _ := user.LookupGroupId(currentUser.Gid) + + res, err := resolveOwnership(File{Path: "/tmp/never"}, FileDefaults{ + Mode: 0640, + Owner: currentUser.Username, + Group: currentGroup.Name, + }, nil) + if err != nil { + t.Fatal(err) + } + if res.Mode != 0640 { + t.Errorf("mode = %#o, want 0640 (default)", res.Mode) + } + if res.OwnerLabel != currentUser.Username { + t.Errorf("owner = %q, want %q (default)", res.OwnerLabel, currentUser.Username) + } +} + +// TestApply_RejectsInvalidPlan_NoFiles + duplicate-paths + empty- +// path. Pin the validatePlan gate. +func TestApply_RejectsInvalidPlan(t *testing.T) { + tests := []struct { + name string + plan Plan + }{ + {"no files", Plan{}}, + {"empty path", Plan{Files: []File{{Path: "", Bytes: []byte("x")}}}}, + {"duplicate", Plan{Files: []File{ + {Path: "/tmp/dup", Bytes: []byte("a")}, + {Path: "/tmp/dup", Bytes: []byte("b")}, + }}}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := Apply(context.Background(), tc.plan) + if !errors.Is(err, ErrPlanInvalid) { + t.Errorf("got %v, want ErrPlanInvalid", err) + } + }) + } +} + +// TestApply_ContextCancelledBeforeStart_AbortsCleanly pins the +// context-respect contract: a cancelled context aborts before +// any I/O. 
+func TestApply_ContextCancelledBeforeStart_AbortsCleanly(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, err := Apply(ctx, Plan{ + Files: []File{{Path: cert, Bytes: []byte(testCert1)}}, + }) + if err == nil || !errors.Is(err, context.Canceled) { + t.Errorf("got %v, want context.Canceled", err) + } + if _, statErr := os.Stat(cert); statErr == nil { + t.Error("file was created despite cancelled context") + } +} + +// TestApply_NoBackupRetention_DisablesBackups pins +// BackupRetention = -1 sentinel: no backup created; rollback +// becomes impossible. +func TestApply_NoBackupRetention_DisablesBackups(t *testing.T) { + dir := t.TempDir() + cert := filepath.Join(dir, "tls.crt") + if err := os.WriteFile(cert, []byte("ORIGINAL"), 0644); err != nil { + t.Fatal(err) + } + plan := Plan{ + Files: []File{{Path: cert, Bytes: []byte(testCert1)}}, + BackupRetention: -1, + } + if _, err := Apply(context.Background(), plan); err != nil { + t.Fatalf("Apply: %v", err) + } + entries, _ := os.ReadDir(dir) + for _, e := range entries { + if strings.Contains(e.Name(), BackupSuffix) { + t.Errorf("backup created despite BackupRetention=-1: %s", e.Name()) + } + } +} + +// TestAtomicWriteFile_HappyPath_ReplacesExistingAtomically covers +// the simple AtomicWriteFile path used by F5 + K8s connectors. 
+func TestAtomicWriteFile_HappyPath_ReplacesExistingAtomically(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "f") + if err := os.WriteFile(path, []byte("OLD"), 0644); err != nil { + t.Fatal(err) + } + res, err := AtomicWriteFile(context.Background(), path, []byte("NEW"), WriteOptions{}) + if err != nil { + t.Fatalf("AtomicWriteFile: %v", err) + } + if !res.Replaced { + t.Error("Replaced=false; want true") + } + if res.BackupPath == "" { + t.Error("expected non-empty BackupPath") + } + if got, _ := os.ReadFile(path); string(got) != "NEW" { + t.Errorf("file = %q, want NEW", got) + } +} + +// TestAtomicWriteFile_IdempotentSkip covers the AtomicWriteFile +// SHA-256 skip — same coverage as Plan.Apply but for the lower- +// level entry point used by F5/K8s. +func TestAtomicWriteFile_IdempotentSkip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "f") + if err := os.WriteFile(path, []byte("SAME"), 0644); err != nil { + t.Fatal(err) + } + res, err := AtomicWriteFile(context.Background(), path, []byte("SAME"), WriteOptions{}) + if err != nil { + t.Fatalf("AtomicWriteFile: %v", err) + } + if !res.Idempotent { + t.Error("Idempotent=false; want true") + } + if res.Replaced { + t.Error("Replaced=true on idempotent skip; want false") + } +} + +// TestAtomicWriteFile_RejectsEmptyPath pins the input validation. +func TestAtomicWriteFile_RejectsEmptyPath(t *testing.T) { + _, err := AtomicWriteFile(context.Background(), "", []byte("x"), WriteOptions{}) + if !errors.Is(err, ErrPlanInvalid) { + t.Errorf("got %v, want ErrPlanInvalid", err) + } +} + +// TestPruneBackups_NoOp_WhenUnderRetention pins the early return +// when there are fewer backups than the retention bar. +func TestPruneBackups_NoOp_WhenUnderRetention(t *testing.T) { + dir := t.TempDir() + abs := filepath.Join(dir, "f") + // Create two backup-style files. 
+ os.WriteFile(abs+BackupSuffix+"0000000000000000001", []byte("a"), 0644) + os.WriteFile(abs+BackupSuffix+"0000000000000000002", []byte("b"), 0644) + if err := pruneBackups(abs, 5); err != nil { + t.Fatal(err) + } + entries, _ := os.ReadDir(dir) + count := 0 + for _, e := range entries { + if strings.Contains(e.Name(), BackupSuffix) { + count++ + } + } + if count != 2 { + t.Errorf("count = %d, want 2 (no pruning under retention)", count) + } +} + +// TestLookupUID_Numeric covers the "numeric passthrough" branch +// of lookupUID — agents can configure with either "nginx" or "1000". +func TestLookupUID_Numeric(t *testing.T) { + uid, err := lookupUID("12345") + if err != nil { + t.Fatal(err) + } + if uid != 12345 { + t.Errorf("uid = %d, want 12345", uid) + } +} + +// TestLookupGID_Numeric mirror. +func TestLookupGID_Numeric(t *testing.T) { + gid, err := lookupGID("54321") + if err != nil { + t.Fatal(err) + } + if gid != 54321 { + t.Errorf("gid = %d, want 54321", gid) + } +} + +// TestSHA256Eq_EdgeCases pins the helper used by the idempotency +// short-circuit. +func TestSHA256Eq_EdgeCases(t *testing.T) { + if !sha256Eq([]byte{}, []byte{}) { + t.Error("empty == empty failed") + } + if sha256Eq([]byte("a"), []byte("b")) { + t.Error("a == b unexpectedly true") + } + if sha256Eq([]byte("ab"), []byte("ac")) { + t.Error("ab == ac unexpectedly true") + } + if !sha256Eq([]byte("abc"), []byte("abc")) { + t.Error("abc == abc failed") + } +} diff --git a/internal/deploy/doc.go b/internal/deploy/doc.go new file mode 100644 index 0000000..8dc48c7 --- /dev/null +++ b/internal/deploy/doc.go @@ -0,0 +1,69 @@ +// Package deploy provides the shared atomic-write + validate + rollback +// primitive consumed by every target connector under +// internal/connector/target/*. 
+// +// The deploy package closes the three procurement-checklist items where +// commercial competitors (Venafi, DigiCert Certificate Manager, Sectigo) +// historically beat certctl on a head-to-head deployment-grade +// comparison: +// +// 1. Atomic deploy with rollback — every file write is "all or nothing". +// A connector can never leave a target in a half-deployed state where +// the cert is updated but the chain isn't (or vice versa). Ships via +// Plan + Apply: temp-write all files together, run validate, atomic +// rename them all, run reload; on reload failure restore previous +// bytes + reload again. +// 2. Post-deploy TLS verification — the Apply caller wires its own +// PostCommit to do a TLS handshake against the target endpoint and +// compare the leaf-cert SHA-256 against what was just written. The +// deploy package surfaces the rollback wire when PostCommit fails; +// the connector decides what failure means. +// 3. (Vendor-specific deployment recipes — out of scope for the deploy +// package; covered in Bundle II.) +// +// Design tenets — all load-bearing for 13 connectors: +// +// - All-or-nothing across files. A Plan with N File entries either +// succeeds for all N or rolls back all N. No "two of three written" +// intermediate states are possible from a successful or failed Apply. +// - Cross-filesystem safety. Temp files always live in the same +// directory as the final destination, so os.Rename is guaranteed +// atomic on POSIX (a rename within the same filesystem). Writing +// temp files in /tmp could put them on a different filesystem, +// where os.Rename fails (EXDEV) instead of renaming atomically and +// the only alternative is a non-atomic copy. +// - Idempotency. If every File's destination already has identical +// bytes (SHA-256 match), Apply returns SkippedAsIdempotent=true and +// calls neither PreCommit nor PostCommit. Defends against agent +// restart retry storms that would otherwise hammer the target with +// no-op reloads. +// - Ownership + mode preservation.
The single most common +// silent-failure mode in cert deploys is the agent running as root +// calling os.WriteFile(path, bytes, 0600), which clobbers the +// existing nginx:nginx 0640 ownership and locks NGINX out of the +// key file. Apply preserves the existing destination's +// owner+group+mode unless the per-target config overrides; for new +// files it falls back to per-target-type defaults (e.g. nginx:nginx +// 0640). +// - Per-file serialization. The package keeps a sync.Map of file-level +// mutexes so two concurrent Apply calls touching the same path +// serialize. (Per-target serialization is Phase 2's job in the +// agent dispatch; this is a finer-grained file-level guard.) +// - Backup retention. Each successful write copies the previous bytes +// to <path>.certctl-bak.<unix-nanos>. A janitor prunes to the last +// N backups (default 3, configurable via Plan.BackupRetention or +// the CERTCTL_DEPLOY_BACKUP_RETENTION env var the agent passes in). +// Leaving retention at 0 selects the default; setting it to -1 +// disables backups entirely — rollback becomes impossible; +// documented as a foot-gun. +// +// Origin: this package was created in the deploy-hardening I master +// bundle (Phase 1) as the load-bearing replacement for the duplicated +// os.WriteFile flows in 13 connectors. The Apply API mirrors the F5 +// transaction model already at internal/connector/target/f5/f5.go:267 +// — F5 was the only connector with rollback semantics before this +// bundle. Apply lifts that pattern up so every other connector gets +// the same atomicity bar without re-implementing it. +// +// Concurrency: every exported function is safe for concurrent callers. +// File-level serialization is automatic via the package-internal +// sync.Map of mutexes; callers do not need their own per-file lock.
// ---- internal/deploy/ownership.go ----

// resolvedOwnership describes the final (mode, uid, gid) to apply
// to a destination file. Mode, owner and group are each resolved
// independently with the precedence:
//
//  1. Explicit File.Mode/Owner/Group → use as given
//  2. Existing destination file → preserve that file's mode/uid/gid
//  3. Plan.Defaults → use as fallback
//  4. Nothing set → ModeSet=false / id=-1, which applyOwnership
//     treats as "leave the file as it was created"
//
// UID and GID may be -1 independently: a -1 half is simply not
// changed when the other half is applied.
type resolvedOwnership struct {
	Mode       os.FileMode
	UID        int // -1 = do not change the owner
	GID        int // -1 = do not change the group
	ModeSet    bool
	OwnerLabel string // best-effort string for diagnostics ("" if unknown)
	GroupLabel string // best-effort string for diagnostics ("" if unknown)
}

// resolveOwnership computes the final mode/uid/gid for a file.
// existingStat is nil when the destination does not exist.
//
// Owner and group are resolved per field. When the File sets only one
// of Owner/Group, that half is honoured and the other half is taken
// from the existing destination (if any) or, for a new file, from the
// matching field of defaults; if neither is available it stays -1.
// When the File sets neither, the existing file's uid/gid are
// preserved, or — for a new file — defaults are used, but only when
// defaults names both an owner and a group.
//
// An error is returned only when a named owner or group cannot be
// resolved to a numeric id.
func resolveOwnership(file File, defaults FileDefaults, existingStat os.FileInfo) (resolvedOwnership, error) {
	res := resolvedOwnership{UID: -1, GID: -1}

	// Mode resolution. With no source at all, Mode stays 0 and
	// ModeSet stays false so applyOwnership skips the chmod.
	switch {
	case file.Mode != 0:
		res.Mode, res.ModeSet = file.Mode, true
	case existingStat != nil:
		res.Mode, res.ModeSet = existingStat.Mode().Perm(), true
	case defaults.Mode != 0:
		res.Mode, res.ModeSet = defaults.Mode, true
	}

	// Owner / group resolution.
	owner, group := file.Owner, file.Group
	switch {
	case owner != "" && group != "":
		// Full explicit override; resolved below.
	case owner == "" && group == "" && existingStat != nil:
		// No override: preserve whatever the destination has now.
		if uid, gid, ok := unixOwnerFromStat(existingStat); ok {
			res.UID, res.GID = uid, gid
			res.OwnerLabel, res.GroupLabel = usernameForUID(uid), groupNameForGID(gid)
		}
		return res, nil
	case owner == "" && group == "":
		// New file, no override: defaults apply only as a pair.
		if defaults.Owner == "" || defaults.Group == "" {
			return res, nil
		}
		owner, group = defaults.Owner, defaults.Group
	case existingStat == nil:
		// Half-specified override on a new file: complete the
		// missing half from the defaults (may still be empty).
		if owner == "" {
			owner = defaults.Owner
		} else {
			group = defaults.Group
		}
	}

	uid, gid := -1, -1
	ownerLabel, groupLabel := "", ""

	// Half-specified override on an existing file: the half the
	// File did not name keeps the destination's current value.
	if existingStat != nil && (owner == "" || group == "") {
		if curUID, curGID, ok := unixOwnerFromStat(existingStat); ok {
			if owner == "" {
				uid, ownerLabel = curUID, usernameForUID(curUID)
			}
			if group == "" {
				gid, groupLabel = curGID, groupNameForGID(curGID)
			}
		}
	}

	if owner != "" {
		id, err := lookupUID(owner)
		if err != nil {
			return res, fmt.Errorf("resolve owner %q: %w", owner, err)
		}
		uid, ownerLabel = id, owner
	}
	if group != "" {
		id, err := lookupGID(group)
		if err != nil {
			return res, fmt.Errorf("resolve group %q: %w", group, err)
		}
		gid, groupLabel = id, group
	}

	res.UID, res.GID = uid, gid
	res.OwnerLabel, res.GroupLabel = ownerLabel, groupLabel
	return res, nil
}

// usernameForUID returns the account name for uid, or "" when the
// lookup fails (e.g. the user was deleted between deploys). Used for
// diagnostics only, so failure is not an error.
func usernameForUID(uid int) string {
	u, err := user.LookupId(strconv.Itoa(uid))
	if err != nil {
		return ""
	}
	return u.Username
}

// groupNameForGID returns the group name for gid, or "" when the
// lookup fails. Used for diagnostics only.
func groupNameForGID(gid int) string {
	g, err := user.LookupGroupId(strconv.Itoa(gid))
	if err != nil {
		return ""
	}
	return g.Name
}

// applyOwnership applies the resolved (mode, uid, gid) to path.
//
// The chmod runs only when res.ModeSet is true; the chown runs when
// at least one of res.UID / res.GID is non-negative (a negative half
// is passed to os.Chown as -1, which leaves that half unchanged).
//
// Neither step is best-effort: the first chmod or chown failure is
// wrapped and returned, and it is up to the caller to decide whether
// to abort or continue. In particular, running as a non-root user and
// asking for a different owner will surface EPERM here.
//
// Connectors that need to verify the final on-disk ownership (e.g.
// compliance audits) can additionally stat the file in their
// PostCommit hook.
func applyOwnership(path string, res resolvedOwnership) error {
	if res.ModeSet {
		if err := os.Chmod(path, res.Mode); err != nil {
			return fmt.Errorf("chmod %s to %#o: %w", path, res.Mode, err)
		}
	}
	if res.UID >= 0 || res.GID >= 0 {
		// Normalise any negative half to the -1 "leave unchanged"
		// value that os.Chown understands.
		uid, gid := res.UID, res.GID
		if uid < 0 {
			uid = -1
		}
		if gid < 0 {
			gid = -1
		}
		if err := os.Chown(path, uid, gid); err != nil {
			return fmt.Errorf("chown %s to %d:%d: %w", path, res.UID, res.GID, err)
		}
	}
	return nil
}

// lookupUID resolves a username to a numeric uid. Accepts numeric
// strings ("1000") as a passthrough so the agent can accept either
// "nginx" or "1000" in operator config. Empty and negative numeric
// inputs are rejected: a negative id would otherwise be mistaken for
// the internal "do not chown" marker and silently skip the chown.
func lookupUID(username string) (int, error) {
	if username == "" {
		return -1, errors.New("empty username")
	}
	if uid, err := strconv.Atoi(username); err == nil {
		if uid < 0 {
			return -1, fmt.Errorf("negative uid %q", username)
		}
		return uid, nil
	}
	u, err := user.Lookup(username)
	if err != nil {
		return -1, err
	}
	uid, err := strconv.Atoi(u.Uid)
	if err != nil {
		return -1, fmt.Errorf("user %q has non-numeric uid %q: %w", username, u.Uid, err)
	}
	return uid, nil
}

// lookupGID resolves a group name to a numeric gid. Accepts numeric
// strings as a passthrough; empty and negative numeric inputs are
// rejected for the same reason as in lookupUID.
func lookupGID(groupname string) (int, error) {
	if groupname == "" {
		return -1, errors.New("empty groupname")
	}
	if gid, err := strconv.Atoi(groupname); err == nil {
		if gid < 0 {
			return -1, fmt.Errorf("negative gid %q", groupname)
		}
		return gid, nil
	}
	g, err := user.LookupGroup(groupname)
	if err != nil {
		return -1, err
	}
	gid, err := strconv.Atoi(g.Gid)
	if err != nil {
		return -1, fmt.Errorf("group %q has non-numeric gid %q: %w", groupname, g.Gid, err)
	}
	return gid, nil
}

// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
// It returns (-1, -1, false) when fi is nil or when fi.Sys() is not a
// *syscall.Stat_t (i.e. the underlying stat does not expose uid/gid).
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
	if fi == nil {
		return -1, -1, false
	}
	st, isUnix := fi.Sys().(*syscall.Stat_t)
	if !isUnix {
		return -1, -1, false
	}
	return int(st.Uid), int(st.Gid), true
}

// ---- internal/deploy/types.go ----

// Sentinel errors. Each failure mode documented below is reported by
// wrapping the corresponding sentinel, so connector callers can use
// errors.Is to distinguish it without parsing strings. Not every
// error from Apply wraps one: context cancellation, path resolution
// and ownership resolution errors are returned without a sentinel.
var (
	// ErrValidateFailed is returned when the Plan's PreCommit hook
	// returns an error. Connectors typically map PreCommit to a
	// validate-with-the-target command (`nginx -t -c <temp-path>`,
	// `apachectl configtest -f <temp-path>`, `haproxy -c -f <temp-path>`).
	// On ErrValidateFailed, no live file has been touched: the temp
	// files are cleaned up and the destinations are exactly as they
	// were before Apply was called.
	ErrValidateFailed = errors.New("deploy: validate (PreCommit) failed")

	// ErrReloadFailed is returned when the Plan's PostCommit hook
	// returns an error AND the rollback succeeded. The destination
	// files now hold the PREVIOUS bytes (restored from backup) and
	// PostCommit was re-called against those bytes. The deploy is
	// effectively a no-op from the operator's perspective.
	ErrReloadFailed = errors.New("deploy: reload (PostCommit) failed; rolled back")

	// ErrRollbackFailed is the operator-actionable escalation:
	// PostCommit failed, AND the rollback (restore + re-PostCommit)
	// also failed. The deploy is in a known-bad state. Manual
	// intervention is required to either restore the backup files
	// (paths in Result.BackupPaths) or push a fresh known-good
	// cert. Connectors emit a loud audit + alert when they see this.
	ErrRollbackFailed = errors.New("deploy: reload failed AND rollback also failed; manual intervention required")

	// ErrPlanInvalid is returned for malformed Plans (no Files,
	// duplicate destination paths, empty Path entries, etc.) before
	// any I/O is performed. Strictly a programming error from the
	// connector — never seen in production once the connector unit
	// tests pass.
	ErrPlanInvalid = errors.New("deploy: plan is invalid")
)

// File describes one target file that Apply will write.
//
// When Mode is zero, the existing destination's mode is preserved if
// the destination exists; otherwise Plan.Defaults.Mode applies. Same
// for Owner / Group. This means connectors can ship a Plan with
// File{Path: ..., Bytes: ...} entries (no explicit ownership) and
// the package will Do The Right Thing — preserve nginx:nginx 0640 on
// renewal, fall back to per-target defaults on first deploy.
type File struct {
	// Path is the final destination on disk. Must be an absolute
	// path. The temp file used during atomic write is written in
	// filepath.Dir(Path) to guarantee same-filesystem rename.
	Path string

	// Bytes is the new contents to write.
	Bytes []byte

	// Mode is the desired final file mode. Zero means "preserve
	// existing or use Plan.Defaults.Mode for new files".
	Mode os.FileMode

	// Owner is the username (or numeric uid string) to chown to.
	// Empty means "preserve existing or use Plan.Defaults.Owner for
	// new files". Resolved at write time via os/user.Lookup.
	Owner string

	// Group is the group name (or numeric gid string) to chgrp to.
	// Empty means "preserve existing or use Plan.Defaults.Group for
	// new files". Resolved via os/user.LookupGroup.
	Group string
}

// FileDefaults applies to any File whose own Mode/Owner/Group is
// zero AND whose destination does not yet exist. Connectors set
// these to per-target-type sensible defaults (e.g. NGINX:
// {Mode: 0640, Owner: "nginx", Group: "nginx"}).
type FileDefaults struct {
	Mode  os.FileMode
	Owner string
	Group string
}

// Plan represents one atomic deployment. All Files succeed together
// or roll back together.
type Plan struct {
	// Files is the set of (path, contents, ownership) entries this
	// Plan writes. Order is irrelevant — Apply writes them all
	// before calling PreCommit, and atomically renames them all
	// before calling PostCommit.
	Files []File

	// Defaults applies to any File entry whose own Mode/Owner/Group
	// fields are zero AND whose destination does not yet exist.
	// When the destination already exists, the existing
	// ownership/mode is preserved unless the File entry overrides.
	Defaults FileDefaults

	// PreCommit is invoked after all temp files are written but
	// BEFORE the atomic rename. The map argument is keyed by
	// File.Path → temp file path so the connector can run a
	// validate-with-the-target command against the temp file
	// (e.g. `nginx -t -c <temp-path>`). Returning a non-nil error
	// aborts the deploy: the temp files are cleaned up and Apply
	// returns ErrValidateFailed wrapping the PreCommit error.
	//
	// Optional. nil PreCommit means "no validate step" — Apply
	// proceeds straight to the atomic rename + PostCommit.
	PreCommit func(ctx context.Context, tempPaths map[string]string) error

	// PostCommit is invoked after every File has been atomically
	// renamed to its final path. Connectors typically map this to
	// a service reload (`nginx -s reload`, `systemctl reload
	// haproxy`). Returning a non-nil error triggers automatic
	// rollback: the destinations are restored from the pre-deploy
	// backups and PostCommit is called a second time against the
	// restored bytes. If the second PostCommit also fails, Apply
	// returns ErrRollbackFailed.
	//
	// Optional. nil PostCommit means "no reload step" — Apply
	// returns immediately after the atomic rename.
	PostCommit func(ctx context.Context) error

	// BackupRetention is the number of historical backups to keep
	// per File path after a successful Apply. Older backups are
	// garbage-collected by a synchronous janitor pass at the end
	// of Apply.
	//
	// Zero (the field default) maps to DefaultBackupRetention (3).
	// Set to a sentinel negative value (-1) to disable backups
	// entirely — rollback becomes impossible; ErrReloadFailed is
	// instead surfaced as a hard error with no recovery.
	BackupRetention int

	// SkipIdempotent forces Apply to run PreCommit + PostCommit
	// even when every File's bytes already match the destination.
	// Useful when the connector knows an external configuration
	// change requires re-validation. Defaults to false (skip on
	// SHA-256 match — the safe and usual case).
	SkipIdempotent bool
}

// Result describes what Apply did. Connectors populate audit logs
// and Prometheus counters from this.
type Result struct {
	// SkippedAsIdempotent is true when every File's destination
	// already had identical bytes and SkipIdempotent was false.
	// PreCommit and PostCommit were NOT called. BackupPaths is
	// empty in this case — no backups are created for a no-op.
	SkippedAsIdempotent bool

	// BackupPaths maps each File.Path to the path of the backup
	// of the previous contents. When a destination did not exist
	// before Apply, the entry maps to "" (no backup possible).
	// Empty when SkippedAsIdempotent is true.
	BackupPaths map[string]string

	// ValidateOK is true when PreCommit returned nil (or was nil
	// to begin with).
	ValidateOK bool

	// Reloaded is true when PostCommit returned nil (or was nil)
	// AND no rollback occurred.
	Reloaded bool

	// RolledBack is true when PostCommit failed AND the rollback
	// succeeded. ErrReloadFailed will be returned alongside.
	RolledBack bool

	// Duration is the wall-clock time Apply took, including
	// PreCommit + PostCommit + (if applicable) rollback.
	Duration time.Duration
}

// WriteOptions controls AtomicWriteFile, the lower-level building
// block exposed for connectors that don't fit the Plan model
// (typically connectors that ship bytes through a remote API rather
// than a local filesystem — F5, K8s).
type WriteOptions struct {
	// Mode is the desired final file mode. Zero = preserve
	// existing or use DefaultMode for new files.
	Mode os.FileMode

	// DefaultMode applies when Mode is zero AND the destination
	// does not yet exist.
	DefaultMode os.FileMode

	// Owner / Group: empty = preserve existing or use
	// DefaultOwner/Group for new files.
	Owner        string
	Group        string
	DefaultOwner string
	DefaultGroup string

	// SkipIdempotent forces a write even when the destination
	// already has identical bytes. Defaults to false.
	SkipIdempotent bool

	// BackupRetention controls how many historical backups to
	// keep. Zero = DefaultBackupRetention (3); -1 = no backups.
	BackupRetention int
}

// WriteResult describes what AtomicWriteFile did.
type WriteResult struct {
	// Path is the final destination (echoed for caller convenience).
	Path string

	// BackupPath is the path to the pre-write backup, or "" when
	// no backup was taken (file did not exist or backups disabled
	// or write was idempotent-skipped).
	BackupPath string

	// Replaced is true when an existing file was replaced. False
	// when the file did not previously exist OR the write was
	// idempotent-skipped.
	Replaced bool

	// Idempotent is true when the destination already had
	// identical bytes and SkipIdempotent was false. No write
	// occurred in this case.
	Idempotent bool
}

// DefaultBackupRetention is the number of historical backup files
// kept per File path after a successful Apply (or
// AtomicWriteFile call). Operators can override per-call via
// Plan.BackupRetention or via the CERTCTL_DEPLOY_BACKUP_RETENTION
// env var that the agent passes in.
const DefaultBackupRetention = 3

// BackupSuffix is the suffix used for pre-write backup files.
// Format: <path>.certctl-bak.<unix-nanos>. The unix-nanos is
// monotonic enough for retention sort order (lexicographic =
// chronological) without needing per-file metadata.
const BackupSuffix = ".certctl-bak."

// TempSuffix is the suffix used for in-flight temp files: the temp
// name is the destination path followed by this suffix and a
// per-write trailing component (its exact form is defined by the
// writer, not here). Cleaned up on PreCommit failure or on Apply
// panic.
const TempSuffix = ".certctl-tmp."