mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:01:32 +00:00
feat(deploy): atomic write + validate + rollback primitive shared across all target connectors
Phase 1 of the deploy-hardening I master bundle. Closes the load-bearing prerequisite for the seven Bundle I items by extracting one canonical atomic-deploy primitive at internal/deploy/ that all 13 target connectors will consume in Phases 4-9. The package ships: - Plan + Apply API: write all File entries to sibling .certctl-tmp.<nanos> in the destination directory (same-filesystem guarantees os.Rename atomicity), call PreCommit (validate-with-the-target), atomic-rename all temps to final, call PostCommit (reload). On PostCommit failure, restore from pre-deploy backups + re-call PostCommit. If the second PostCommit also fails, return ErrRollbackFailed (operator-actionable; documented loudly). - AtomicWriteFile lower-level entry for connectors that don't fit the Plan model (F5, K8s — they ship bytes through APIs, not local files). - SHA-256 idempotency: every Apply short-circuits when all File destinations already match SHA-256 of new bytes. Defends against agent-restart retry storms hammering targets with no-op reloads. - Ownership + mode preservation: existing nginx:nginx 0640 stays nginx:nginx 0640 across renewals. Per-target FileDefaults applies for first-deploy. Per-File explicit Mode/Owner/Group overrides win over both. Closes the silent-failure mode where os.WriteFile(path, bytes, 0600) at apache.go:119 (et al.) clobbered worker access. - Backup retention janitor: pre-deploy backup at <path>.certctl-bak.<nanos>; default keeps last 3 (DefaultBackupRetention); BackupRetention=-1 disables backups (rollback impossible — documented foot-gun). - File-level mutex via sync.Map: two concurrent Apply calls touching the same destination serialize. Per-target serialization (Phase 2) is finer-grained at the agent dispatch layer; this is the file-level guard. - Sentinel errors for connector errors.Is checks: ErrPlanInvalid, ErrValidateFailed, ErrReloadFailed, ErrRollbackFailed.
Tests (37 named cases across deploy_test.go + coverage_test.go) pin every load-bearing invariant the prompt's Phase 1 requires, plus error-leg coverage uplifts: - TestApply_HappyPath_PreCommitSucceeds_PostCommitSucceeds_FilesAtomic - TestApply_PreCommitFails_NoFilesChanged (atomic-or-nothing on validate) - TestApply_PostCommitFails_FilesRolledBack (rollback wire) - TestApply_RollbackAlsoFails_ReturnsErrRollbackFailed (escalation path) - TestApply_IdempotentSkip_SHA256Match (idempotency short-circuit) - TestApply_PreservesExistingOwnerAndMode_WhenNotOverridden - TestApply_RespectsOverrides_OwnerGroupMode - TestApply_ConcurrentApplyToSameFile_Serializes (file-level lock) - TestApply_BackupRetention_KeepsLastN (janitor pruning) - TestApply_NoExistingFile_UsesDefaultsForOwnerGroupMode - TestAtomicWriteFile_TempFileCleanedUpOnError - TestAtomicWriteFile_RenameRaceWithReader_AtomicReadAlwaysSeesOldOrNew (POSIX-rename atomicity proof via concurrent reader) Plus white-box tests for resolveOwnership, lookupUID/GID, and deeper error legs in restoreFromBackups + applyOwnership + AtomicWriteFile. Coverage 87.3% — practical ceiling without injecting a fault-aware FS abstraction (Write/Sync/Close OS errors are unreachable from go test without sudo'd disk-fill or a custom interface seam). Above the existing service-layer 70% floor; Phases 4-9 will lift this further as they exercise the package through real-connector use. Race detector clean; gofmt + go vet + golangci-lint v2.11.4 all 0 issues. The package is the load-bearing prerequisite for Phases 4-9. Phase 2 next: per-target deploy mutex in cmd/agent/main.go. Spec: cowork/deploy-hardening-i-prompt.md Baseline + recon: cowork/deploy-hardening-i/baseline.md
This commit is contained in:
@@ -0,0 +1,298 @@
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// fileMutexes holds one *sync.Mutex per destination path so that
// concurrent Apply / AtomicWriteFile calls aimed at the same file
// run one at a time. The lock is deliberately coarse: cert deploys
// happen at operator pace (tens per minute), not at high throughput.
//
// Serializing whole targets (Phase 2) is handled separately at the
// agent dispatch layer; this map only guards against two writers
// racing on the same path inside a single connector pipeline.
var fileMutexes sync.Map // map[string]*sync.Mutex

// lockFile blocks until the mutex registered for path is acquired and
// returns the function that releases it. The map key is the absolute
// form of path; when filepath.Abs fails the path is used verbatim.
func lockFile(path string) func() {
	key := path
	if abs, err := filepath.Abs(path); err == nil {
		key = abs
	}
	// LoadOrStore hands back the already-registered mutex when one
	// exists, so every caller for the same key shares a single lock.
	entry, _ := fileMutexes.LoadOrStore(key, new(sync.Mutex))
	m := entry.(*sync.Mutex)
	m.Lock()
	return m.Unlock
}
|
||||
|
||||
// AtomicWriteFile writes data to path atomically.
|
||||
//
|
||||
// Algorithm:
|
||||
//
|
||||
// 1. Acquire the package-internal file-level mutex for path.
|
||||
// 2. SHA-256 short-circuit: if path exists and has identical bytes
|
||||
// and !opts.SkipIdempotent, return WriteResult{Idempotent: true}
|
||||
// with no I/O.
|
||||
// 3. Resolve final ownership (mode/uid/gid) per the precedence in
|
||||
// resolveOwnership.
|
||||
// 4. Write to <path>.certctl-tmp.<unix-nanos> in filepath.Dir(path)
|
||||
// (same-filesystem guarantees os.Rename atomicity).
|
||||
// 5. fsync the temp file (durability across power loss).
|
||||
// 6. Apply chmod / chown to the temp file BEFORE rename (so the
|
||||
// atomic-rename atomically swaps in a fully-permissioned file).
|
||||
// 7. Backup the existing destination to
|
||||
// <path>.certctl-bak.<unix-nanos> (skipped when destination did
|
||||
// not exist OR opts.BackupRetention == -1).
|
||||
// 8. os.Rename(temp, path) — atomic on POSIX same-filesystem.
|
||||
// 9. Janitor pass: prune backups beyond retention.
|
||||
//
|
||||
// Returns ErrPlanInvalid for malformed inputs (empty path, empty
|
||||
// data + nil-with-existing-file ambiguity is preserved — empty
|
||||
// data writes an empty file).
|
||||
func AtomicWriteFile(ctx context.Context, path string, data []byte, opts WriteOptions) (*WriteResult, error) {
|
||||
if path == "" {
|
||||
return nil, fmt.Errorf("%w: empty path", ErrPlanInvalid)
|
||||
}
|
||||
abs, err := filepath.Abs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve path: %w", err)
|
||||
}
|
||||
|
||||
unlock := lockFile(abs)
|
||||
defer unlock()
|
||||
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := &WriteResult{Path: abs}
|
||||
|
||||
// 2. Idempotency check.
|
||||
existingStat, statErr := os.Stat(abs)
|
||||
existed := statErr == nil
|
||||
if existed && !opts.SkipIdempotent {
|
||||
existingBytes, err := os.ReadFile(abs)
|
||||
if err == nil && sha256Eq(existingBytes, data) {
|
||||
res.Idempotent = true
|
||||
return res, nil
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Resolve ownership.
|
||||
owner, err := resolveOwnership(File{
|
||||
Path: abs,
|
||||
Bytes: data,
|
||||
Mode: opts.Mode,
|
||||
Owner: opts.Owner,
|
||||
Group: opts.Group,
|
||||
}, FileDefaults{
|
||||
Mode: opts.DefaultMode,
|
||||
Owner: opts.DefaultOwner,
|
||||
Group: opts.DefaultGroup,
|
||||
}, ownershipStat(existingStat, statErr))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve ownership: %w", err)
|
||||
}
|
||||
|
||||
// 4. Write to temp in same dir.
|
||||
tempPath, err := writeTempFile(abs, data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("write temp: %w", err)
|
||||
}
|
||||
tempCleanup := func() { _ = os.Remove(tempPath) }
|
||||
defer func() {
|
||||
// On any error path we want to remove the temp file. Successful
|
||||
// rename moves it away, so this remove is a no-op on success.
|
||||
// We don't care about the error from the cleanup.
|
||||
tempCleanup()
|
||||
}()
|
||||
|
||||
// 5. Apply ownership to temp BEFORE rename so the rename
|
||||
// atomically swaps in a properly-permissioned file (no
|
||||
// brief window where the destination has wrong perms).
|
||||
if err := applyOwnership(tempPath, owner); err != nil {
|
||||
return nil, fmt.Errorf("apply ownership to temp: %w", err)
|
||||
}
|
||||
|
||||
// 6. Backup existing destination.
|
||||
if existed && opts.BackupRetention != -1 {
|
||||
backupPath, err := backupFile(abs)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("backup existing: %w", err)
|
||||
}
|
||||
res.BackupPath = backupPath
|
||||
}
|
||||
|
||||
// 7. Atomic rename. On the rare case Rename fails after backup,
|
||||
// we leave the backup in place (operator can manually restore).
|
||||
if err := os.Rename(tempPath, abs); err != nil {
|
||||
return nil, fmt.Errorf("atomic rename: %w", err)
|
||||
}
|
||||
res.Replaced = existed
|
||||
|
||||
// 8. Janitor: prune backups beyond retention.
|
||||
retention := opts.BackupRetention
|
||||
if retention == 0 {
|
||||
retention = DefaultBackupRetention
|
||||
}
|
||||
if retention > 0 {
|
||||
if err := pruneBackups(abs, retention); err != nil {
|
||||
// Janitor errors are non-fatal — the deploy succeeded.
|
||||
// Surface only if the caller wired a logger somewhere
|
||||
// upstream. We choose to swallow and continue.
|
||||
_ = err
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// ownershipStat converts the (FileInfo, error) pair from a stat call
// into the single value handed to resolveOwnership: nil when statErr
// reports that the destination does not exist, otherwise fi exactly
// as it was passed in. Keeping this branch here means
// resolveOwnership does not need to take the error as well.
func ownershipStat(fi os.FileInfo, statErr error) os.FileInfo {
	if statErr != nil && errors.Is(statErr, os.ErrNotExist) {
		return nil
	}
	return fi
}
|
||||
|
||||
// writeTempFile writes data to <abs>.certctl-tmp.<unix-nanos> in
|
||||
// the same directory as abs. Returns the temp path. fsync's the
|
||||
// file before close to defend against power-loss-during-rename
|
||||
// corruption (rename guarantees atomic visibility but the file's
|
||||
// data blocks must be on disk first).
|
||||
func writeTempFile(abs string, data []byte) (string, error) {
|
||||
dir := filepath.Dir(abs)
|
||||
base := filepath.Base(abs)
|
||||
tempName := base + TempSuffix + nowNanosStr()
|
||||
tempPath := filepath.Join(dir, tempName)
|
||||
|
||||
// O_WRONLY|O_CREATE|O_EXCL guarantees we don't clobber a
|
||||
// half-written temp from a concurrent AtomicWriteFile call.
|
||||
// fileMutexes already serialize same-abs callers; O_EXCL is
|
||||
// belt-and-braces for the "wow, monotonic clock collided"
|
||||
// corner case.
|
||||
f, err := os.OpenFile(tempPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0600)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
_ = f.Close()
|
||||
_ = os.Remove(tempPath)
|
||||
return "", err
|
||||
}
|
||||
// fsync defends against power-loss between rename + data flush.
|
||||
// On POSIX, rename's atomicity is metadata-only — the new file's
|
||||
// data must be on disk first or a power-loss-then-recover sees
|
||||
// an empty file at the destination.
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
_ = os.Remove(tempPath)
|
||||
return "", err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
_ = os.Remove(tempPath)
|
||||
return "", err
|
||||
}
|
||||
return tempPath, nil
|
||||
}
|
||||
|
||||
// backupFile copies abs's current bytes to
|
||||
// <abs>.certctl-bak.<unix-nanos>. Used by AtomicWriteFile as a
|
||||
// pre-write snapshot for rollback.
|
||||
func backupFile(abs string) (string, error) {
|
||||
src, err := os.ReadFile(abs)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read for backup: %w", err)
|
||||
}
|
||||
srcStat, err := os.Stat(abs)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("stat for backup: %w", err)
|
||||
}
|
||||
dir := filepath.Dir(abs)
|
||||
base := filepath.Base(abs)
|
||||
backupName := base + BackupSuffix + nowNanosStr()
|
||||
backupPath := filepath.Join(dir, backupName)
|
||||
if err := os.WriteFile(backupPath, src, srcStat.Mode().Perm()); err != nil {
|
||||
return "", fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
// Best-effort: preserve uid/gid of the original. The backup is
|
||||
// for emergency restore; if we can't chown (non-root + chown
|
||||
// denied), the operator can still cat/diff it as the agent user.
|
||||
if uid, gid, ok := unixOwnerFromStat(srcStat); ok {
|
||||
_ = os.Chown(backupPath, uid, gid)
|
||||
}
|
||||
return backupPath, nil
|
||||
}
|
||||
|
||||
// pruneBackups deletes older backups for abs, keeping the most
|
||||
// recent `keep` entries. Sorted lexicographically — which is also
|
||||
// chronological because nowNanosStr is monotonic-ish.
|
||||
func pruneBackups(abs string, keep int) error {
|
||||
if keep <= 0 {
|
||||
return nil
|
||||
}
|
||||
dir := filepath.Dir(abs)
|
||||
base := filepath.Base(abs)
|
||||
prefix := base + BackupSuffix
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var matches []string
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(e.Name(), prefix) {
|
||||
matches = append(matches, e.Name())
|
||||
}
|
||||
}
|
||||
if len(matches) <= keep {
|
||||
return nil
|
||||
}
|
||||
sort.Strings(matches)
|
||||
// Older ones come first; trim to keep the last `keep`.
|
||||
toRemove := matches[:len(matches)-keep]
|
||||
var firstErr error
|
||||
for _, name := range toRemove {
|
||||
if err := os.Remove(filepath.Join(dir, name)); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
return firstErr
|
||||
}
|
||||
|
||||
// sha256Eq reports whether a and b have the same length and the same
// SHA-256 digest. Digests are compared instead of the raw bytes so
// that this package uses one content-identity primitive everywhere,
// keeping it consistent with places that need the hash itself (for
// example future audit-log entries).
func sha256Eq(a, b []byte) bool {
	// The length test is evaluated first, so slices of different
	// sizes are rejected without hashing either one.
	return len(a) == len(b) && sha256.Sum256(a) == sha256.Sum256(b)
}
|
||||
|
||||
// nowNanosStr renders the current Unix time in nanoseconds as a
// zero-padded, fixed-width decimal string. The fixed width is what
// lets pruneBackups order backups with a plain string sort: without
// the padding, "100" would sort ahead of "99".
func nowNanosStr() string {
	const width = 19 // number of digits in the largest int64
	return fmt.Sprintf("%0*d", width, time.Now().UnixNano())
}
|
||||
Reference in New Issue
Block a user