mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 17:22:07 +00:00
f5c67a51b2
Phase 1 of the deploy-hardening I master bundle. Closes the load-bearing prerequisite for the seven Bundle I items by extracting one canonical atomic-deploy primitive at internal/deploy/ that all 13 target connectors will consume in Phases 4-9. The package ships: - Plan + Apply API: write all File entries to sibling .certctl-tmp.<nanos> in the destination directory (same-filesystem guarantees os.Rename atomicity), call PreCommit (validate-with-the-target), atomic-rename all temps to final, call PostCommit (reload). On PostCommit failure, restore from pre-deploy backups + re-call PostCommit. If second PostCommit also fails, return ErrRollbackFailed (operator-actionable; documented loudly). - AtomicWriteFile lower-level entry for connectors that don't fit the Plan model (F5, K8s — they ship bytes through APIs, not local files). - SHA-256 idempotency: every Apply short-circuits when all File destinations already match SHA-256 of new bytes. Defends against agent-restart retry storms hammering targets with no-op reloads. - Ownership + mode preservation: existing nginx:nginx 0640 stays nginx:nginx 0640 across renewals. Per-target FileDefaults applies for first-deploy. Per-File explicit Mode/Owner/Group overrides win over both. Closes the silent-failure mode where os.WriteFile(path, bytes, 0600) at apache.go:119 (et al.) clobbered worker access. - Backup retention janitor: pre-deploy backup at <path>.certctl-bak.<nanos>; default keeps last 3 (DefaultBackupRetention); BackupRetention=-1 disables backups (rollback impossible — documented foot-gun). - File-level mutex via sync.Map: two concurrent Apply calls touching the same destination serialize. Per-target serialization (Phase 2) is finer-grained at the agent dispatch layer; this is the file-level guard. - Sentinel errors for connector errors.Is checks: ErrPlanInvalid, ErrValidateFailed, ErrReloadFailed, ErrRollbackFailed. 
Tests (37 named cases across deploy_test.go + coverage_test.go) pin every load-bearing invariant the prompt's Phase 1 requires, plus error-leg coverage uplifts: - TestApply_HappyPath_PreCommitSucceeds_PostCommitSucceeds_FilesAtomic - TestApply_PreCommitFails_NoFilesChanged (atomic-or-nothing on validate) - TestApply_PostCommitFails_FilesRolledBack (rollback wire) - TestApply_RollbackAlsoFails_ReturnsErrRollbackFailed (escalation path) - TestApply_IdempotentSkip_SHA256Match (idempotency short-circuit) - TestApply_PreservesExistingOwnerAndMode_WhenNotOverridden - TestApply_RespectsOverrides_OwnerGroupMode - TestApply_ConcurrentApplyToSameFile_Serializes (file-level lock) - TestApply_BackupRetention_KeepsLastN (janitor pruning) - TestApply_NoExistingFile_UsesDefaultsForOwnerGroupMode - TestAtomicWriteFile_TempFileCleanedUpOnError - TestAtomicWriteFile_RenameRaceWithReader_AtomicReadAlwaysSeesOldOrNew (POSIX-rename atomicity proof via concurrent reader) Plus white-box tests for resolveOwnership, lookupUID/GID, and deeper error legs in restoreFromBackups + applyOwnership + AtomicWriteFile. Coverage 87.3% — practical ceiling without injecting a fault-aware FS abstraction (Write/Sync/Close OS errors are unreachable from go test without sudo'd disk-fill or a custom interface seam). Above the existing service-layer 70% floor; Phases 4-9 will lift this further as they exercise the package through real-connector use. Race detector clean; gofmt + go vet + golangci-lint v2.11.4 all 0 issues. The package is the load-bearing prerequisite for Phases 4-9. Phase 2 next: per-target deploy mutex in cmd/agent/main.go. Spec: cowork/deploy-hardening-i-prompt.md Baseline + recon: cowork/deploy-hardening-i/baseline.md
186 lines
5.9 KiB
Go
186 lines
5.9 KiB
Go
package deploy
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"os/user"
|
|
"strconv"
|
|
"syscall"
|
|
)
|
|
|
|
// resolvedOwnership is the outcome of ownership resolution for one
// destination file: the permission bits plus the numeric owner and
// group that should end up on it.
//
// resolveOwnership fills it using this precedence, applied separately
// to the mode and to the owner/group pair:
//
//  1. Explicit File.Mode, or File.Owner together with File.Group.
//  2. The existing destination file's current mode / uid / gid.
//  3. The FileDefaults value (Mode, or Owner together with Group).
//  4. Nothing: ModeSet stays false and UID/GID stay -1, so
//     applyOwnership performs neither chmod nor chown.
type resolvedOwnership struct {
	// Mode holds the permission bits to apply. It is only meaningful
	// when ModeSet is true.
	Mode os.FileMode

	// UID is the numeric owner to chown to; -1 means "do not chown".
	UID int

	// GID is the numeric group to chown to; -1 means "do not chgrp".
	// applyOwnership only chowns when UID and GID are both >= 0.
	GID int

	// ModeSet reports whether Mode was resolved from any source and
	// should therefore be applied with chmod.
	ModeSet bool

	// OwnerLabel and GroupLabel are best-effort human-readable names
	// for diagnostics; they are "" when the name is unknown.
	OwnerLabel string
	GroupLabel string
}
|
|
|
|
// resolveOwnership computes the final mode/uid/gid for a file.
|
|
// existingStat is nil when the destination does not exist.
|
|
func resolveOwnership(file File, defaults FileDefaults, existingStat os.FileInfo) (resolvedOwnership, error) {
|
|
res := resolvedOwnership{UID: -1, GID: -1}
|
|
|
|
// Mode resolution.
|
|
switch {
|
|
case file.Mode != 0:
|
|
res.Mode = file.Mode
|
|
res.ModeSet = true
|
|
case existingStat != nil:
|
|
res.Mode = existingStat.Mode().Perm()
|
|
res.ModeSet = true
|
|
case defaults.Mode != 0:
|
|
res.Mode = defaults.Mode
|
|
res.ModeSet = true
|
|
default:
|
|
// Nothing to apply; AtomicWriteFile uses os.WriteFile's
|
|
// default 0644-ish for new files, preserves for existing.
|
|
res.Mode = 0
|
|
res.ModeSet = false
|
|
}
|
|
|
|
// Owner / group resolution.
|
|
owner, group := file.Owner, file.Group
|
|
switch {
|
|
case owner != "" && group != "":
|
|
// explicit override
|
|
case existingStat != nil:
|
|
// preserve existing — extract from sys-stat
|
|
uid, gid, ok := unixOwnerFromStat(existingStat)
|
|
if ok {
|
|
res.UID, res.GID = uid, gid
|
|
// Best-effort labels for logs (don't fail if user/group
|
|
// has been deleted from /etc/passwd between deploys).
|
|
if u, err := user.LookupId(strconv.Itoa(uid)); err == nil {
|
|
res.OwnerLabel = u.Username
|
|
}
|
|
if g, err := user.LookupGroupId(strconv.Itoa(gid)); err == nil {
|
|
res.GroupLabel = g.Name
|
|
}
|
|
}
|
|
return res, nil
|
|
case defaults.Owner != "" && defaults.Group != "":
|
|
owner, group = defaults.Owner, defaults.Group
|
|
default:
|
|
// No override, no existing file, no defaults — leave UID/GID
|
|
// at -1 so AtomicWriteFile skips the chown entirely.
|
|
return res, nil
|
|
}
|
|
|
|
uid, err := lookupUID(owner)
|
|
if err != nil {
|
|
return res, fmt.Errorf("resolve owner %q: %w", owner, err)
|
|
}
|
|
gid, err := lookupGID(group)
|
|
if err != nil {
|
|
return res, fmt.Errorf("resolve group %q: %w", group, err)
|
|
}
|
|
res.UID, res.GID = uid, gid
|
|
res.OwnerLabel, res.GroupLabel = owner, group
|
|
return res, nil
|
|
}
|
|
|
|
// applyOwnership applies the resolved (mode, uid, gid) to path.
|
|
// Both chown and chmod are best-effort: we attempt them, log
|
|
// warnings on failure, but do NOT fail the deploy. The agent runs
|
|
// as root in production; running as a regular user (CI / developer
|
|
// workstation) means chown to a different user fails with EPERM,
|
|
// which is expected and not actionable. The deploy semantically
|
|
// succeeded — only ownership lift was skipped.
|
|
//
|
|
// The "is this acceptable to silently swallow chown failure?"
|
|
// question is answered yes for two reasons:
|
|
// - In production (root agent), failures are real OS-level
|
|
// issues that show up in the audit log + Prometheus
|
|
// deploy_validate_failures_total counter.
|
|
// - In dev (non-root), failures are expected behavior; tests
|
|
// would otherwise need to be skipped or run with sudo.
|
|
//
|
|
// Connectors that NEED hard ownership enforcement (e.g. compliance
|
|
// audits) can wrap a stat-after-write check in their PostCommit.
|
|
func applyOwnership(path string, res resolvedOwnership) error {
|
|
if res.ModeSet {
|
|
if err := os.Chmod(path, res.Mode); err != nil {
|
|
return fmt.Errorf("chmod %s to %#o: %w", path, res.Mode, err)
|
|
}
|
|
}
|
|
if res.UID >= 0 && res.GID >= 0 {
|
|
if err := os.Chown(path, res.UID, res.GID); err != nil {
|
|
// EPERM in non-root contexts is expected. We surface
|
|
// the error to the caller, which decides whether to
|
|
// log + continue or hard-fail. Apply hard-fails the
|
|
// deploy on chown errors (the Plan asked for
|
|
// specific ownership; we couldn't deliver it; safer
|
|
// to roll back than to silently leave wrong perms).
|
|
return fmt.Errorf("chown %s to %d:%d: %w", path, res.UID, res.GID, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// lookupUID resolves an owner string to a numeric uid.
//
// Accepted forms:
//   - a non-negative decimal string ("1000"), returned as-is without
//     consulting the user database, so operator config may use either
//     "nginx" or "1000";
//   - a user name, resolved through user.Lookup.
//
// It returns -1 and a non-nil error for an empty string, for a
// negative numeric string, for a name user.Lookup cannot resolve, and
// for an account whose Uid field is not a decimal number.
func lookupUID(username string) (int, error) {
	if username == "" {
		return -1, errors.New("empty username")
	}
	if uid, err := strconv.Atoi(username); err == nil {
		// A negative id is never a real account, and negative values
		// are the "do not chown" marker used by applyOwnership.
		// Accepting "-1" here would silently turn an explicit owner
		// request into a no-op, so reject it loudly instead.
		if uid < 0 {
			return -1, fmt.Errorf("invalid numeric uid %q: must be non-negative", username)
		}
		return uid, nil
	}
	u, err := user.Lookup(username)
	if err != nil {
		return -1, err
	}
	uid, err := strconv.Atoi(u.Uid)
	if err != nil {
		return -1, fmt.Errorf("user %q has non-numeric uid %q: %w", username, u.Uid, err)
	}
	return uid, nil
}
|
|
|
|
// lookupGID resolves a group string to a numeric gid.
//
// Accepted forms:
//   - a non-negative decimal string ("1000"), returned as-is without
//     consulting the group database;
//   - a group name, resolved through user.LookupGroup.
//
// It returns -1 and a non-nil error for an empty string, for a
// negative numeric string, for a name user.LookupGroup cannot resolve,
// and for a group whose Gid field is not a decimal number.
func lookupGID(groupname string) (int, error) {
	if groupname == "" {
		return -1, errors.New("empty groupname")
	}
	if gid, err := strconv.Atoi(groupname); err == nil {
		// A negative id is never a real group, and negative values are
		// the "do not chgrp" marker used by applyOwnership. Accepting
		// "-1" here would silently turn an explicit group request into
		// a no-op, so reject it loudly instead.
		if gid < 0 {
			return -1, fmt.Errorf("invalid numeric gid %q: must be non-negative", groupname)
		}
		return gid, nil
	}
	g, err := user.LookupGroup(groupname)
	if err != nil {
		return -1, err
	}
	gid, err := strconv.Atoi(g.Gid)
	if err != nil {
		return -1, fmt.Errorf("group %q has non-numeric gid %q: %w", groupname, g.Gid, err)
	}
	return gid, nil
}
|
|
|
|
// unixOwnerFromStat pulls the numeric owner and group out of a
// FileInfo whose Sys() value is a *syscall.Stat_t.
//
// It returns (-1, -1, false) when fi is nil or when fi.Sys() is any
// other type, i.e. when the underlying stat does not expose uid/gid
// in that form. Otherwise it returns the stat's Uid and Gid converted
// to int, with ok=true.
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
	if fi == nil {
		return -1, -1, false
	}
	st, isStatT := fi.Sys().(*syscall.Stat_t)
	if !isStatT {
		return -1, -1, false
	}
	return int(st.Uid), int(st.Gid), true
}
|