mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 14:51:30 +00:00
8e84527ba2
CI's cross-platform-build (windows-latest) job has been red for
several runs:
internal/deploy/ownership.go:205 — undefined: syscall.Stat_t
Root cause:
`syscall.Stat_t` is the Unix-specific POSIX stat-struct shape
(linux / darwin / freebsd / openbsd / netbsd / dragonfly /
solaris all expose it). On Windows GOOS, the syscall package
defines `syscall.Win32FileAttributeData` instead, which carries
no uid/gid fields. Any production code that names `syscall.Stat_t`
unconditionally fails to compile on GOOS=windows.
The function was added pre-cross-platform-matrix and never had
to compile for Windows; CI's `cross-platform-build` job (added
by Phase 3 TEST-H2) is what surfaced it. The ubuntu / macos
matrix runs stayed green because both GOOSes expose the type.
Fix (standard Go per-platform build-tag split):
Move `unixOwnerFromStat(fi os.FileInfo) (uid, gid int, ok bool)`
out of ownership.go into per-OS sibling files:
internal/deploy/ownership_unix.go //go:build unix
internal/deploy/ownership_windows.go //go:build windows
ownership_unix.go: same impl as before. Uses `syscall.Stat_t`.
Covers every Unix-y GOOS via Go 1.19+'s `unix` build constraint
(linux + darwin + freebsd + openbsd + netbsd + dragonfly +
solaris).
ownership_windows.go: stub that returns (-1, -1, false). Windows
has no native uid/gid; file ownership is expressed via SIDs +
ACLs (`syscall.Win32FileAttributeData`), which the deploy
package's call sites can't translate into uid/gid anyway. All
four callers — applyOwnership (ownership.go:75),
preserveSourceOwner (atomic.go:237), and two test sites — ALREADY
handle ok=false by falling back to Plan.Defaults / runtime
umask. Stub returning false is the correct platform contract.
ownership.go: drop the `syscall` import (no longer needed there)
+ replace the function body with a doc comment pointing to the
per-OS files so future readers know where the impl lives.
Note: the agent binary still compiles + runs on Windows; the
chown/chmod codepaths in the deploy package gate on
`runningAsRoot()` (os.Geteuid() == 0) which is also Unix-only in
practice — Windows agents run as a service under a SID that
doesn't translate to a uid anyway, so ownership operations on
Windows naturally no-op.
Verification (Go toolchain wired in sandbox, sub-platform builds
ran locally):
• gofmt -l on all three touched files — clean
• GOOS=linux GOARCH=amd64 go build ./internal/deploy/... — exit 0
• GOOS=darwin GOARCH=amd64 go build ./internal/deploy/... — exit 0
• GOOS=windows GOARCH=amd64 go build ./internal/deploy/... — exit 0
• GOOS=windows GOARCH=amd64 go build ./cmd/{server,agent,cli,mcp-server}/...
— exit 0 (all four CI matrix targets)
• go vet ./internal/deploy/... — exit 0
• staticcheck ./internal/deploy/... — zero findings
• go test -short -count=1 ./internal/deploy/... — ok 0.216s (the
four callers' tests all still pass on Linux)
Ground-truth: origin/master tip 622c19c (TEST-H3 just pushed)
verified via GitHub API BEFORE commit.
Falsifiable proof for the next CI run: the windows-latest leg of
cross-platform-build should turn green. The ubuntu-latest and
macos-latest legs were already green; this fix doesn't touch
their build path.
210 lines
7.0 KiB
Go
210 lines
7.0 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package deploy
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"os/user"
|
|
"strconv"
|
|
)
|
|
|
|
// runningAsRoot reports whether the process's effective uid is 0.
//
// applyOwnership consults it after a failed chown to decide whether
// the failure is returned to the caller (effective uid 0) or dropped
// (any other effective uid).
func runningAsRoot() bool {
	euid := os.Geteuid()
	return euid == 0
}
|
|
|
|
// resolvedOwnership is the outcome of ownership resolution for one
// destination file: the permission bits plus the numeric owner and
// group that should be applied to it.
//
// Sources are consulted in this order, first match wins:
//
//  1. Explicit File.Mode / File.Owner / File.Group.
//  2. The mode / uid / gid of a destination file that already exists.
//  3. The fallback values carried in FileDefaults.
//  4. Nothing: ModeSet stays false and UID / GID stay at -1.
//
// UID and GID act as a pair. applyOwnership only calls chown when both
// are >= 0, so a negative value in either one means "do not chown".
type resolvedOwnership struct {
	// Mode holds the permission bits to apply; only meaningful when
	// ModeSet is true.
	Mode os.FileMode
	// UID and GID are the numeric owner and group; -1 means "leave
	// untouched".
	UID, GID int
	// ModeSet records whether any source supplied a mode.
	ModeSet bool
	// OwnerLabel and GroupLabel are best-effort names for diagnostics
	// ("" when unknown).
	OwnerLabel, GroupLabel string
}
|
|
|
|
// resolveOwnership computes the final mode/uid/gid for a file.
|
|
// existingStat is nil when the destination does not exist.
|
|
func resolveOwnership(file File, defaults FileDefaults, existingStat os.FileInfo) (resolvedOwnership, error) {
|
|
res := resolvedOwnership{UID: -1, GID: -1}
|
|
|
|
// Mode resolution.
|
|
switch {
|
|
case file.Mode != 0:
|
|
res.Mode = file.Mode
|
|
res.ModeSet = true
|
|
case existingStat != nil:
|
|
res.Mode = existingStat.Mode().Perm()
|
|
res.ModeSet = true
|
|
case defaults.Mode != 0:
|
|
res.Mode = defaults.Mode
|
|
res.ModeSet = true
|
|
default:
|
|
// Nothing to apply; AtomicWriteFile uses os.WriteFile's
|
|
// default 0644-ish for new files, preserves for existing.
|
|
res.Mode = 0
|
|
res.ModeSet = false
|
|
}
|
|
|
|
// Owner / group resolution.
|
|
owner, group := file.Owner, file.Group
|
|
switch {
|
|
case owner != "" && group != "":
|
|
// explicit override
|
|
case existingStat != nil:
|
|
// preserve existing — extract from sys-stat
|
|
uid, gid, ok := unixOwnerFromStat(existingStat)
|
|
if ok {
|
|
res.UID, res.GID = uid, gid
|
|
// Best-effort labels for logs (don't fail if user/group
|
|
// has been deleted from /etc/passwd between deploys).
|
|
if u, err := user.LookupId(strconv.Itoa(uid)); err == nil {
|
|
res.OwnerLabel = u.Username
|
|
}
|
|
if g, err := user.LookupGroupId(strconv.Itoa(gid)); err == nil {
|
|
res.GroupLabel = g.Name
|
|
}
|
|
}
|
|
return res, nil
|
|
case defaults.Owner != "" && defaults.Group != "":
|
|
owner, group = defaults.Owner, defaults.Group
|
|
default:
|
|
// No override, no existing file, no defaults — leave UID/GID
|
|
// at -1 so AtomicWriteFile skips the chown entirely.
|
|
return res, nil
|
|
}
|
|
|
|
uid, err := lookupUID(owner)
|
|
if err != nil {
|
|
return res, fmt.Errorf("resolve owner %q: %w", owner, err)
|
|
}
|
|
gid, err := lookupGID(group)
|
|
if err != nil {
|
|
return res, fmt.Errorf("resolve group %q: %w", group, err)
|
|
}
|
|
res.UID, res.GID = uid, gid
|
|
res.OwnerLabel, res.GroupLabel = owner, group
|
|
return res, nil
|
|
}
|
|
|
|
// applyOwnership applies the resolved (mode, uid, gid) to path.
|
|
// Both chown and chmod are best-effort: we attempt them, log
|
|
// warnings on failure, but do NOT fail the deploy. The agent runs
|
|
// as root in production; running as a regular user (CI / developer
|
|
// workstation) means chown to a different user fails with EPERM,
|
|
// which is expected and not actionable. The deploy semantically
|
|
// succeeded — only ownership lift was skipped.
|
|
//
|
|
// The "is this acceptable to silently swallow chown failure?"
|
|
// question is answered yes for two reasons:
|
|
// - In production (root agent), failures are real OS-level
|
|
// issues that show up in the audit log + Prometheus
|
|
// deploy_validate_failures_total counter.
|
|
// - In dev (non-root), failures are expected behavior; tests
|
|
// would otherwise need to be skipped or run with sudo.
|
|
//
|
|
// Connectors that NEED hard ownership enforcement (e.g. compliance
|
|
// audits) can wrap a stat-after-write check in their PostCommit.
|
|
func applyOwnership(path string, res resolvedOwnership) error {
|
|
if res.ModeSet {
|
|
if err := os.Chmod(path, res.Mode); err != nil {
|
|
return fmt.Errorf("chmod %s to %#o: %w", path, res.Mode, err)
|
|
}
|
|
}
|
|
if res.UID >= 0 && res.GID >= 0 {
|
|
if err := os.Chown(path, res.UID, res.GID); err != nil {
|
|
// In non-root contexts (dev, CI), chown to a
|
|
// different uid will fail with one of EPERM (most
|
|
// filesystems) or EINVAL (some tmpfs configs). The
|
|
// agent runs as root in production where chown
|
|
// will succeed; the dev-time failure is not an
|
|
// actionable signal and would otherwise force every
|
|
// test to run as root. We swallow the chown error
|
|
// when we're not root. Production agents (uid 0)
|
|
// still hard-fail on chown errors so genuine
|
|
// issues surface.
|
|
if runningAsRoot() {
|
|
return fmt.Errorf("chown %s to %d:%d: %w", path, res.UID, res.GID, err)
|
|
}
|
|
// Non-root chown failure: silently skip. The
|
|
// caller's audit log + Prometheus deploy-counter
|
|
// surface the "ownership lift requested but not
|
|
// granted" condition for production where it
|
|
// matters.
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// lookupUID resolves an owner string from operator config to a numeric
// uid. Both account names ("nginx") and numeric ids ("1000") are
// accepted; a numeric string is passed through without consulting the
// user database.
//
// It returns (-1, err) when username is empty, when it is a negative
// number, when the account cannot be found, or when the account's Uid
// is not numeric.
func lookupUID(username string) (int, error) {
	if username == "" {
		return -1, errors.New("empty username")
	}
	if uid, err := strconv.Atoi(username); err == nil {
		// Negative values collide with the -1 "do not chown" sentinel
		// in resolvedOwnership, so applyOwnership would silently skip
		// the chown. Reject them here so the misconfiguration surfaces.
		if uid < 0 {
			return -1, fmt.Errorf("invalid negative uid %d", uid)
		}
		return uid, nil
	}
	u, err := user.Lookup(username)
	if err != nil {
		return -1, err
	}
	uid, err := strconv.Atoi(u.Uid)
	if err != nil {
		return -1, fmt.Errorf("user %q has non-numeric uid %q: %w", username, u.Uid, err)
	}
	return uid, nil
}
|
|
|
|
// lookupGID resolves a group string from operator config to a numeric
// gid. Both group names and numeric ids ("1000") are accepted; a
// numeric string is passed through without consulting the group
// database.
//
// It returns (-1, err) when groupname is empty, when it is a negative
// number, when the group cannot be found, or when the group's Gid is
// not numeric.
func lookupGID(groupname string) (int, error) {
	if groupname == "" {
		return -1, errors.New("empty groupname")
	}
	if gid, err := strconv.Atoi(groupname); err == nil {
		// Negative values collide with the -1 "do not chgrp" sentinel
		// in resolvedOwnership, so applyOwnership would silently skip
		// the chown. Reject them here so the misconfiguration surfaces.
		if gid < 0 {
			return -1, fmt.Errorf("invalid negative gid %d", gid)
		}
		return gid, nil
	}
	g, err := user.LookupGroup(groupname)
	if err != nil {
		return -1, err
	}
	gid, err := strconv.Atoi(g.Gid)
	if err != nil {
		return -1, fmt.Errorf("group %q has non-numeric gid %q: %w", groupname, g.Gid, err)
	}
	return gid, nil
}
|
|
|
|
// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
|
|
// On non-Unix platforms or when the underlying stat doesn't expose
|
|
// uid/gid, returns ok=false.
|
|
//
|
|
// Platform-specific implementations live in:
|
|
// - ownership_unix.go (//go:build unix — uses *syscall.Stat_t)
|
|
// - ownership_windows.go (//go:build windows — stub returns false)
|
|
//
|
|
// The split exists because syscall.Stat_t is Unix-only — Windows
|
|
// has no equivalent shape, so any production code that names it
|
|
// fails to compile on GOOS=windows. The cross-platform-build CI
|
|
// matrix caught this at Hotfix #16; the function was originally
|
|
// in this file pre-split.
|