mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-10 18:09:07 +00:00
febf50090b
Closes Bundle 3 of the 2026-05-02 deployment-target coverage audit (see cowork/deployment-target-audit-2026-05-02/RESULTS.md). The audit ranked this fix #3 by acquirer impact behind the K8s real client (#1) and the docs realignment (#2 / Bundle 1). Two production-grade gaps closed: 1. SDS JSON config write was non-atomic. Cert/key/chain at envoy.go L155/L168/L183 went through deploy.AtomicWriteFile (atomic + backups + ownership preservation), but the SDS JSON at L260 went through os.WriteFile directly. A power loss / OOM / process-kill mid-write of the SDS JSON produces a torn file Envoy cannot parse, and Envoy's file-based SDS watcher refuses to load any cert (not just the rotating one) until the JSON is repaired by hand. Replaced with deploy.AtomicWriteFile and threaded ctx through writeSDSConfig. 2. No watcher pickup confirmation before returning success. Pre-fix, DeployCertificate returned the moment file writes completed. Envoy's SDS watcher is asynchronous; a caller running post-deploy TLS verify immediately after DeployCertificate could see Envoy still serving the old cert (watcher latency, load-balanced replica hit one that hadn't reloaded yet). Added the canonical post-deploy verify pattern (mirrors nginx.go::runPostDeployVerify L416): probe seam + retry/backoff + SHA-256 fingerprint compare against request.CertPEM. On verify failure, restore from per-file backups via the new restoreFromBackups helper. Envoy has no PostCommit reload to re-run; the watcher auto-reloads on the restored files. Config additions to envoy.Config (mirror nginx.Config L84-93): - PostDeployVerify *PostDeployVerifyConfig (Enabled, Endpoint, Timeout) - PostDeployVerifyAttempts int (default 3 in runPostDeployVerify) - PostDeployVerifyBackoff time.Duration (default 2s) - BackupRetention int (mirrors nginx; passed to AtomicWriteFile per file) Default behaviour unchanged for callers that don't set PostDeployVerify — verify is opt-in. nil or Enabled=false skips it entirely. 
Probe seam: c.probe = tlsprobe.ProbeTLS at construction; tests inject via the new SetTestProbe method. Same shape NGINX uses (nginx.go:130); also mirrors the existing Traefik SetTestProbe at traefik.go:62. WriteResult retention: every AtomicWriteFile call now retains its *deploy.WriteResult in a local []*deploy.WriteResult slice so the rollback path can restore from BackupPath across all four files (cert, key, chain, SDS JSON), not just the cert. Pre-fix the cert's WriteResult was discarded. restoreFromBackups (envoy.go new): iterates the WriteResults from a successful per-file pass, rewrites each non-idempotent destination from its BackupPath via AtomicWriteFile{SkipIdempotent:true, BackupRetention:-1}. The -1 prevents backup-of-the-backup pollution. For files that didn't exist pre-deploy (BackupPath == ""), restore = remove. Mirrors nginx.go::rollbackToBackups (L487-515) with the reload step elided. Idempotency gate: shouldRunVerify returns true unless EVERY WriteResult was Idempotent — same all-files semantics NGINX gets from res.SkippedAsIdempotent. Pre-fix Envoy had no verify at all, so there was no gate to get wrong; this introduces the correct all-files shape from the start. Tests added to envoy_atomic_test.go: - TestEnvoy_Atomic_SDSConfigWriteIsAtomic — pre-writes a sentinel SDS JSON, runs DeployCertificate, asserts a backup file with deploy.BackupSuffix appears alongside the new sds.json (proves AtomicWriteFile is now in the SDS path). - TestEnvoy_Atomic_WatcherPickupRetries — stub probe returns wrong fingerprint on attempts 1+2 and correct on attempt 3; deploy succeeds; probe called exactly 3 times. - TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack — pre-writes SENTINEL bytes for cert+key, stub probe always wrong; deploy returns wrapped error AND the destination files contain the sentinel bytes (rollback restored). 
- TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault — Config with nil PostDeployVerify; asserts probe is never called (opt-in default preserved). A small certPEMFingerprint helper added to the test file mirrors the production envoy.certPEMToFingerprint (which is package-private — external tests can't call it). docs/deployment-atomicity.md L87 row already documents "TLS handshake | atomic-write replaces os.WriteFile" — pre-fix the claim was aspirational (verify happened in the agent verify-and-report path, not the connector; SDS JSON wasn't atomic). Post-fix the claim is honest. No doc change required. Verified locally: - gofmt -l ./internal/connector/target/envoy/ clean - go vet ./internal/connector/target/envoy/... clean - staticcheck ./internal/connector/target/envoy/... clean - go build ./... clean - go test -race -count=1 ./internal/connector/target/envoy/... green (5 pre-existing tests + 4 new = 9 total) - go test -short -count=1 ./internal/connector/target/... green Audit reference: cowork/deployment-target-audit-2026-05-02/RESULTS.md Bundle 3.
334 lines
11 KiB
Go
334 lines
11 KiB
Go
package envoy_test
|
|
|
|
import (
	"context"
	"crypto/sha256"
	"encoding/base64"
	"encoding/hex"
	"errors"
	"io"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/shankar0123/certctl/internal/connector/target"
	"github.com/shankar0123/certctl/internal/connector/target/envoy"
	"github.com/shankar0123/certctl/internal/deploy"
	"github.com/shankar0123/certctl/internal/tlsprobe"
)
|
|
|
|
// Phase 7 of the deploy-hardening I master bundle: atomic-write
// retrofit for Envoy. Envoy's file watcher (SDS) auto-reloads on
// rename, so the load-bearing change is the os.WriteFile ->
// deploy.AtomicWriteFile swap.
|
|
|
|
// Shared PEM fixtures used by the tests in this file. The base64 payloads are
// placeholder text rather than real DER: certA's body decodes to "ALPHA-CERT"
// and keyA's body decodes to "fake-key".
const (
	certA = "-----BEGIN CERTIFICATE-----\nQUxQSEEtQ0VSVA==\n-----END CERTIFICATE-----\n"
	keyA  = "-----BEGIN PRIVATE KEY-----\nZmFrZS1rZXk=\n-----END PRIVATE KEY-----\n"
)
|
|
|
|
// newTestLogger returns a logger whose output is thrown away, with the
// handler's minimum level set to slog.LevelError.
//
// The handler writes to io.Discard. The previous os.NewFile(0, os.DevNull)
// form did not open the null device: os.NewFile wraps an already-open
// descriptor, so it wrapped fd 0 (stdin) and only labelled it with the
// DevNull name, sending error-level records to whatever stdin happened to be.
func newTestLogger() *slog.Logger {
	return slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
}
|
|
|
|
func TestEnvoy_Atomic_HappyPath(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA, KeyPEM: keyA})
|
|
if err != nil || !res.Success {
|
|
t.Fatal(err)
|
|
}
|
|
for _, p := range []string{filepath.Join(dir, "cert.pem"), filepath.Join(dir, "key.pem")} {
|
|
if _, err := os.Stat(p); err != nil {
|
|
t.Errorf("file missing: %s", p)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestEnvoy_Atomic_BackupCreated(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cert := filepath.Join(dir, "cert.pem")
|
|
os.WriteFile(cert, []byte("OLD"), 0644)
|
|
cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA})
|
|
entries, _ := os.ReadDir(dir)
|
|
found := false
|
|
for _, e := range entries {
|
|
if strings.Contains(e.Name(), deploy.BackupSuffix) {
|
|
found = true
|
|
}
|
|
}
|
|
if !found {
|
|
t.Error("no backup created")
|
|
}
|
|
}
|
|
|
|
func TestEnvoy_Atomic_KeyMode_0600(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA, KeyPEM: keyA})
|
|
stat, _ := os.Stat(filepath.Join(dir, "key.pem"))
|
|
if stat.Mode().Perm() != 0600 {
|
|
t.Errorf("key mode = %#o", stat.Mode().Perm())
|
|
}
|
|
}
|
|
|
|
func TestEnvoy_Atomic_Idempotency(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cert := filepath.Join(dir, "cert.pem")
|
|
os.WriteFile(cert, []byte(certA+"\n"), 0644)
|
|
cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA})
|
|
entries, _ := os.ReadDir(dir)
|
|
for _, e := range entries {
|
|
if strings.Contains(e.Name(), deploy.BackupSuffix) {
|
|
t.Errorf("backup created on idempotent skip: %s", e.Name())
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestEnvoy_ValidateOnly_Sentinel(t *testing.T) {
|
|
cfg := envoy.Config{CertDir: t.TempDir(), CertFilename: "cert.pem", KeyFilename: "key.pem"}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
if err := c.ValidateOnly(context.Background(), target.DeploymentRequest{}); !errors.Is(err, target.ErrValidateOnlyNotSupported) {
|
|
t.Errorf("got %v", err)
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Bundle 3 (deployment-target audit 2026-05-02): SDS atomicity + post-deploy
|
|
// watcher pickup confirmation.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// certPEMFingerprint returns the lowercase hex SHA-256 digest of the bytes
// encoded in the first CERTIFICATE block of pemBytes. It is the test-side
// counterpart of the package-private envoy.certPEMToFingerprint and is meant
// to produce the same value tlsprobe.CertFingerprint reports for a served
// leaf certificate.
//
// The helper fails the test immediately when the BEGIN marker, the END
// marker, or valid base64 content is missing.
func certPEMFingerprint(t *testing.T, pemBytes string) string {
	t.Helper()

	const (
		begin = "-----BEGIN CERTIFICATE-----"
		end   = "-----END CERTIFICATE-----"
	)

	_, afterBegin, ok := strings.Cut(pemBytes, begin)
	if !ok {
		t.Fatalf("no CERTIFICATE block in PEM")
	}
	encoded, _, ok := strings.Cut(afterBegin, end)
	if !ok {
		t.Fatalf("no END CERTIFICATE in PEM")
	}

	// Collapse the wrapped base64 body onto a single line before decoding.
	stripEOL := strings.NewReplacer("\n", "", "\r", "")
	raw, err := base64.StdEncoding.DecodeString(stripEOL.Replace(strings.TrimSpace(encoded)))
	if err != nil {
		t.Fatalf("base64: %v", err)
	}

	digest := sha256.Sum256(raw)
	return hex.EncodeToString(digest[:])
}
|
|
|
|
// TestEnvoy_Atomic_SDSConfigWriteIsAtomic pins the wiring change at envoy.go's
|
|
// writeSDSConfig — pre-Bundle-3 the SDS JSON went through os.WriteFile (no
|
|
// backup, torn-write hazard). Post-fix it goes through deploy.AtomicWriteFile,
|
|
// which produces a sibling backup with deploy.BackupSuffix when an existing
|
|
// SDS JSON is replaced.
|
|
func TestEnvoy_Atomic_SDSConfigWriteIsAtomic(t *testing.T) {
|
|
dir := t.TempDir()
|
|
sdsPath := filepath.Join(dir, "sds.json")
|
|
// Pre-write a sentinel SDS JSON so the connector's write produces
|
|
// a backup we can assert on.
|
|
if err := os.WriteFile(sdsPath, []byte(`{"resources":[{"name":"old"}]}`), 0644); err != nil {
|
|
t.Fatalf("seed sds: %v", err)
|
|
}
|
|
cfg := envoy.Config{
|
|
CertDir: dir,
|
|
CertFilename: "cert.pem",
|
|
KeyFilename: "key.pem",
|
|
SDSConfig: true,
|
|
}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
|
CertPEM: certA,
|
|
KeyPEM: keyA,
|
|
})
|
|
if err != nil || !res.Success {
|
|
t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
|
|
}
|
|
|
|
// SDS JSON should be the new bytes (i.e. NOT match the sentinel).
|
|
got, err := os.ReadFile(sdsPath)
|
|
if err != nil {
|
|
t.Fatalf("read sds: %v", err)
|
|
}
|
|
if strings.Contains(string(got), `"old"`) {
|
|
t.Errorf("SDS JSON not replaced; still contains sentinel")
|
|
}
|
|
if !strings.Contains(string(got), "server_cert") {
|
|
t.Errorf("SDS JSON missing expected resource name; got %s", string(got))
|
|
}
|
|
|
|
// AtomicWriteFile produces a backup file with deploy.BackupSuffix
|
|
// when replacing an existing destination. Pre-Bundle-3 (os.WriteFile
|
|
// path) no backup would exist for sds.json.
|
|
entries, _ := os.ReadDir(dir)
|
|
foundBak := false
|
|
for _, e := range entries {
|
|
if strings.HasPrefix(e.Name(), "sds.json"+deploy.BackupSuffix) {
|
|
foundBak = true
|
|
}
|
|
}
|
|
if !foundBak {
|
|
t.Errorf("no SDS JSON backup created — atomic-write wiring missing? entries=%v", entryNames(entries))
|
|
}
|
|
}
|
|
|
|
// TestEnvoy_Atomic_WatcherPickupRetries pins the retry/backoff loop in the
|
|
// post-deploy verify path. Stub the probe so attempts 1+2 return the wrong
|
|
// fingerprint and attempt 3 returns the correct one — DeployCertificate must
|
|
// succeed and the probe must have been called exactly 3 times.
|
|
func TestEnvoy_Atomic_WatcherPickupRetries(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cfg := envoy.Config{
|
|
CertDir: dir,
|
|
CertFilename: "cert.pem",
|
|
KeyFilename: "key.pem",
|
|
PostDeployVerify: &envoy.PostDeployVerifyConfig{
|
|
Enabled: true,
|
|
Endpoint: "envoy.test.invalid:443",
|
|
Timeout: 100 * time.Millisecond,
|
|
},
|
|
PostDeployVerifyAttempts: 3,
|
|
PostDeployVerifyBackoff: time.Millisecond, // tight loop for tests
|
|
}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
|
|
want := certPEMFingerprint(t, certA)
|
|
var calls atomic.Int64
|
|
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
|
n := calls.Add(1)
|
|
if n < 3 {
|
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
|
|
}
|
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: want}
|
|
})
|
|
|
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
|
CertPEM: certA,
|
|
KeyPEM: keyA,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("deploy returned error after retries should have succeeded: %v", err)
|
|
}
|
|
if !res.Success {
|
|
t.Fatalf("deploy.Success=false; message=%s", res.Message)
|
|
}
|
|
if got := calls.Load(); got != 3 {
|
|
t.Errorf("probe called %d times, want 3", got)
|
|
}
|
|
}
|
|
|
|
// TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack pins the verify-
|
|
// failure rollback path. Pre-write sentinel cert + key; stub probe to always
|
|
// return the wrong fingerprint; assert DeployCertificate returns a wrapped
|
|
// error AND the destination files contain the sentinel bytes (restored from
|
|
// backups).
|
|
func TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack(t *testing.T) {
|
|
dir := t.TempDir()
|
|
certPath := filepath.Join(dir, "cert.pem")
|
|
keyPath := filepath.Join(dir, "key.pem")
|
|
sentCert := []byte("SENTINEL-CERT-BYTES")
|
|
sentKey := []byte("SENTINEL-KEY-BYTES")
|
|
if err := os.WriteFile(certPath, sentCert, 0644); err != nil {
|
|
t.Fatalf("seed cert: %v", err)
|
|
}
|
|
if err := os.WriteFile(keyPath, sentKey, 0600); err != nil {
|
|
t.Fatalf("seed key: %v", err)
|
|
}
|
|
|
|
cfg := envoy.Config{
|
|
CertDir: dir,
|
|
CertFilename: "cert.pem",
|
|
KeyFilename: "key.pem",
|
|
PostDeployVerify: &envoy.PostDeployVerifyConfig{
|
|
Enabled: true,
|
|
Endpoint: "envoy.test.invalid:443",
|
|
Timeout: 100 * time.Millisecond,
|
|
},
|
|
PostDeployVerifyAttempts: 2,
|
|
PostDeployVerifyBackoff: time.Millisecond,
|
|
}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
|
|
})
|
|
|
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
|
CertPEM: certA,
|
|
KeyPEM: keyA,
|
|
})
|
|
if err == nil {
|
|
t.Fatalf("expected verify-mismatch error, got nil; res=%+v", res)
|
|
}
|
|
if res.Success {
|
|
t.Errorf("expected Success=false on verify failure")
|
|
}
|
|
if !strings.Contains(strings.ToLower(res.Message), "verify") {
|
|
t.Errorf("expected message to mention verify; got %q", res.Message)
|
|
}
|
|
|
|
// Both files must be restored to sentinel bytes.
|
|
gotCert, _ := os.ReadFile(certPath)
|
|
if string(gotCert) != string(sentCert) {
|
|
t.Errorf("cert not restored on rollback; got %q want %q", string(gotCert), string(sentCert))
|
|
}
|
|
gotKey, _ := os.ReadFile(keyPath)
|
|
if string(gotKey) != string(sentKey) {
|
|
t.Errorf("key not restored on rollback; got %q want %q", string(gotKey), string(sentKey))
|
|
}
|
|
}
|
|
|
|
// TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault pins the opt-in default.
|
|
// A Config with no PostDeployVerify set must NOT call the probe — preserving
|
|
// pre-Bundle-3 behaviour for callers that don't opt in.
|
|
func TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault(t *testing.T) {
|
|
dir := t.TempDir()
|
|
cfg := envoy.Config{
|
|
CertDir: dir,
|
|
CertFilename: "cert.pem",
|
|
KeyFilename: "key.pem",
|
|
// PostDeployVerify intentionally nil.
|
|
}
|
|
c := envoy.New(&cfg, newTestLogger())
|
|
var calls atomic.Int64
|
|
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
|
calls.Add(1)
|
|
return tlsprobe.ProbeResult{Success: false, Error: "probe should not be called"}
|
|
})
|
|
|
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
|
CertPEM: certA,
|
|
KeyPEM: keyA,
|
|
})
|
|
if err != nil || !res.Success {
|
|
t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
|
|
}
|
|
if got := calls.Load(); got != 0 {
|
|
t.Errorf("probe called %d times when PostDeployVerify is nil; want 0", got)
|
|
}
|
|
}
|
|
|
|
// entryNames returns the Name() of every directory entry, in input order, so
// test failure messages can print a readable directory listing. The result is
// always a non-nil slice, even for empty input.
func entryNames(entries []os.DirEntry) []string {
	out := make([]string, len(entries))
	for i := range entries {
		out[i] = entries[i].Name()
	}
	return out
}
|