Files
certctl/internal/connector/target/envoy/envoy_atomic_test.go
T
shankar0123 7f6bfed03c tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify
Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit
re-run (see cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md). Pre-fix, every connector's runPostDeployVerify used
linear backoff (default 3 attempts × 2s linear waits). Linear
backoff misbehaves under load-balanced rollouts: the verify
probe hits a random LB-backed pod, and 3 × 2s often falls into
the worst case where match-fingerprint pods stop responding by
attempt 3 due to LB session-stickiness cycles.

This commit:

1. New shared helper internal/tlsprobe/retry.go::
   VerifyWithExponentialBackoff. Default 3 attempts; 1s initial,
   16s cap. Doubling pattern: 1s → 2s → 4s → 8s → 16s. probe
   func(ctx) error signature so connectors compose
   handshake + fingerprint-compare into one lambda.

2. Each connector's runPostDeployVerify (nginx, apache, haproxy,
   traefik, envoy, postfix, dovecot) rewired to call the
   shared helper. Per-connector signature unchanged.

3. New PostDeployVerifyMaxBackoff time.Duration field added to
   each connector's Config. Operators preserving V2 linear
   behavior set PostDeployVerifyMaxBackoff equal to
   PostDeployVerifyBackoff.

4. Tests:
   - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_
     GrowthAndCap + TestVerifyWithExponentialBackoff_
     StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_
     CtxCancellation.
   - One Test<Connector>_VerifyExponentialBackoff_
     GrowsBetweenAttempts per connector (6 total across
     postfix, nginx, apache, haproxy; the traefik and envoy
     connectors use unique test signatures, so their test
     wiring is deferred to a future unification).

5. docs/deployment-atomicity.md Section 4 updated:
   'linear backoff' → 'exponential backoff (1s → 16s cap)';
   YAML example shows the new field.

Backward-compat note: PostDeployVerifyBackoff was interpreted as
the linear interval pre-fix; post-fix it's interpreted as the
initial backoff (which doubles each attempt). Operators using
the default value (2s) see waits of 2s → 4s → 8s instead of
2s → 2s → 2s. For LB-rollout cases this is the intended
behavior; for single-target deploys the wall-clock is slightly
longer (12s vs 6s for 3 attempts). Operators preserving V2
linear semantics: set PostDeployVerifyMaxBackoff equal to
PostDeployVerifyBackoff.

Verified locally:
- gofmt clean.
- go test -short -count=1 ./internal/tlsprobe/...
  ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green.

Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md Top-10 fix #8.
2026-05-02 22:56:07 +00:00

393 lines
13 KiB
Go

package envoy_test
import (
	"context"
	"crypto/sha256"
	"encoding/base64"
	"encoding/hex"
	"errors"
	"io"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/shankar0123/certctl/internal/connector/target"
	"github.com/shankar0123/certctl/internal/connector/target/envoy"
	"github.com/shankar0123/certctl/internal/deploy"
	"github.com/shankar0123/certctl/internal/tlsprobe"
)
// Phase 7 of the deploy-hardening I master bundle: atomic-write
// retrofit for Envoy. Envoy file watcher (SDS) auto-reloads on
// rename, so the load-bearing change is the os.WriteFile ->
// deploy.AtomicWriteFile swap.
const (
	// certA is the CERTIFICATE-typed PEM fixture passed as CertPEM in these
	// tests. Its base64 payload decodes to the ASCII text "ALPHA-CERT".
	certA = "-----BEGIN CERTIFICATE-----\nQUxQSEEtQ0VSVA==\n-----END CERTIFICATE-----\n"

	// keyA is the PRIVATE KEY-typed PEM fixture passed as KeyPEM in these
	// tests. Its base64 payload decodes to the ASCII text "fake-key".
	keyA = "-----BEGIN PRIVATE KEY-----\nZmFrZS1rZXk=\n-----END PRIVATE KEY-----\n"
)
// newTestLogger returns a slog.Logger whose output is thrown away. Only
// records at slog.LevelError or above are enabled on the handler.
//
// The sink is io.Discard rather than os.NewFile(0, os.DevNull): os.NewFile
// does not open the named path, it only wraps an existing file descriptor
// under that name, so the previous form sent log records to descriptor 0
// (the test process's stdin).
func newTestLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	return slog.New(slog.NewTextHandler(io.Discard, opts))
}
// TestEnvoy_Atomic_HappyPath deploys a cert+key pair into an empty temp
// directory and checks that the deploy reports success and that both
// configured files exist afterwards.
func TestEnvoy_Atomic_HappyPath(t *testing.T) {
	dir := t.TempDir()
	cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
	c := envoy.New(&cfg, newTestLogger())

	res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA, KeyPEM: keyA})
	if err != nil {
		t.Fatalf("deploy: %v", err)
	}
	// Reported separately from the error case so the failure output is never
	// a bare "<nil>", and so a nil result fails cleanly instead of panicking.
	if res == nil || !res.Success {
		t.Fatalf("deploy: err=nil but result does not report success: %+v", res)
	}

	for _, name := range []string{"cert.pem", "key.pem"} {
		p := filepath.Join(dir, name)
		if _, err := os.Stat(p); err != nil {
			t.Errorf("file missing: %s: %v", p, err)
		}
	}
}
// TestEnvoy_Atomic_BackupCreated seeds an existing cert.pem, deploys a new
// certificate over it, and checks that a directory entry whose name contains
// deploy.BackupSuffix exists afterwards.
func TestEnvoy_Atomic_BackupCreated(t *testing.T) {
	dir := t.TempDir()
	certPath := filepath.Join(dir, "cert.pem")
	if err := os.WriteFile(certPath, []byte("OLD"), 0644); err != nil {
		t.Fatalf("seed cert: %v", err)
	}

	cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
	c := envoy.New(&cfg, newTestLogger())

	// The request carries no key and this test only pins the backup side
	// effect, so a deploy error is surfaced for diagnosis but does not by
	// itself fail the test.
	if _, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA}); err != nil {
		t.Logf("deploy returned error: %v", err)
	}

	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatalf("read dir: %v", err)
	}
	found := false
	for _, e := range entries {
		if strings.Contains(e.Name(), deploy.BackupSuffix) {
			found = true
			break
		}
	}
	if !found {
		t.Errorf("no backup created; entries=%v", entryNames(entries))
	}
}
// TestEnvoy_Atomic_KeyMode_0600 deploys a cert+key pair and checks that the
// written key file has permission bits 0600.
func TestEnvoy_Atomic_KeyMode_0600(t *testing.T) {
	dir := t.TempDir()
	cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
	c := envoy.New(&cfg, newTestLogger())

	if _, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA, KeyPEM: keyA}); err != nil {
		t.Fatalf("deploy: %v", err)
	}

	// Check the Stat error explicitly: a missing key would otherwise yield a
	// nil FileInfo and panic on Mode() instead of failing the test cleanly.
	info, err := os.Stat(filepath.Join(dir, "key.pem"))
	if err != nil {
		t.Fatalf("stat key: %v", err)
	}
	if got := info.Mode().Perm(); got != 0600 {
		t.Errorf("key mode = %#o", got)
	}
}
// TestEnvoy_Atomic_Idempotency seeds cert.pem with certA followed by a
// newline, deploys certA again, and checks that no directory entry containing
// deploy.BackupSuffix was produced.
func TestEnvoy_Atomic_Idempotency(t *testing.T) {
	dir := t.TempDir()
	certPath := filepath.Join(dir, "cert.pem")
	if err := os.WriteFile(certPath, []byte(certA+"\n"), 0644); err != nil {
		t.Fatalf("seed cert: %v", err)
	}

	cfg := envoy.Config{CertDir: dir, CertFilename: "cert.pem", KeyFilename: "key.pem"}
	c := envoy.New(&cfg, newTestLogger())

	// The request carries no key; a deploy error is logged so a vacuous pass
	// is visible in verbose output, but it does not by itself fail the test.
	if _, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{CertPEM: certA}); err != nil {
		t.Logf("deploy returned error: %v", err)
	}

	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatalf("read dir: %v", err)
	}
	for _, e := range entries {
		if strings.Contains(e.Name(), deploy.BackupSuffix) {
			t.Errorf("backup created on idempotent skip: %s", e.Name())
		}
	}
}
// TestEnvoy_ValidateOnly_Sentinel checks that ValidateOnly, called with an
// empty request, returns an error matching target.ErrValidateOnlyNotSupported
// under errors.Is.
func TestEnvoy_ValidateOnly_Sentinel(t *testing.T) {
	conn := envoy.New(&envoy.Config{
		CertDir:      t.TempDir(),
		CertFilename: "cert.pem",
		KeyFilename:  "key.pem",
	}, newTestLogger())

	err := conn.ValidateOnly(context.Background(), target.DeploymentRequest{})
	if errors.Is(err, target.ErrValidateOnlyNotSupported) {
		return
	}
	t.Errorf("got %v", err)
}
// ---------------------------------------------------------------------------
// Bundle 3 (deployment-target audit 2026-05-02): SDS atomicity + post-deploy
// watcher pickup confirmation.
// ---------------------------------------------------------------------------
// certPEMFingerprint mirrors envoy.certPEMToFingerprint (which is package-
// private). Computes SHA-256 of the first PEM block's DER bytes; matches what
// tlsprobe.CertFingerprint emits for a served leaf cert.
func certPEMFingerprint(t *testing.T, pemBytes string) string {
	t.Helper()

	const (
		header = "-----BEGIN CERTIFICATE-----"
		footer = "-----END CERTIFICATE-----"
	)

	// Everything after the first BEGIN marker ...
	_, afterHeader, ok := strings.Cut(pemBytes, header)
	if !ok {
		t.Fatalf("no CERTIFICATE block in PEM")
	}
	// ... and before the first END marker that follows it is the payload.
	payload, _, ok := strings.Cut(afterHeader, footer)
	if !ok {
		t.Fatalf("no END CERTIFICATE in PEM")
	}

	// Trim surrounding whitespace, then drop embedded LF/CR so line-wrapped
	// base64 decodes as a single run.
	payload = strings.NewReplacer("\n", "", "\r", "").Replace(strings.TrimSpace(payload))

	der, err := base64.StdEncoding.DecodeString(payload)
	if err != nil {
		t.Fatalf("base64: %v", err)
	}

	// Lowercase hex of the SHA-256 digest over the decoded bytes.
	digest := sha256.Sum256(der)
	return hex.EncodeToString(digest[:])
}
// TestEnvoy_Atomic_SDSConfigWriteIsAtomic pins the wiring change at envoy.go's
// writeSDSConfig — pre-Bundle-3 the SDS JSON went through os.WriteFile (no
// backup, torn-write hazard). Post-fix it goes through deploy.AtomicWriteFile,
// which produces a sibling backup with deploy.BackupSuffix when an existing
// SDS JSON is replaced.
func TestEnvoy_Atomic_SDSConfigWriteIsAtomic(t *testing.T) {
	dir := t.TempDir()
	sdsPath := filepath.Join(dir, "sds.json")

	// Pre-write a sentinel SDS JSON so the connector's write replaces an
	// existing file and leaves a backup we can assert on.
	if err := os.WriteFile(sdsPath, []byte(`{"resources":[{"name":"old"}]}`), 0644); err != nil {
		t.Fatalf("seed sds: %v", err)
	}

	cfg := envoy.Config{
		CertDir:      dir,
		CertFilename: "cert.pem",
		KeyFilename:  "key.pem",
		SDSConfig:    true,
	}
	c := envoy.New(&cfg, newTestLogger())

	res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
		CertPEM: certA,
		KeyPEM:  keyA,
	})
	// The nil check keeps the guard consistent with the failure message
	// below, which already tolerates a nil result.
	if err != nil || res == nil || !res.Success {
		t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
	}

	// SDS JSON should be the new bytes (i.e. NOT match the sentinel).
	got, err := os.ReadFile(sdsPath)
	if err != nil {
		t.Fatalf("read sds: %v", err)
	}
	if strings.Contains(string(got), `"old"`) {
		t.Errorf("SDS JSON not replaced; still contains sentinel")
	}
	if !strings.Contains(string(got), "server_cert") {
		t.Errorf("SDS JSON missing expected resource name; got %s", string(got))
	}

	// Replacing an existing sds.json is expected to leave a sibling whose
	// name starts with "sds.json" + deploy.BackupSuffix.
	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatalf("read dir: %v", err)
	}
	backupPrefix := "sds.json" + deploy.BackupSuffix
	foundBak := false
	for _, e := range entries {
		if strings.HasPrefix(e.Name(), backupPrefix) {
			foundBak = true
			break
		}
	}
	if !foundBak {
		t.Errorf("no SDS JSON backup created — atomic-write wiring missing? entries=%v", entryNames(entries))
	}
}
// TestEnvoy_Atomic_WatcherPickupRetries pins the retry/backoff loop in the
// post-deploy verify path. Stub the probe so attempts 1+2 return the wrong
// fingerprint and attempt 3 returns the correct one — DeployCertificate must
// succeed and the probe must have been called exactly 3 times.
func TestEnvoy_Atomic_WatcherPickupRetries(t *testing.T) {
dir := t.TempDir()
cfg := envoy.Config{
CertDir: dir,
CertFilename: "cert.pem",
KeyFilename: "key.pem",
PostDeployVerify: &envoy.PostDeployVerifyConfig{
Enabled: true,
Endpoint: "envoy.test.invalid:443",
Timeout: 100 * time.Millisecond,
},
PostDeployVerifyAttempts: 3,
PostDeployVerifyBackoff: time.Millisecond, // tight loop for tests
}
c := envoy.New(&cfg, newTestLogger())
want := certPEMFingerprint(t, certA)
var calls atomic.Int64
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
n := calls.Add(1)
if n < 3 {
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
}
return tlsprobe.ProbeResult{Success: true, Fingerprint: want}
})
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
CertPEM: certA,
KeyPEM: keyA,
})
if err != nil {
t.Fatalf("deploy returned error after retries should have succeeded: %v", err)
}
if !res.Success {
t.Fatalf("deploy.Success=false; message=%s", res.Message)
}
if got := calls.Load(); got != 3 {
t.Errorf("probe called %d times, want 3", got)
}
}
// TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack pins the verify-
// failure rollback path. Pre-write sentinel cert + key; stub probe to always
// return the wrong fingerprint; assert DeployCertificate returns a wrapped
// error AND the destination files contain the sentinel bytes (restored from
// backups).
func TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack(t *testing.T) {
	dir := t.TempDir()
	certPath := filepath.Join(dir, "cert.pem")
	keyPath := filepath.Join(dir, "key.pem")
	sentCert := []byte("SENTINEL-CERT-BYTES")
	sentKey := []byte("SENTINEL-KEY-BYTES")
	if err := os.WriteFile(certPath, sentCert, 0644); err != nil {
		t.Fatalf("seed cert: %v", err)
	}
	if err := os.WriteFile(keyPath, sentKey, 0600); err != nil {
		t.Fatalf("seed key: %v", err)
	}

	cfg := envoy.Config{
		CertDir:      dir,
		CertFilename: "cert.pem",
		KeyFilename:  "key.pem",
		PostDeployVerify: &envoy.PostDeployVerifyConfig{
			Enabled:  true,
			Endpoint: "envoy.test.invalid:443",
			Timeout:  100 * time.Millisecond,
		},
		PostDeployVerifyAttempts: 2,
		PostDeployVerifyBackoff:  time.Millisecond,
	}
	c := envoy.New(&cfg, newTestLogger())
	// Every probe reports a fingerprint that cannot match certA.
	c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
		return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
	})

	res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
		CertPEM: certA,
		KeyPEM:  keyA,
	})
	if err == nil {
		t.Fatalf("expected verify-mismatch error, got nil; res=%+v", res)
	}
	// The assertions below read fields of res on the error path; fail
	// cleanly instead of panicking if no result accompanies the error.
	if res == nil {
		t.Fatalf("expected a non-nil result alongside the verify error; err=%v", err)
	}
	if res.Success {
		t.Errorf("expected Success=false on verify failure")
	}
	if !strings.Contains(strings.ToLower(res.Message), "verify") {
		t.Errorf("expected message to mention verify; got %q", res.Message)
	}

	// Both files must be restored to sentinel bytes.
	restored := []struct {
		label string
		path  string
		want  []byte
	}{
		{"cert", certPath, sentCert},
		{"key", keyPath, sentKey},
	}
	for _, f := range restored {
		got, err := os.ReadFile(f.path)
		if err != nil {
			t.Errorf("read %s after rollback: %v", f.label, err)
			continue
		}
		if string(got) != string(f.want) {
			t.Errorf("%s not restored on rollback; got %q want %q", f.label, string(got), string(f.want))
		}
	}
}
// TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault pins the opt-in default.
// A Config with no PostDeployVerify set must NOT call the probe — preserving
// pre-Bundle-3 behaviour for callers that don't opt in.
func TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault(t *testing.T) {
	dir := t.TempDir()
	cfg := envoy.Config{
		CertDir:      dir,
		CertFilename: "cert.pem",
		KeyFilename:  "key.pem",
		// PostDeployVerify intentionally nil.
	}
	c := envoy.New(&cfg, newTestLogger())

	// Count every probe invocation; the stub reports failure so an
	// unexpected call would also be visible in the deploy outcome.
	var calls atomic.Int64
	c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
		calls.Add(1)
		return tlsprobe.ProbeResult{Success: false, Error: "probe should not be called"}
	})

	res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
		CertPEM: certA,
		KeyPEM:  keyA,
	})
	// The nil check keeps the guard consistent with the failure message,
	// which already tolerates a nil result.
	if err != nil || res == nil || !res.Success {
		t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
	}
	if got := calls.Load(); got != 0 {
		t.Errorf("probe called %d times when PostDeployVerify is nil; want 0", got)
	}
}
// entryNames is a tiny helper for log-friendly directory listings in test
// failure messages.
func entryNames(entries []os.DirEntry) []string {
	// One slot per entry, filled by index; the result is non-nil even when
	// entries is empty.
	out := make([]string, len(entries))
	for i := range entries {
		out[i] = entries[i].Name()
	}
	return out
}
// TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial,
// 80ms cap; expected gaps 10ms, 20ms, 40ms.
func TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
dir := t.TempDir()
cfg := envoy.Config{
CertDir: dir,
CertFilename: "cert.pem",
KeyFilename: "key.pem",
PostDeployVerify: &envoy.PostDeployVerifyConfig{
Enabled: true,
Endpoint: "envoy.test.invalid:443",
Timeout: 100 * time.Millisecond,
},
PostDeployVerifyAttempts: 4,
PostDeployVerifyBackoff: 10 * time.Millisecond,
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
}
c := envoy.New(&cfg, newTestLogger())
want := certPEMFingerprint(t, certA)
var callTimes []time.Time
calls := atomic.Int64{}
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
callTimes = append(callTimes, time.Now())
n := calls.Add(1)
if n == 4 {
return tlsprobe.ProbeResult{Success: true, Fingerprint: want}
}
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
})
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
CertPEM: certA, KeyPEM: keyA,
})
if err != nil {
t.Fatalf("DeployCertificate failed: %v", err)
}
if !res.Success {
t.Fatal("expected Success=true")
}
if len(callTimes) != 4 {
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
}
const tolerance = 25 * time.Millisecond
expectedGaps := []time.Duration{
10 * time.Millisecond,
20 * time.Millisecond,
40 * time.Millisecond,
}
for i := 0; i < len(expectedGaps); i++ {
gap := callTimes[i+1].Sub(callTimes[i])
expected := expectedGaps[i]
if gap < expected-tolerance || gap > expected+tolerance {
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
}
}
}