tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify

Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit
re-run (see cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md). Pre-fix, every connector's runPostDeployVerify used
linear backoff (default 3 attempts × 2s linear waits). Linear
backoff misbehaves under load-balanced rollouts: the verify
probe hits a random LB-backed pod, and 3 × 2s often falls into
the worst case where match-fingerprint pods stop responding by
attempt 3 due to LB session-stickiness cycles.

This commit:

1. New shared helper internal/tlsprobe/retry.go::
   VerifyWithExponentialBackoff. Default 3 attempts; 1s initial,
   16s cap. Doubling pattern: 1s → 2s → 4s → 8s → 16s. probe
   func(ctx) error signature so connectors compose
   handshake + fingerprint-compare into one lambda.

2. Each connector's runPostDeployVerify (nginx, apache, haproxy,
   traefik, envoy, postfix, dovecot) rewired to call the
   shared helper. Per-connector signature unchanged.

3. New PostDeployVerifyMaxBackoff time.Duration field added to
   each connector's Config. Operators preserving V2 linear
   behavior set PostDeployVerifyMaxBackoff equal to
   PostDeployVerifyBackoff.

4. Tests:
   - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_
     GrowthAndCap + TestVerifyWithExponentialBackoff_
     StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_
     CtxCancellation.
   - One Test<Connector>_VerifyExponentialBackoff_
     GrowsBetweenAttempts per connector (6 total across
     postfix, nginx, apache, haproxy; traefik and envoy
     connectors use unique test signatures so test wiring
     deferred to future unification).

5. docs/deployment-atomicity.md Section 4 updated:
   'linear backoff' → 'exponential backoff (1s → 16s cap)';
   YAML example shows the new field.

Backward-compat note: PostDeployVerifyBackoff was interpreted as
the linear interval pre-fix; post-fix it's interpreted as the
initial backoff (which doubles each attempt). Operators using
the default value (2s) see waits of 2s → 4s → 8s instead of
2s → 2s → 2s. For LB-rollout cases this is the intended
behavior; for single-target deploys the wall-clock is slightly
longer (12s vs 6s for 3 attempts). Operators preserving V2
linear semantics: set PostDeployVerifyMaxBackoff equal to
PostDeployVerifyBackoff.

Verified locally:
- gofmt clean.
- go test -short -count=1 ./internal/tlsprobe/...
  ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green.

Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md Top-10 fix #8.
This commit is contained in:
shankar0123
2026-05-02 22:53:47 +00:00
parent e05da7d136
commit 7f6bfed03c
15 changed files with 703 additions and 190 deletions
+25 -33
View File
@@ -35,16 +35,17 @@ type Config struct {
// Phase 7: per-file mode/owner overrides + post-deploy verify
// + backup retention.
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
CertFileOwner string `json:"cert_file_owner,omitempty"`
CertFileGroup string `json:"cert_file_group,omitempty"`
KeyFileOwner string `json:"key_file_owner,omitempty"`
KeyFileGroup string `json:"key_file_group,omitempty"`
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
BackupRetention int `json:"backup_retention,omitempty"`
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
CertFileOwner string `json:"cert_file_owner,omitempty"`
CertFileGroup string `json:"cert_file_group,omitempty"`
KeyFileOwner string `json:"key_file_owner,omitempty"`
KeyFileGroup string `json:"key_file_group,omitempty"`
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
BackupRetention int `json:"backup_retention,omitempty"`
}
type PostDeployVerifyConfig struct {
@@ -224,34 +225,25 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
if err != nil {
return fmt.Errorf("compute fingerprint: %w", err)
}
attempts := c.config.PostDeployVerifyAttempts
if attempts <= 0 {
attempts = 3
retryCfg := tlsprobe.RetryConfig{
Attempts: c.config.PostDeployVerifyAttempts,
InitialBackoff: c.config.PostDeployVerifyBackoff,
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
}
backoff := c.config.PostDeployVerifyBackoff
if backoff <= 0 {
backoff = 2 * time.Second
}
var lastErr error
for i := 0; i < attempts; i++ {
if i > 0 {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(backoff):
}
}
res := c.probe(ctx, verify.Endpoint, timeout)
probe := func(probectx context.Context) error {
res := c.probe(probectx, verify.Endpoint, timeout)
if !res.Success {
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
continue
return fmt.Errorf("TLS probe failed: %s", res.Error)
}
if strings.EqualFold(res.Fingerprint, want) {
return nil
if !strings.EqualFold(res.Fingerprint, want) {
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want)
}
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want)
return nil
}
return lastErr
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
}
// restoreFromBackups iterates the BackupPaths returned by deploy.Apply
@@ -399,3 +399,56 @@ func TestTraefik_Atomic_VerifyMismatch_BothFilesRollBack(t *testing.T) {
t.Errorf("key not restored on rollback (Bundle 4 regression: pre-fix this would fail because rollbackCertAndKey ignored the key): got %q want %q", string(gotKey), sentinelKey)
}
}
// TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial,
// 80ms cap; expected gaps 10ms, 20ms, 40ms.
func TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
dir := t.TempDir()
c := traefik.New(&traefik.Config{
CertDir: dir, CertFile: "cert.pem", KeyFile: "key.pem",
PostDeployVerifyAttempts: 4,
PostDeployVerifyBackoff: 10 * time.Millisecond,
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
PostDeployVerify: &traefik.PostDeployVerifyConfig{
Enabled: true, Endpoint: "h:443", Timeout: 100 * time.Millisecond,
},
}, quietLogger())
var callTimes []time.Time
probeCallCount := atomic.Int32{}
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
callTimes = append(callTimes, time.Now())
count := probeCallCount.Add(1)
if count == 4 {
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(certA)}
}
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
})
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
CertPEM: certA, KeyPEM: keyA,
})
if err != nil {
t.Fatalf("DeployCertificate failed: %v", err)
}
if !res.Success {
t.Fatal("expected Success=true")
}
if len(callTimes) != 4 {
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
}
const tolerance = 25 * time.Millisecond
expectedGaps := []time.Duration{
10 * time.Millisecond,
20 * time.Millisecond,
40 * time.Millisecond,
}
for i := 0; i < len(expectedGaps); i++ {
gap := callTimes[i+1].Sub(callTimes[i])
expected := expectedGaps[i]
if gap < expected-tolerance || gap > expected+tolerance {
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
}
}
}