mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-14 08:39:15 +00:00
tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify
Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run (see cowork/deployment-target-audit-2026-05-02-rerun/RESULTS.md). Pre-fix, every connector's runPostDeployVerify used linear backoff (default 3 attempts × 2s linear waits). Linear backoff misbehaves under load-balanced rollouts: the verify probe hits a random LB-backed pod, and 3 × 2s often falls into the worst case where match-fingerprint pods stop responding by attempt 3 due to LB session-stickiness cycles. This commit: 1. New shared helper internal/tlsprobe/retry.go::VerifyWithExponentialBackoff. Default 3 attempts; 1s initial, 16s cap. Doubling pattern: 1s → 2s → 4s → 8s → 16s. The probe has a func(ctx) error signature so connectors can compose handshake + fingerprint-compare into one lambda. 2. Each connector's runPostDeployVerify (nginx, apache, haproxy, traefik, envoy, postfix, dovecot) is rewired to call the shared helper. Per-connector signature unchanged. 3. New PostDeployVerifyMaxBackoff time.Duration field added to each connector's Config. Operators who want to preserve V2 linear behavior set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. 4. Tests: - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_GrowthAndCap + TestVerifyWithExponentialBackoff_StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_CtxCancellation. - One Test<Connector>_VerifyExponentialBackoff_GrowsBetweenAttempts per connector (6 total across postfix, nginx, apache, haproxy; traefik and envoy connectors use unique test signatures, so test wiring is deferred to future unification). 5. docs/deployment-atomicity.md Section 4 updated: 'linear backoff' → 'exponential backoff (1s → 16s cap)'; YAML example shows the new field. Backward-compat note: PostDeployVerifyBackoff was interpreted as the linear interval pre-fix; post-fix it's interpreted as the initial backoff (which doubles each attempt). Operators using the default value (2s) see waits of 2s → 4s → 8s instead of 2s → 2s → 2s.
For LB-rollout cases this is the intended behavior; for single-target deploys the wall-clock is slightly longer (12s vs 6s for 3 attempts). Operators preserving V2 linear semantics: set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. Verified locally: - gofmt clean. - go test -short -count=1 ./internal/tlsprobe/... ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green. Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/RESULTS.md, Top-10 fix #8.
This commit is contained in:
@@ -82,9 +82,10 @@ type Config struct {
|
||||
KeyFileGroup string `json:"key_file_group,omitempty"`
|
||||
|
||||
// Phase 4 (deploy-hardening I): post-deploy TLS verification.
|
||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||
|
||||
// Phase 4 (deploy-hardening I): backup retention. Zero =
|
||||
// deploy.DefaultBackupRetention (3); -1 = disable backups (no
|
||||
@@ -443,41 +444,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||
}
|
||||
|
||||
attempts := c.config.PostDeployVerifyAttempts
|
||||
if attempts <= 0 {
|
||||
attempts = 3
|
||||
}
|
||||
backoff := c.config.PostDeployVerifyBackoff
|
||||
if backoff <= 0 {
|
||||
backoff = 2 * time.Second
|
||||
retryCfg := tlsprobe.RetryConfig{
|
||||
Attempts: c.config.PostDeployVerifyAttempts,
|
||||
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < attempts; i++ {
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(backoff):
|
||||
}
|
||||
}
|
||||
res := c.probe(ctx, endpoint, timeout)
|
||||
probe := func(probectx context.Context) error {
|
||||
res := c.probe(probectx, endpoint, timeout)
|
||||
if !res.Success {
|
||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||
continue
|
||||
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||
}
|
||||
got := strings.ToLower(res.Fingerprint)
|
||||
want = strings.ToLower(want)
|
||||
if got == want {
|
||||
c.logger.Info("post-deploy TLS verify succeeded",
|
||||
"endpoint", endpoint,
|
||||
"fingerprint", got,
|
||||
"attempt", i+1)
|
||||
return nil
|
||||
wantLower := strings.ToLower(want)
|
||||
if got != wantLower {
|
||||
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower)
|
||||
}
|
||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
||||
c.logger.Info("post-deploy TLS verify succeeded",
|
||||
"endpoint", endpoint,
|
||||
"fingerprint", got)
|
||||
return nil
|
||||
}
|
||||
return lastErr
|
||||
|
||||
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||
}
|
||||
|
||||
// rollbackToBackups manually triggers a restore by overwriting
|
||||
|
||||
@@ -1215,3 +1215,66 @@ func TestNginx_Atomic_FreshDeploy_IdempotentFlagFalse(t *testing.T) {
|
||||
t.Errorf("idempotent = %q, want false", res.Metadata["idempotent"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||
// retries with exponential backoff (10ms → 20ms → 40ms up to max).
|
||||
func TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfg := &nginx.Config{
|
||||
CertPath: filepath.Join(dir, "cert.pem"),
|
||||
ReloadCommand: "nginx -s reload",
|
||||
ValidateCommand: "nginx -t",
|
||||
PostDeployVerifyAttempts: 4,
|
||||
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||
PostDeployVerify: &nginx.PostDeployVerifyConfig{
|
||||
Enabled: true,
|
||||
Endpoint: "localhost:443",
|
||||
Timeout: 100 * time.Millisecond,
|
||||
},
|
||||
}
|
||||
c := newConnectorWithStubs(t, cfg)
|
||||
|
||||
var callTimes []time.Time
|
||||
probeCallCount := atomic.Int32{}
|
||||
|
||||
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||
callTimes = append(callTimes, time.Now())
|
||||
count := probeCallCount.Add(1)
|
||||
if count == 4 {
|
||||
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)}
|
||||
}
|
||||
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||
})
|
||||
|
||||
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||
CertPEM: certA,
|
||||
KeyPEM: keyA,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("DeployCertificate failed: %v", err)
|
||||
}
|
||||
if !res.Success {
|
||||
t.Fatal("expected Success=true")
|
||||
}
|
||||
|
||||
if len(callTimes) != 4 {
|
||||
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||
}
|
||||
|
||||
const tolerance = 20 * time.Millisecond
|
||||
expectedGaps := []time.Duration{
|
||||
10 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
40 * time.Millisecond,
|
||||
}
|
||||
|
||||
for i := 0; i < len(expectedGaps); i++ {
|
||||
gap := callTimes[i+1].Sub(callTimes[i])
|
||||
expected := expectedGaps[i]
|
||||
if gap < expected-tolerance || gap > expected+tolerance {
|
||||
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user