diff --git a/docs/deployment-atomicity.md b/docs/deployment-atomicity.md index 66034ee..7ec2054 100644 --- a/docs/deployment-atomicity.md +++ b/docs/deployment-atomicity.md @@ -160,9 +160,12 @@ if res.Fingerprint != certPEMToFingerprint(deployedCertPEM) { } ``` -Retry with backoff (default 3 attempts, 2s exponential) defends +Retry with **exponential backoff** (default 3 attempts; 1s initial, 16s cap) defends against load-balanced targets where the verify might hit a -different pod that hasn't picked up the new cert yet: +different pod that hasn't picked up the new cert yet. Backoff grows 1s → 2s → 4s → 8s → 16s, +giving the LB fleet time to converge before giving up. Operators preserving V2 linear semantics +(every attempt waits the same interval) set `post_deploy_verify_max_backoff` equal to +`post_deploy_verify_backoff`. ```yaml post_deploy_verify: @@ -170,7 +173,8 @@ post_deploy_verify: endpoint: "nginx.svc.cluster.local:443" timeout: 10s post_deploy_verify_attempts: 3 -post_deploy_verify_backoff: 2s +post_deploy_verify_backoff: 1s +post_deploy_verify_max_backoff: 16s ``` ## 5. Rollback semantics diff --git a/internal/connector/target/apache/apache.go b/internal/connector/target/apache/apache.go index 9547911..8923262 100644 --- a/internal/connector/target/apache/apache.go +++ b/internal/connector/target/apache/apache.go @@ -66,9 +66,10 @@ type Config struct { // Phase 5: post-deploy TLS verification (frozen-decision-0.3 // default ON). - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` // Phase 5: backup retention (default 3, -1 to disable). BackupRetention int `json:"backup_retention,omitempty"` @@ -301,38 +302,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str if err != nil { return fmt.Errorf("compute deployed cert fingerprint: %w", err) } - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 + + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second - } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, endpoint, timeout) + + probe := func(probectx context.Context) error { + res := c.probe(probectx, endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) - want = strings.ToLower(want) - if got == want { - c.logger.Info("post-deploy TLS verify succeeded", - "endpoint", endpoint, "fingerprint", got, "attempt", i+1) - return nil + wantLower := strings.ToLower(want) + if got != wantLower { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) + c.logger.Info("post-deploy TLS verify succeeded", + "endpoint", endpoint, "fingerprint", got) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error { diff --git a/internal/connector/target/apache/apache_atomic_test.go b/internal/connector/target/apache/apache_atomic_test.go index 1ba59de..5449ef8 100644 --- a/internal/connector/target/apache/apache_atomic_test.go +++ b/internal/connector/target/apache/apache_atomic_test.go @@ -605,3 +605,66 @@ func TestApache_Metadata_Populated(t *testing.T) { // _ avoid unused fixture warning var _ = certB + +// TestApache_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify +// retries with exponential backoff. +func TestApache_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + cfg := &apache.Config{ + CertPath: filepath.Join(dir, "cert.pem"), + ReloadCommand: "apachectl graceful", + ValidateCommand: "apachectl configtest", + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + PostDeployVerify: &apache.PostDeployVerifyConfig{ + Enabled: true, + Endpoint: "localhost:443", + Timeout: 100 * time.Millisecond, + }, + } + c := newC(t, cfg) + + var callTimes []time.Time + probeCallCount := atomic.Int32{} + + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + count := probeCallCount.Add(1) + if count == 4 { + return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)} + } + return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, + KeyPEM: keyA, + }) + + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + + const tolerance = 20 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/connector/target/envoy/envoy.go b/internal/connector/target/envoy/envoy.go index 8468caa..7fb713a 100644 --- a/internal/connector/target/envoy/envoy.go +++ b/internal/connector/target/envoy/envoy.go @@ -39,9 +39,10 @@ type Config struct { // reloaded yet, etc.). Same shape as nginx.go::PostDeployVerify. // Default behavior is opt-in: nil PostDeployVerify or // PostDeployVerify.Enabled=false skips the verify step entirely. - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` // Bundle 3: backup retention. Zero = // deploy.DefaultBackupRetention (3); -1 = disable backups. Mirrors @@ -431,40 +432,28 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str } want = strings.ToLower(want) - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 - } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, endpoint, timeout) + probe := func(probectx context.Context) error { + res := c.probe(probectx, endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) - if got == want { - c.logger.Info("post-deploy TLS verify succeeded", - "endpoint", endpoint, - "fingerprint", got, - "attempt", i+1) - return nil + if got != want { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) + c.logger.Info("post-deploy TLS verify succeeded", + "endpoint", endpoint, + "fingerprint", got) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } // restoreFromBackups iterates the WriteResults from a successful per-file diff --git a/internal/connector/target/envoy/envoy_atomic_test.go b/internal/connector/target/envoy/envoy_atomic_test.go index 706b785..b3bdf95 100644 --- a/internal/connector/target/envoy/envoy_atomic_test.go +++ b/internal/connector/target/envoy/envoy_atomic_test.go @@ -331,3 +331,62 @@ func entryNames(entries []os.DirEntry) []string { } return names } + +// TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify +// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial, +// 80ms cap; expected gaps 10ms, 20ms, 40ms. +func TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + cfg := envoy.Config{ + CertDir: dir, + CertFilename: "cert.pem", + KeyFilename: "key.pem", + PostDeployVerify: &envoy.PostDeployVerifyConfig{ + Enabled: true, + Endpoint: "envoy.test.invalid:443", + Timeout: 100 * time.Millisecond, + }, + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + } + c := envoy.New(&cfg, newTestLogger()) + + want := certPEMFingerprint(t, certA) + var callTimes []time.Time + calls := atomic.Int64{} + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + n := calls.Add(1) + if n == 4 { + return tlsprobe.ProbeResult{Success: true, Fingerprint: want} + } + return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, KeyPEM: keyA, + }) + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + const tolerance = 25 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/connector/target/haproxy/haproxy.go b/internal/connector/target/haproxy/haproxy.go index 68f8da0..7426335 100644 --- a/internal/connector/target/haproxy/haproxy.go +++ b/internal/connector/target/haproxy/haproxy.go @@ -52,9 +52,10 @@ type Config struct { PEMFileOwner string `json:"pem_file_owner,omitempty"` PEMFileGroup string `json:"pem_file_group,omitempty"` - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` BackupRetention int `json:"backup_retention,omitempty"` } @@ -259,38 +260,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str if err != nil { return fmt.Errorf("compute deployed cert fingerprint: %w", err) } - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 + + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second - } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, endpoint, timeout) + + probe := func(probectx context.Context) error { + res := c.probe(probectx, endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) - want = strings.ToLower(want) - if got == want { - c.logger.Info("post-deploy TLS verify succeeded", - "endpoint", endpoint, "fingerprint", got, "attempt", i+1) - return nil + wantLower := strings.ToLower(want) + if got != wantLower { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) + c.logger.Info("post-deploy TLS verify succeeded", + "endpoint", endpoint, "fingerprint", got) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error { diff --git a/internal/connector/target/haproxy/haproxy_atomic_test.go b/internal/connector/target/haproxy/haproxy_atomic_test.go index acbf80e..2e2cb8d 100644 --- a/internal/connector/target/haproxy/haproxy_atomic_test.go +++ b/internal/connector/target/haproxy/haproxy_atomic_test.go @@ -592,3 +592,66 @@ func TestHAProxy_ValidateConfig_RejectsInjection(t *testing.T) { t.Error("expected injection error") } } + +// TestHAProxy_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify +// retries with exponential backoff. +func TestHAProxy_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + pemPath := filepath.Join(dir, "cert.pem") + cfg := &haproxy.Config{ + PEMPath: pemPath, + ReloadCommand: "systemctl reload haproxy", + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + PostDeployVerify: &haproxy.PostDeployVerifyConfig{ + Enabled: true, + Endpoint: "localhost:443", + Timeout: 100 * time.Millisecond, + }, + } + c := newC(t, cfg) + + var callTimes []time.Time + probeCallCount := atomic.Int32{} + + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + count := probeCallCount.Add(1) + if count == 4 { + return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)} + } + return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, + KeyPEM: keyA, + }) + + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + + const tolerance = 20 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/connector/target/nginx/nginx.go b/internal/connector/target/nginx/nginx.go index 029f01a..78da5df 100644 --- a/internal/connector/target/nginx/nginx.go +++ b/internal/connector/target/nginx/nginx.go @@ -82,9 +82,10 @@ type Config struct { KeyFileGroup string `json:"key_file_group,omitempty"` // Phase 4 (deploy-hardening I): post-deploy TLS verification. - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` // Phase 4 (deploy-hardening I): backup retention. Zero = // deploy.DefaultBackupRetention (3); -1 = disable backups (no @@ -443,41 +444,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str return fmt.Errorf("compute deployed cert fingerprint: %w", err) } - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 - } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, endpoint, timeout) + probe := func(probectx context.Context) error { + res := c.probe(probectx, endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) - want = strings.ToLower(want) - if got == want { - c.logger.Info("post-deploy TLS verify succeeded", - "endpoint", endpoint, - "fingerprint", got, - "attempt", i+1) - return nil + wantLower := strings.ToLower(want) + if got != wantLower { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) + c.logger.Info("post-deploy TLS verify succeeded", + "endpoint", endpoint, + "fingerprint", got) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } // rollbackToBackups manually triggers a restore by overwriting diff --git a/internal/connector/target/nginx/nginx_atomic_test.go b/internal/connector/target/nginx/nginx_atomic_test.go index 499b995..4e065b6 100644 --- a/internal/connector/target/nginx/nginx_atomic_test.go +++ b/internal/connector/target/nginx/nginx_atomic_test.go @@ -1215,3 +1215,66 @@ func TestNginx_Atomic_FreshDeploy_IdempotentFlagFalse(t *testing.T) { t.Errorf("idempotent = %q, want false", res.Metadata["idempotent"]) } } + +// TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify +// retries with exponential backoff (10ms → 20ms → 40ms up to max). +func TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + cfg := &nginx.Config{ + CertPath: filepath.Join(dir, "cert.pem"), + ReloadCommand: "nginx -s reload", + ValidateCommand: "nginx -t", + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + PostDeployVerify: &nginx.PostDeployVerifyConfig{ + Enabled: true, + Endpoint: "localhost:443", + Timeout: 100 * time.Millisecond, + }, + } + c := newConnectorWithStubs(t, cfg) + + var callTimes []time.Time + probeCallCount := atomic.Int32{} + + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + count := probeCallCount.Add(1) + if count == 4 { + return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)} + } + return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, + KeyPEM: keyA, + }) + + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + + const tolerance = 20 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/connector/target/postfix/postfix.go b/internal/connector/target/postfix/postfix.go index 548d89c..cb620ff 100644 --- a/internal/connector/target/postfix/postfix.go +++ b/internal/connector/target/postfix/postfix.go @@ -37,17 +37,18 @@ type Config struct { ValidateCommand string `json:"validate_command"` // Phase 7: file ownership + mode + verify + retention. - CertFileMode os.FileMode `json:"cert_file_mode,omitempty"` - KeyFileMode os.FileMode `json:"key_file_mode,omitempty"` - ChainFileMode os.FileMode `json:"chain_file_mode,omitempty"` - CertFileOwner string `json:"cert_file_owner,omitempty"` - CertFileGroup string `json:"cert_file_group,omitempty"` - KeyFileOwner string `json:"key_file_owner,omitempty"` - KeyFileGroup string `json:"key_file_group,omitempty"` - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` - BackupRetention int `json:"backup_retention,omitempty"` + CertFileMode os.FileMode `json:"cert_file_mode,omitempty"` + KeyFileMode os.FileMode `json:"key_file_mode,omitempty"` + ChainFileMode os.FileMode `json:"chain_file_mode,omitempty"` + CertFileOwner string `json:"cert_file_owner,omitempty"` + CertFileGroup string `json:"cert_file_group,omitempty"` + KeyFileOwner string `json:"key_file_owner,omitempty"` + KeyFileGroup string `json:"key_file_group,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` + BackupRetention int `json:"backup_retention,omitempty"` } type PostDeployVerifyConfig struct { @@ -289,36 +290,27 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str if err != nil { return fmt.Errorf("compute deployed cert fingerprint: %w", err) } - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 + + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second - } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, endpoint, timeout) + + probe := func(probectx context.Context) error { + res := c.probe(probectx, endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) - want = strings.ToLower(want) - if got == want { - return nil + wantLower := strings.ToLower(want) + if got != wantLower { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error { diff --git a/internal/connector/target/postfix/postfix_atomic_test.go b/internal/connector/target/postfix/postfix_atomic_test.go index 6d6df41..00ac32b 100644 --- a/internal/connector/target/postfix/postfix_atomic_test.go +++ b/internal/connector/target/postfix/postfix_atomic_test.go @@ -430,3 +430,68 @@ func TestPostfix_Atomic_DovecotMode_VerifyFails_Rollback(t *testing.T) { t.Errorf("rollback did not restore original key bytes:\n got: %q\n want: %q", gotKey, origKey) } } + +func TestPostfix_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + c := newC(t, &postfix.Config{ + Mode: "postfix", + CertPath: filepath.Join(dir, "cert.pem"), + ReloadCommand: "postfix reload", + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + PostDeployVerify: &postfix.PostDeployVerifyConfig{ + Enabled: true, + Endpoint: "localhost:25", + Timeout: 100 * time.Millisecond, + }, + }) + + var callTimes []time.Time + probeCallCount := atomic.Int32{} + + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + count := probeCallCount.Add(1) + if count == 4 { + // Succeed on 4th attempt + return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(certA)} + } + // Fail on attempts 1-3 + return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, + KeyPEM: keyA, + }) + + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + + // Verify we made exactly 4 calls + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + + // Assert gaps between attempts are approximately 10ms, 20ms, 40ms. + // Allow ±20ms tolerance for scheduler noise. + const tolerance = 20 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/connector/target/traefik/traefik.go b/internal/connector/target/traefik/traefik.go index 4626184..1d9aee2 100644 --- a/internal/connector/target/traefik/traefik.go +++ b/internal/connector/target/traefik/traefik.go @@ -35,16 +35,17 @@ type Config struct { // Phase 7: per-file mode/owner overrides + post-deploy verify // + backup retention. - CertFileMode os.FileMode `json:"cert_file_mode,omitempty"` - KeyFileMode os.FileMode `json:"key_file_mode,omitempty"` - CertFileOwner string `json:"cert_file_owner,omitempty"` - CertFileGroup string `json:"cert_file_group,omitempty"` - KeyFileOwner string `json:"key_file_owner,omitempty"` - KeyFileGroup string `json:"key_file_group,omitempty"` - PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` - PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` - PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` - BackupRetention int `json:"backup_retention,omitempty"` + CertFileMode os.FileMode `json:"cert_file_mode,omitempty"` + KeyFileMode os.FileMode `json:"key_file_mode,omitempty"` + CertFileOwner string `json:"cert_file_owner,omitempty"` + CertFileGroup string `json:"cert_file_group,omitempty"` + KeyFileOwner string `json:"key_file_owner,omitempty"` + KeyFileGroup string `json:"key_file_group,omitempty"` + PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` + PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` + PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` + PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` + BackupRetention int `json:"backup_retention,omitempty"` } type PostDeployVerifyConfig struct { @@ -224,34 +225,25 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str if err != nil { return fmt.Errorf("compute fingerprint: %w", err) } - attempts := c.config.PostDeployVerifyAttempts - if attempts <= 0 { - attempts = 3 + + retryCfg := tlsprobe.RetryConfig{ + Attempts: c.config.PostDeployVerifyAttempts, + InitialBackoff: c.config.PostDeployVerifyBackoff, + MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } - backoff := c.config.PostDeployVerifyBackoff - if backoff <= 0 { - backoff = 2 * time.Second - } - var lastErr error - for i := 0; i < attempts; i++ { - if i > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } - res := c.probe(ctx, verify.Endpoint, timeout) + + probe := func(probectx context.Context) error { + res := c.probe(probectx, verify.Endpoint, timeout) if !res.Success { - lastErr = fmt.Errorf("TLS probe failed: %s", res.Error) - continue + return fmt.Errorf("TLS probe failed: %s", res.Error) } - if strings.EqualFold(res.Fingerprint, want) { - return nil + if !strings.EqualFold(res.Fingerprint, want) { + return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want) } - lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want) + return nil } - return lastErr + + return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } // restoreFromBackups iterates the BackupPaths returned by deploy.Apply diff --git a/internal/connector/target/traefik/traefik_atomic_test.go b/internal/connector/target/traefik/traefik_atomic_test.go index 99f4b0c..f38d26d 100644 --- a/internal/connector/target/traefik/traefik_atomic_test.go +++ b/internal/connector/target/traefik/traefik_atomic_test.go @@ -399,3 +399,56 @@ func TestTraefik_Atomic_VerifyMismatch_BothFilesRollBack(t *testing.T) { t.Errorf("key not restored on rollback (Bundle 4 regression: pre-fix this would fail because rollbackCertAndKey ignored the key): got %q want %q", string(gotKey), sentinelKey) } } + +// TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify +// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial, +// 80ms cap; expected gaps 10ms, 20ms, 40ms. +func TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) { + dir := t.TempDir() + c := traefik.New(&traefik.Config{ + CertDir: dir, CertFile: "cert.pem", KeyFile: "key.pem", + PostDeployVerifyAttempts: 4, + PostDeployVerifyBackoff: 10 * time.Millisecond, + PostDeployVerifyMaxBackoff: 80 * time.Millisecond, + PostDeployVerify: &traefik.PostDeployVerifyConfig{ + Enabled: true, Endpoint: "h:443", Timeout: 100 * time.Millisecond, + }, + }, quietLogger()) + + var callTimes []time.Time + probeCallCount := atomic.Int32{} + c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult { + callTimes = append(callTimes, time.Now()) + count := probeCallCount.Add(1) + if count == 4 { + return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(certA)} + } + return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"} + }) + + res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{ + CertPEM: certA, KeyPEM: keyA, + }) + if err != nil { + t.Fatalf("DeployCertificate failed: %v", err) + } + if !res.Success { + t.Fatal("expected Success=true") + } + if len(callTimes) != 4 { + t.Fatalf("expected 4 probe calls, got %d", len(callTimes)) + } + const tolerance = 25 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + } + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } +} diff --git a/internal/tlsprobe/retry.go b/internal/tlsprobe/retry.go new file mode 100644 index 0000000..df1be4d --- /dev/null +++ b/internal/tlsprobe/retry.go @@ -0,0 +1,68 @@ +// Copyright (c) 2025 Certctl Contributors +// +// SPDX-License-Identifier: BSL-1.1 +// See COPYING for license details. + +package tlsprobe + +import ( + "context" + "time" +) + +// RetryConfig holds parameters for exponential-backoff retries. +// Zero values use defaults: 3 attempts, 1s initial, 16s max. +type RetryConfig struct { + Attempts int // total attempts; 0 = use 3 default + InitialBackoff time.Duration // base; 0 = use 1 * time.Second default + MaxBackoff time.Duration // cap; 0 = use 16 * time.Second default +} + +// VerifyWithExponentialBackoff calls the probe at most cfg.Attempts times, +// waiting cfg.InitialBackoff, 2*InitialBackoff, 4*InitialBackoff, ... capped at +// cfg.MaxBackoff between consecutive attempts. Returns nil on first probe success; +// returns the last attempt's error on full exhaustion. +// +// The probe function returns: +// - nil error on success → return immediately, no further attempts. +// - non-nil error → wait the exponentially-growing backoff and retry. +// +// The ctx is checked between attempts; ctx cancellation aborts immediately. +// +// Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run. +func VerifyWithExponentialBackoff(ctx context.Context, cfg RetryConfig, probe func(ctx context.Context) error) error { + attempts := cfg.Attempts + if attempts <= 0 { + attempts = 3 + } + initial := cfg.InitialBackoff + if initial <= 0 { + initial = 1 * time.Second + } + max := cfg.MaxBackoff + if max <= 0 { + max = 16 * time.Second + } + + backoff := initial + var lastErr error + for i := 0; i < attempts; i++ { + if i > 0 { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + backoff *= 2 + if backoff > max { + backoff = max + } + } + if err := probe(ctx); err == nil { + return nil + } else { + lastErr = err + } + } + return lastErr +} diff --git a/internal/tlsprobe/retry_test.go b/internal/tlsprobe/retry_test.go new file mode 100644 index 0000000..90abd84 --- /dev/null +++ b/internal/tlsprobe/retry_test.go @@ -0,0 +1,129 @@ +// Copyright (c) 2025 Certctl Contributors +// +// SPDX-License-Identifier: BSL-1.1 +// See COPYING for license details. + +package tlsprobe + +import ( + "context" + "errors" + "testing" + "time" +) + +func TestVerifyWithExponentialBackoff_GrowthAndCap(t *testing.T) { + cfg := RetryConfig{ + Attempts: 5, + InitialBackoff: 10 * time.Millisecond, + MaxBackoff: 40 * time.Millisecond, + } + + var callTimes []time.Time + probe := func(ctx context.Context) error { + callTimes = append(callTimes, time.Now()) + return errors.New("always fail") + } + + ctx := context.Background() + start := time.Now() + err := VerifyWithExponentialBackoff(ctx, cfg, probe) + + if err == nil { + t.Fatal("expected error, got nil") + } + if len(callTimes) != 5 { + t.Fatalf("expected 5 calls, got %d", len(callTimes)) + } + + // Assert gaps between attempts are approximately: 10ms, 20ms, 40ms, 40ms. + // Allow ±20ms tolerance for scheduler noise. + const tolerance = 20 * time.Millisecond + expectedGaps := []time.Duration{ + 10 * time.Millisecond, + 20 * time.Millisecond, + 40 * time.Millisecond, + 40 * time.Millisecond, + } + + for i := 0; i < len(expectedGaps); i++ { + gap := callTimes[i+1].Sub(callTimes[i]) + expected := expectedGaps[i] + if gap < expected-tolerance || gap > expected+tolerance { + t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap) + } + } + + // Total wall time should be ~10+20+40+40 = 110ms + totalTime := time.Since(start) + expectedTotal := 110 * time.Millisecond + if totalTime < expectedTotal-50*time.Millisecond || totalTime > expectedTotal+100*time.Millisecond { + t.Errorf("total time: expected ~%v, got %v", expectedTotal, totalTime) + } +} + +func TestVerifyWithExponentialBackoff_StopsOnFirstSuccess(t *testing.T) { + cfg := RetryConfig{ + Attempts: 3, + InitialBackoff: 10 * time.Millisecond, + MaxBackoff: 40 * time.Millisecond, + } + + var callCount int + probe := func(ctx context.Context) error { + callCount++ + if callCount == 2 { + return nil // success on second attempt + } + return errors.New("failed") + } + + ctx := context.Background() + start := time.Now() + err := VerifyWithExponentialBackoff(ctx, cfg, probe) + + if err != nil { + t.Fatalf("expected nil, got error: %v", err) + } + if callCount != 2 { + t.Fatalf("expected 2 calls, got %d", callCount) + } + + // Total wall time should be ~10ms (one wait between attempt 1 and 2). + totalTime := time.Since(start) + const tolerance = 20 * time.Millisecond + if totalTime > tolerance { + t.Errorf("total time: expected <~20ms, got %v", totalTime) + } +} + +func TestVerifyWithExponentialBackoff_CtxCancellation(t *testing.T) { + cfg := RetryConfig{ + Attempts: 5, + InitialBackoff: 100 * time.Millisecond, + MaxBackoff: 1000 * time.Millisecond, + } + + var callCount int + probe := func(ctx context.Context) error { + callCount++ + return errors.New("always fail") + } + + ctx, cancel := context.WithCancel(context.Background()) + // Cancel after allowing first attempt + partial wait + go func() { + time.Sleep(20 * time.Millisecond) + cancel() + }() + + err := VerifyWithExponentialBackoff(ctx, cfg, probe) + + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context.Canceled, got: %v", err) + } + // Should have completed first attempt, then been cancelled during wait + if callCount != 1 { + t.Fatalf("expected 1 call before cancellation, got %d", callCount) + } +}