mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-13 03:50:22 +00:00
tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify
Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run (see cowork/deployment-target-audit-2026-05-02-rerun/ RESULTS.md). Pre-fix, every connector's runPostDeployVerify used linear backoff (default 3 attempts × 2s linear waits). Linear backoff misbehaves under load-balanced rollouts: the verify probe hits a random LB-backed pod, and 3 × 2s often falls into the worst case where match-fingerprint pods stop responding by attempt 3 due to LB session-stickiness cycles. This commit: 1. New shared helper internal/tlsprobe/retry.go:: VerifyWithExponentialBackoff. Default 3 attempts; 1s initial, 16s cap. Doubling pattern: 1s → 2s → 4s → 8s → 16s. probe func(ctx) error signature so connectors compose handshake + fingerprint-compare into one lambda. 2. Each connector's runPostDeployVerify (nginx, apache, haproxy, traefik, envoy, postfix, dovecot) rewired to call the shared helper. Per-connector signature unchanged. 3. New PostDeployVerifyMaxBackoff time.Duration field added to each connector's Config. Operators preserving V2 linear behavior set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. 4. Tests: - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_ GrowthAndCap + TestVerifyWithExponentialBackoff_ StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_ CtxCancellation. - One Test<Connector>_VerifyExponentialBackoff_ GrowsBetweenAttempts per connector (6 total across postfix, nginx, apache, haproxy; traefik and envoy connectors use unique test signatures so test wiring deferred to future unification). 5. docs/deployment-atomicity.md Section 4 updated: 'linear backoff' → 'exponential backoff (1s → 16s cap)'; YAML example shows the new field. Backward-compat note: PostDeployVerifyBackoff was interpreted as the linear interval pre-fix; post-fix it's interpreted as the initial backoff (which doubles each attempt). Operators using the default value (2s) see waits of 2s → 4s → 8s instead of 2s → 2s → 2s. For LB-rollout cases this is the intended behavior; for single-target deploys the wall-clock is slightly longer (12s vs 6s for 3 attempts). Operators preserving V2 linear semantics: set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. Verified locally: - gofmt clean. - go test -short -count=1 ./internal/tlsprobe/... ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green. Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/ RESULTS.md Top-10 fix #8.
This commit is contained in:
@@ -160,9 +160,12 @@ if res.Fingerprint != certPEMToFingerprint(deployedCertPEM) {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Retry with backoff (default 3 attempts, 2s exponential) defends
|
Retry with **exponential backoff** (default 3 attempts; 1s initial, 16s cap) defends
|
||||||
against load-balanced targets where the verify might hit a
|
against load-balanced targets where the verify might hit a
|
||||||
different pod that hasn't picked up the new cert yet:
|
different pod that hasn't picked up the new cert yet. Backoff grows 1s → 2s → 4s → 8s → 16s,
|
||||||
|
giving the LB fleet time to converge before giving up. Operators preserving V2 linear semantics
|
||||||
|
(every attempt waits the same interval) set `post_deploy_verify_max_backoff` equal to
|
||||||
|
`post_deploy_verify_backoff`.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
post_deploy_verify:
|
post_deploy_verify:
|
||||||
@@ -170,7 +173,8 @@ post_deploy_verify:
|
|||||||
endpoint: "nginx.svc.cluster.local:443"
|
endpoint: "nginx.svc.cluster.local:443"
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
post_deploy_verify_attempts: 3
|
post_deploy_verify_attempts: 3
|
||||||
post_deploy_verify_backoff: 2s
|
post_deploy_verify_backoff: 1s
|
||||||
|
post_deploy_verify_max_backoff: 16s
|
||||||
```
|
```
|
||||||
|
|
||||||
## 5. Rollback semantics
|
## 5. Rollback semantics
|
||||||
|
|||||||
@@ -66,9 +66,10 @@ type Config struct {
|
|||||||
|
|
||||||
// Phase 5: post-deploy TLS verification (frozen-decision-0.3
|
// Phase 5: post-deploy TLS verification (frozen-decision-0.3
|
||||||
// default ON).
|
// default ON).
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
|
||||||
// Phase 5: backup retention (default 3, -1 to disable).
|
// Phase 5: backup retention (default 3, -1 to disable).
|
||||||
BackupRetention int `json:"backup_retention,omitempty"`
|
BackupRetention int `json:"backup_retention,omitempty"`
|
||||||
@@ -301,38 +302,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||||
}
|
}
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
|
||||||
if attempts <= 0 {
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
attempts = 3
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
}
|
}
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
probe := func(probectx context.Context) error {
|
||||||
backoff = 2 * time.Second
|
res := c.probe(probectx, endpoint, timeout)
|
||||||
}
|
|
||||||
var lastErr error
|
|
||||||
for i := 0; i < attempts; i++ {
|
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
got := strings.ToLower(res.Fingerprint)
|
got := strings.ToLower(res.Fingerprint)
|
||||||
want = strings.ToLower(want)
|
wantLower := strings.ToLower(want)
|
||||||
if got == want {
|
if got != wantLower {
|
||||||
c.logger.Info("post-deploy TLS verify succeeded",
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower)
|
||||||
"endpoint", endpoint, "fingerprint", got, "attempt", i+1)
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
c.logger.Info("post-deploy TLS verify succeeded",
|
||||||
|
"endpoint", endpoint, "fingerprint", got)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
||||||
|
|||||||
@@ -605,3 +605,66 @@ func TestApache_Metadata_Populated(t *testing.T) {
|
|||||||
|
|
||||||
// _ avoid unused fixture warning
|
// _ avoid unused fixture warning
|
||||||
var _ = certB
|
var _ = certB
|
||||||
|
|
||||||
|
// TestApache_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||||
|
// retries with exponential backoff.
|
||||||
|
func TestApache_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
cfg := &apache.Config{
|
||||||
|
CertPath: filepath.Join(dir, "cert.pem"),
|
||||||
|
ReloadCommand: "apachectl graceful",
|
||||||
|
ValidateCommand: "apachectl configtest",
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
PostDeployVerify: &apache.PostDeployVerifyConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Endpoint: "localhost:443",
|
||||||
|
Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
c := newC(t, cfg)
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probeCallCount := atomic.Int32{}
|
||||||
|
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
count := probeCallCount.Add(1)
|
||||||
|
if count == 4 {
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)}
|
||||||
|
}
|
||||||
|
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA,
|
||||||
|
KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -39,9 +39,10 @@ type Config struct {
|
|||||||
// reloaded yet, etc.). Same shape as nginx.go::PostDeployVerify.
|
// reloaded yet, etc.). Same shape as nginx.go::PostDeployVerify.
|
||||||
// Default behavior is opt-in: nil PostDeployVerify or
|
// Default behavior is opt-in: nil PostDeployVerify or
|
||||||
// PostDeployVerify.Enabled=false skips the verify step entirely.
|
// PostDeployVerify.Enabled=false skips the verify step entirely.
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
|
||||||
// Bundle 3: backup retention. Zero =
|
// Bundle 3: backup retention. Zero =
|
||||||
// deploy.DefaultBackupRetention (3); -1 = disable backups. Mirrors
|
// deploy.DefaultBackupRetention (3); -1 = disable backups. Mirrors
|
||||||
@@ -431,40 +432,28 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
}
|
}
|
||||||
want = strings.ToLower(want)
|
want = strings.ToLower(want)
|
||||||
|
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
if attempts <= 0 {
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
attempts = 3
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
}
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
|
||||||
backoff = 2 * time.Second
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var lastErr error
|
probe := func(probectx context.Context) error {
|
||||||
for i := 0; i < attempts; i++ {
|
res := c.probe(probectx, endpoint, timeout)
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
got := strings.ToLower(res.Fingerprint)
|
got := strings.ToLower(res.Fingerprint)
|
||||||
if got == want {
|
if got != want {
|
||||||
c.logger.Info("post-deploy TLS verify succeeded",
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
||||||
"endpoint", endpoint,
|
|
||||||
"fingerprint", got,
|
|
||||||
"attempt", i+1)
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
c.logger.Info("post-deploy TLS verify succeeded",
|
||||||
|
"endpoint", endpoint,
|
||||||
|
"fingerprint", got)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
// restoreFromBackups iterates the WriteResults from a successful per-file
|
// restoreFromBackups iterates the WriteResults from a successful per-file
|
||||||
|
|||||||
@@ -331,3 +331,62 @@ func entryNames(entries []os.DirEntry) []string {
|
|||||||
}
|
}
|
||||||
return names
|
return names
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||||
|
// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial,
|
||||||
|
// 80ms cap; expected gaps 10ms, 20ms, 40ms.
|
||||||
|
func TestEnvoy_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
cfg := envoy.Config{
|
||||||
|
CertDir: dir,
|
||||||
|
CertFilename: "cert.pem",
|
||||||
|
KeyFilename: "key.pem",
|
||||||
|
PostDeployVerify: &envoy.PostDeployVerifyConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Endpoint: "envoy.test.invalid:443",
|
||||||
|
Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
}
|
||||||
|
c := envoy.New(&cfg, newTestLogger())
|
||||||
|
|
||||||
|
want := certPEMFingerprint(t, certA)
|
||||||
|
var callTimes []time.Time
|
||||||
|
calls := atomic.Int64{}
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
n := calls.Add(1)
|
||||||
|
if n == 4 {
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: want}
|
||||||
|
}
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA, KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
const tolerance = 25 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -52,9 +52,10 @@ type Config struct {
|
|||||||
PEMFileOwner string `json:"pem_file_owner,omitempty"`
|
PEMFileOwner string `json:"pem_file_owner,omitempty"`
|
||||||
PEMFileGroup string `json:"pem_file_group,omitempty"`
|
PEMFileGroup string `json:"pem_file_group,omitempty"`
|
||||||
|
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
|
||||||
BackupRetention int `json:"backup_retention,omitempty"`
|
BackupRetention int `json:"backup_retention,omitempty"`
|
||||||
}
|
}
|
||||||
@@ -259,38 +260,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||||
}
|
}
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
|
||||||
if attempts <= 0 {
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
attempts = 3
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
}
|
}
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
probe := func(probectx context.Context) error {
|
||||||
backoff = 2 * time.Second
|
res := c.probe(probectx, endpoint, timeout)
|
||||||
}
|
|
||||||
var lastErr error
|
|
||||||
for i := 0; i < attempts; i++ {
|
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
got := strings.ToLower(res.Fingerprint)
|
got := strings.ToLower(res.Fingerprint)
|
||||||
want = strings.ToLower(want)
|
wantLower := strings.ToLower(want)
|
||||||
if got == want {
|
if got != wantLower {
|
||||||
c.logger.Info("post-deploy TLS verify succeeded",
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower)
|
||||||
"endpoint", endpoint, "fingerprint", got, "attempt", i+1)
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
c.logger.Info("post-deploy TLS verify succeeded",
|
||||||
|
"endpoint", endpoint, "fingerprint", got)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
||||||
|
|||||||
@@ -592,3 +592,66 @@ func TestHAProxy_ValidateConfig_RejectsInjection(t *testing.T) {
|
|||||||
t.Error("expected injection error")
|
t.Error("expected injection error")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestHAProxy_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||||
|
// retries with exponential backoff.
|
||||||
|
func TestHAProxy_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
pemPath := filepath.Join(dir, "cert.pem")
|
||||||
|
cfg := &haproxy.Config{
|
||||||
|
PEMPath: pemPath,
|
||||||
|
ReloadCommand: "systemctl reload haproxy",
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
PostDeployVerify: &haproxy.PostDeployVerifyConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Endpoint: "localhost:443",
|
||||||
|
Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
c := newC(t, cfg)
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probeCallCount := atomic.Int32{}
|
||||||
|
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
count := probeCallCount.Add(1)
|
||||||
|
if count == 4 {
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)}
|
||||||
|
}
|
||||||
|
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA,
|
||||||
|
KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -82,9 +82,10 @@ type Config struct {
|
|||||||
KeyFileGroup string `json:"key_file_group,omitempty"`
|
KeyFileGroup string `json:"key_file_group,omitempty"`
|
||||||
|
|
||||||
// Phase 4 (deploy-hardening I): post-deploy TLS verification.
|
// Phase 4 (deploy-hardening I): post-deploy TLS verification.
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
|
||||||
// Phase 4 (deploy-hardening I): backup retention. Zero =
|
// Phase 4 (deploy-hardening I): backup retention. Zero =
|
||||||
// deploy.DefaultBackupRetention (3); -1 = disable backups (no
|
// deploy.DefaultBackupRetention (3); -1 = disable backups (no
|
||||||
@@ -443,41 +444,29 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
if attempts <= 0 {
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
attempts = 3
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
}
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
|
||||||
backoff = 2 * time.Second
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var lastErr error
|
probe := func(probectx context.Context) error {
|
||||||
for i := 0; i < attempts; i++ {
|
res := c.probe(probectx, endpoint, timeout)
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
got := strings.ToLower(res.Fingerprint)
|
got := strings.ToLower(res.Fingerprint)
|
||||||
want = strings.ToLower(want)
|
wantLower := strings.ToLower(want)
|
||||||
if got == want {
|
if got != wantLower {
|
||||||
c.logger.Info("post-deploy TLS verify succeeded",
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower)
|
||||||
"endpoint", endpoint,
|
|
||||||
"fingerprint", got,
|
|
||||||
"attempt", i+1)
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
c.logger.Info("post-deploy TLS verify succeeded",
|
||||||
|
"endpoint", endpoint,
|
||||||
|
"fingerprint", got)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
// rollbackToBackups manually triggers a restore by overwriting
|
// rollbackToBackups manually triggers a restore by overwriting
|
||||||
|
|||||||
@@ -1215,3 +1215,66 @@ func TestNginx_Atomic_FreshDeploy_IdempotentFlagFalse(t *testing.T) {
|
|||||||
t.Errorf("idempotent = %q, want false", res.Metadata["idempotent"])
|
t.Errorf("idempotent = %q, want false", res.Metadata["idempotent"])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||||
|
// retries with exponential backoff (10ms → 20ms → 40ms up to max).
|
||||||
|
func TestNginx_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
cfg := &nginx.Config{
|
||||||
|
CertPath: filepath.Join(dir, "cert.pem"),
|
||||||
|
ReloadCommand: "nginx -s reload",
|
||||||
|
ValidateCommand: "nginx -t",
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
PostDeployVerify: &nginx.PostDeployVerifyConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Endpoint: "localhost:443",
|
||||||
|
Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
c := newConnectorWithStubs(t, cfg)
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probeCallCount := atomic.Int32{}
|
||||||
|
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
count := probeCallCount.Add(1)
|
||||||
|
if count == 4 {
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(t, certA)}
|
||||||
|
}
|
||||||
|
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA,
|
||||||
|
KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -37,17 +37,18 @@ type Config struct {
|
|||||||
ValidateCommand string `json:"validate_command"`
|
ValidateCommand string `json:"validate_command"`
|
||||||
|
|
||||||
// Phase 7: file ownership + mode + verify + retention.
|
// Phase 7: file ownership + mode + verify + retention.
|
||||||
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
|
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
|
||||||
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
|
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
|
||||||
ChainFileMode os.FileMode `json:"chain_file_mode,omitempty"`
|
ChainFileMode os.FileMode `json:"chain_file_mode,omitempty"`
|
||||||
CertFileOwner string `json:"cert_file_owner,omitempty"`
|
CertFileOwner string `json:"cert_file_owner,omitempty"`
|
||||||
CertFileGroup string `json:"cert_file_group,omitempty"`
|
CertFileGroup string `json:"cert_file_group,omitempty"`
|
||||||
KeyFileOwner string `json:"key_file_owner,omitempty"`
|
KeyFileOwner string `json:"key_file_owner,omitempty"`
|
||||||
KeyFileGroup string `json:"key_file_group,omitempty"`
|
KeyFileGroup string `json:"key_file_group,omitempty"`
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
BackupRetention int `json:"backup_retention,omitempty"`
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
BackupRetention int `json:"backup_retention,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type PostDeployVerifyConfig struct {
|
type PostDeployVerifyConfig struct {
|
||||||
@@ -289,36 +290,27 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||||
}
|
}
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
|
||||||
if attempts <= 0 {
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
attempts = 3
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
}
|
}
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
probe := func(probectx context.Context) error {
|
||||||
backoff = 2 * time.Second
|
res := c.probe(probectx, endpoint, timeout)
|
||||||
}
|
|
||||||
var lastErr error
|
|
||||||
for i := 0; i < attempts; i++ {
|
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
got := strings.ToLower(res.Fingerprint)
|
got := strings.ToLower(res.Fingerprint)
|
||||||
want = strings.ToLower(want)
|
wantLower := strings.ToLower(want)
|
||||||
if got == want {
|
if got != wantLower {
|
||||||
return nil
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower)
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error {
|
||||||
|
|||||||
@@ -430,3 +430,68 @@ func TestPostfix_Atomic_DovecotMode_VerifyFails_Rollback(t *testing.T) {
|
|||||||
t.Errorf("rollback did not restore original key bytes:\n got: %q\n want: %q", gotKey, origKey)
|
t.Errorf("rollback did not restore original key bytes:\n got: %q\n want: %q", gotKey, origKey)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestPostfix_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
c := newC(t, &postfix.Config{
|
||||||
|
Mode: "postfix",
|
||||||
|
CertPath: filepath.Join(dir, "cert.pem"),
|
||||||
|
ReloadCommand: "postfix reload",
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
PostDeployVerify: &postfix.PostDeployVerifyConfig{
|
||||||
|
Enabled: true,
|
||||||
|
Endpoint: "localhost:25",
|
||||||
|
Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probeCallCount := atomic.Int32{}
|
||||||
|
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
count := probeCallCount.Add(1)
|
||||||
|
if count == 4 {
|
||||||
|
// Succeed on 4th attempt
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(certA)}
|
||||||
|
}
|
||||||
|
// Fail on attempts 1-3
|
||||||
|
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA,
|
||||||
|
KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify we made exactly 4 calls
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assert gaps between attempts are approximately 10ms, 20ms, 40ms.
|
||||||
|
// Allow ±20ms tolerance for scheduler noise.
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -35,16 +35,17 @@ type Config struct {
|
|||||||
|
|
||||||
// Phase 7: per-file mode/owner overrides + post-deploy verify
|
// Phase 7: per-file mode/owner overrides + post-deploy verify
|
||||||
// + backup retention.
|
// + backup retention.
|
||||||
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
|
CertFileMode os.FileMode `json:"cert_file_mode,omitempty"`
|
||||||
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
|
KeyFileMode os.FileMode `json:"key_file_mode,omitempty"`
|
||||||
CertFileOwner string `json:"cert_file_owner,omitempty"`
|
CertFileOwner string `json:"cert_file_owner,omitempty"`
|
||||||
CertFileGroup string `json:"cert_file_group,omitempty"`
|
CertFileGroup string `json:"cert_file_group,omitempty"`
|
||||||
KeyFileOwner string `json:"key_file_owner,omitempty"`
|
KeyFileOwner string `json:"key_file_owner,omitempty"`
|
||||||
KeyFileGroup string `json:"key_file_group,omitempty"`
|
KeyFileGroup string `json:"key_file_group,omitempty"`
|
||||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||||
BackupRetention int `json:"backup_retention,omitempty"`
|
PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"`
|
||||||
|
BackupRetention int `json:"backup_retention,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type PostDeployVerifyConfig struct {
|
type PostDeployVerifyConfig struct {
|
||||||
@@ -224,34 +225,25 @@ func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM str
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("compute fingerprint: %w", err)
|
return fmt.Errorf("compute fingerprint: %w", err)
|
||||||
}
|
}
|
||||||
attempts := c.config.PostDeployVerifyAttempts
|
|
||||||
if attempts <= 0 {
|
retryCfg := tlsprobe.RetryConfig{
|
||||||
attempts = 3
|
Attempts: c.config.PostDeployVerifyAttempts,
|
||||||
|
InitialBackoff: c.config.PostDeployVerifyBackoff,
|
||||||
|
MaxBackoff: c.config.PostDeployVerifyMaxBackoff,
|
||||||
}
|
}
|
||||||
backoff := c.config.PostDeployVerifyBackoff
|
|
||||||
if backoff <= 0 {
|
probe := func(probectx context.Context) error {
|
||||||
backoff = 2 * time.Second
|
res := c.probe(probectx, verify.Endpoint, timeout)
|
||||||
}
|
|
||||||
var lastErr error
|
|
||||||
for i := 0; i < attempts; i++ {
|
|
||||||
if i > 0 {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-time.After(backoff):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res := c.probe(ctx, verify.Endpoint, timeout)
|
|
||||||
if !res.Success {
|
if !res.Success {
|
||||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
return fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
if strings.EqualFold(res.Fingerprint, want) {
|
if !strings.EqualFold(res.Fingerprint, want) {
|
||||||
return nil
|
return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want)
|
||||||
}
|
}
|
||||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", res.Fingerprint, want)
|
return nil
|
||||||
}
|
}
|
||||||
return lastErr
|
|
||||||
|
return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe)
|
||||||
}
|
}
|
||||||
|
|
||||||
// restoreFromBackups iterates the BackupPaths returned by deploy.Apply
|
// restoreFromBackups iterates the BackupPaths returned by deploy.Apply
|
||||||
|
|||||||
@@ -399,3 +399,56 @@ func TestTraefik_Atomic_VerifyMismatch_BothFilesRollBack(t *testing.T) {
|
|||||||
t.Errorf("key not restored on rollback (Bundle 4 regression: pre-fix this would fail because rollbackCertAndKey ignored the key): got %q want %q", string(gotKey), sentinelKey)
|
t.Errorf("key not restored on rollback (Bundle 4 regression: pre-fix this would fail because rollbackCertAndKey ignored the key): got %q want %q", string(gotKey), sentinelKey)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts: post-deploy verify
|
||||||
|
// retries with exponential backoff (Top-10 fix #8). 4 attempts, 10ms initial,
|
||||||
|
// 80ms cap; expected gaps 10ms, 20ms, 40ms.
|
||||||
|
func TestTraefik_VerifyExponentialBackoff_GrowsBetweenAttempts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
c := traefik.New(&traefik.Config{
|
||||||
|
CertDir: dir, CertFile: "cert.pem", KeyFile: "key.pem",
|
||||||
|
PostDeployVerifyAttempts: 4,
|
||||||
|
PostDeployVerifyBackoff: 10 * time.Millisecond,
|
||||||
|
PostDeployVerifyMaxBackoff: 80 * time.Millisecond,
|
||||||
|
PostDeployVerify: &traefik.PostDeployVerifyConfig{
|
||||||
|
Enabled: true, Endpoint: "h:443", Timeout: 100 * time.Millisecond,
|
||||||
|
},
|
||||||
|
}, quietLogger())
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probeCallCount := atomic.Int32{}
|
||||||
|
c.SetTestProbe(func(_ context.Context, _ string, _ time.Duration) tlsprobe.ProbeResult {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
count := probeCallCount.Add(1)
|
||||||
|
if count == 4 {
|
||||||
|
return tlsprobe.ProbeResult{Success: true, Fingerprint: fingerprintOfPEM(certA)}
|
||||||
|
}
|
||||||
|
return tlsprobe.ProbeResult{Success: false, Error: "cert not yet deployed"}
|
||||||
|
})
|
||||||
|
|
||||||
|
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||||
|
CertPEM: certA, KeyPEM: keyA,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DeployCertificate failed: %v", err)
|
||||||
|
}
|
||||||
|
if !res.Success {
|
||||||
|
t.Fatal("expected Success=true")
|
||||||
|
}
|
||||||
|
if len(callTimes) != 4 {
|
||||||
|
t.Fatalf("expected 4 probe calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
const tolerance = 25 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: BSL-1.1
|
||||||
|
// See COPYING for license details.
|
||||||
|
|
||||||
|
package tlsprobe
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RetryConfig holds parameters for exponential-backoff retries.
|
||||||
|
// Zero values use defaults: 3 attempts, 1s initial, 16s max.
|
||||||
|
type RetryConfig struct {
|
||||||
|
Attempts int // total attempts; 0 = use 3 default
|
||||||
|
InitialBackoff time.Duration // base; 0 = use 1 * time.Second default
|
||||||
|
MaxBackoff time.Duration // cap; 0 = use 16 * time.Second default
|
||||||
|
}
|
||||||
|
|
||||||
|
// VerifyWithExponentialBackoff calls the probe at most cfg.Attempts times,
|
||||||
|
// waiting cfg.InitialBackoff, 2*InitialBackoff, 4*InitialBackoff, ... capped at
|
||||||
|
// cfg.MaxBackoff between consecutive attempts. Returns nil on first probe success;
|
||||||
|
// returns the last attempt's error on full exhaustion.
|
||||||
|
//
|
||||||
|
// The probe function returns:
|
||||||
|
// - nil error on success → return immediately, no further attempts.
|
||||||
|
// - non-nil error → wait the exponentially-growing backoff and retry.
|
||||||
|
//
|
||||||
|
// The ctx is checked between attempts; ctx cancellation aborts immediately.
|
||||||
|
//
|
||||||
|
// Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run.
|
||||||
|
func VerifyWithExponentialBackoff(ctx context.Context, cfg RetryConfig, probe func(ctx context.Context) error) error {
|
||||||
|
attempts := cfg.Attempts
|
||||||
|
if attempts <= 0 {
|
||||||
|
attempts = 3
|
||||||
|
}
|
||||||
|
initial := cfg.InitialBackoff
|
||||||
|
if initial <= 0 {
|
||||||
|
initial = 1 * time.Second
|
||||||
|
}
|
||||||
|
max := cfg.MaxBackoff
|
||||||
|
if max <= 0 {
|
||||||
|
max = 16 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
backoff := initial
|
||||||
|
var lastErr error
|
||||||
|
for i := 0; i < attempts; i++ {
|
||||||
|
if i > 0 {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(backoff):
|
||||||
|
}
|
||||||
|
backoff *= 2
|
||||||
|
if backoff > max {
|
||||||
|
backoff = max
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := probe(ctx); err == nil {
|
||||||
|
return nil
|
||||||
|
} else {
|
||||||
|
lastErr = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return lastErr
|
||||||
|
}
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: BSL-1.1
|
||||||
|
// See COPYING for license details.
|
||||||
|
|
||||||
|
package tlsprobe
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestVerifyWithExponentialBackoff_GrowthAndCap(t *testing.T) {
|
||||||
|
cfg := RetryConfig{
|
||||||
|
Attempts: 5,
|
||||||
|
InitialBackoff: 10 * time.Millisecond,
|
||||||
|
MaxBackoff: 40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
var callTimes []time.Time
|
||||||
|
probe := func(ctx context.Context) error {
|
||||||
|
callTimes = append(callTimes, time.Now())
|
||||||
|
return errors.New("always fail")
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
start := time.Now()
|
||||||
|
err := VerifyWithExponentialBackoff(ctx, cfg, probe)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error, got nil")
|
||||||
|
}
|
||||||
|
if len(callTimes) != 5 {
|
||||||
|
t.Fatalf("expected 5 calls, got %d", len(callTimes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assert gaps between attempts are approximately: 10ms, 20ms, 40ms, 40ms.
|
||||||
|
// Allow ±20ms tolerance for scheduler noise.
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
expectedGaps := []time.Duration{
|
||||||
|
10 * time.Millisecond,
|
||||||
|
20 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(expectedGaps); i++ {
|
||||||
|
gap := callTimes[i+1].Sub(callTimes[i])
|
||||||
|
expected := expectedGaps[i]
|
||||||
|
if gap < expected-tolerance || gap > expected+tolerance {
|
||||||
|
t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total wall time should be ~10+20+40+40 = 110ms
|
||||||
|
totalTime := time.Since(start)
|
||||||
|
expectedTotal := 110 * time.Millisecond
|
||||||
|
if totalTime < expectedTotal-50*time.Millisecond || totalTime > expectedTotal+100*time.Millisecond {
|
||||||
|
t.Errorf("total time: expected ~%v, got %v", expectedTotal, totalTime)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestVerifyWithExponentialBackoff_StopsOnFirstSuccess(t *testing.T) {
|
||||||
|
cfg := RetryConfig{
|
||||||
|
Attempts: 3,
|
||||||
|
InitialBackoff: 10 * time.Millisecond,
|
||||||
|
MaxBackoff: 40 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
var callCount int
|
||||||
|
probe := func(ctx context.Context) error {
|
||||||
|
callCount++
|
||||||
|
if callCount == 2 {
|
||||||
|
return nil // success on second attempt
|
||||||
|
}
|
||||||
|
return errors.New("failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
start := time.Now()
|
||||||
|
err := VerifyWithExponentialBackoff(ctx, cfg, probe)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("expected nil, got error: %v", err)
|
||||||
|
}
|
||||||
|
if callCount != 2 {
|
||||||
|
t.Fatalf("expected 2 calls, got %d", callCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total wall time should be ~10ms (one wait between attempt 1 and 2).
|
||||||
|
totalTime := time.Since(start)
|
||||||
|
const tolerance = 20 * time.Millisecond
|
||||||
|
if totalTime > tolerance {
|
||||||
|
t.Errorf("total time: expected <~20ms, got %v", totalTime)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestVerifyWithExponentialBackoff_CtxCancellation(t *testing.T) {
|
||||||
|
cfg := RetryConfig{
|
||||||
|
Attempts: 5,
|
||||||
|
InitialBackoff: 100 * time.Millisecond,
|
||||||
|
MaxBackoff: 1000 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
var callCount int
|
||||||
|
probe := func(ctx context.Context) error {
|
||||||
|
callCount++
|
||||||
|
return errors.New("always fail")
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
// Cancel after allowing first attempt + partial wait
|
||||||
|
go func() {
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
err := VerifyWithExponentialBackoff(ctx, cfg, probe)
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("expected context.Canceled, got: %v", err)
|
||||||
|
}
|
||||||
|
// Should have completed first attempt, then been cancelled during wait
|
||||||
|
if callCount != 1 {
|
||||||
|
t.Fatalf("expected 1 call before cancellation, got %d", callCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user