tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify

Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit
re-run (see cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md). Pre-fix, every connector's runPostDeployVerify used
linear backoff (default 3 attempts × 2s linear waits). Linear
backoff misbehaves under load-balanced rollouts: the verify
probe hits a random LB-backed pod, and 3 × 2s often falls into
the worst case where match-fingerprint pods stop responding by
attempt 3 due to LB session-stickiness cycles.

This commit:

1. New shared helper internal/tlsprobe/retry.go::
   VerifyWithExponentialBackoff. Defaults: 3 attempts, 1s initial
   backoff, 16s cap. The wait doubles after each failed attempt
   (1s → 2s → 4s → 8s → 16s) until it reaches the cap. The probe
   has the signature func(ctx) error, so connectors can compose
   handshake + fingerprint-compare into one lambda.

2. Each connector's runPostDeployVerify (nginx, apache, haproxy,
   traefik, envoy, postfix, dovecot) rewired to call the
   shared helper. Per-connector signature unchanged.

3. New PostDeployVerifyMaxBackoff time.Duration field added to
   each connector's Config. Operators preserving V2 linear
   behavior set PostDeployVerifyMaxBackoff equal to
   PostDeployVerifyBackoff.

4. Tests:
   - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_
     GrowthAndCap + TestVerifyWithExponentialBackoff_
     StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_
     CtxCancellation.
   - One Test<Connector>_VerifyExponentialBackoff_
     GrowsBetweenAttempts per connector (6 total across
     postfix, nginx, apache, haproxy; traefik and envoy
     connectors use unique test signatures so test wiring
     deferred to future unification).

5. docs/deployment-atomicity.md Section 4 updated:
   'linear backoff' → 'exponential backoff (1s → 16s cap)';
   YAML example shows the new field.

Backward-compat note: PostDeployVerifyBackoff was interpreted as
the linear interval pre-fix; post-fix it is interpreted as the
initial backoff, which doubles after each failed attempt up to
PostDeployVerifyMaxBackoff. The helper waits only between
attempts, so operators using the default value (2s) with 3
attempts see waits of 2s → 4s (6s of backoff in total) instead
of a fixed 2s before each retry. For LB-rollout cases this is
the intended behavior; for single-target deploys the worst-case
wall-clock is slightly longer. Operators preserving V2 linear
semantics: set PostDeployVerifyMaxBackoff equal to
PostDeployVerifyBackoff.

Verified locally:
- gofmt clean.
- go test -short -count=1 ./internal/tlsprobe/...
  ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green.

Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/
RESULTS.md Top-10 fix #8.
This commit is contained in:
shankar0123
2026-05-02 22:53:47 +00:00
parent 85d247455b
commit b8b7e1e3dd
15 changed files with 703 additions and 190 deletions
+68
View File
@@ -0,0 +1,68 @@
// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
//
// SPDX-License-Identifier: BSL-1.1
// See COPYING for license details.
package tlsprobe
import (
"context"
"time"
)
// RetryConfig holds the parameters for VerifyWithExponentialBackoff.
// A field left at zero (or set to a negative value) selects its default:
// 3 attempts, 1s initial backoff, 16s maximum backoff.
type RetryConfig struct {
Attempts int // total number of probe attempts, including the first; <= 0 selects the default of 3
InitialBackoff time.Duration // wait before the second attempt, doubled after each later failure; <= 0 selects 1 * time.Second
MaxBackoff time.Duration // cap for the doubling wait; <= 0 selects 16 * time.Second
}
// VerifyWithExponentialBackoff runs probe until it succeeds or the attempt
// budget is exhausted, sleeping between consecutive attempts.
//
// Waits follow a doubling schedule: InitialBackoff, 2*InitialBackoff,
// 4*InitialBackoff, ... with every wait, including the first, capped at
// MaxBackoff. Non-positive cfg fields fall back to the defaults: 3 attempts,
// 1s initial backoff, 16s cap.
//
// Returns:
//   - nil as soon as probe returns nil; no further attempts are made.
//   - ctx.Err() if ctx is done while waiting between attempts.
//   - the error from the final attempt once every attempt has failed.
//
// ctx is passed to every probe call and is observed during each wait. It is
// not checked before the first attempt, so probe always runs at least once.
//
// Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run.
func VerifyWithExponentialBackoff(ctx context.Context, cfg RetryConfig, probe func(ctx context.Context) error) error {
	attempts := cfg.Attempts
	if attempts <= 0 {
		attempts = 3
	}
	backoff := cfg.InitialBackoff
	if backoff <= 0 {
		backoff = 1 * time.Second
	}
	maxBackoff := cfg.MaxBackoff
	if maxBackoff <= 0 {
		maxBackoff = 16 * time.Second
	}
	// Apply the cap to the very first wait too. Without this an
	// InitialBackoff larger than MaxBackoff would sleep uncapped once
	// before the cap took effect.
	if backoff > maxBackoff {
		backoff = maxBackoff
	}

	var lastErr error
	for i := 0; i < attempts; i++ {
		if i > 0 {
			// Use an explicit timer so it can be released right away when
			// ctx ends the wait early, instead of lingering until it fires.
			timer := time.NewTimer(backoff)
			select {
			case <-ctx.Done():
				timer.Stop()
				return ctx.Err()
			case <-timer.C:
			}
			// Double for the next wait. Comparing against maxBackoff/2
			// gives the same result as min(2*backoff, maxBackoff) without
			// risking overflow in the multiplication.
			if backoff > maxBackoff/2 {
				backoff = maxBackoff
			} else {
				backoff *= 2
			}
		}
		lastErr = probe(ctx)
		if lastErr == nil {
			return nil
		}
	}
	return lastErr
}
+129
View File
@@ -0,0 +1,129 @@
// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
//
// SPDX-License-Identifier: BSL-1.1
// See COPYING for license details.
package tlsprobe
import (
"context"
"errors"
"testing"
"time"
)
// TestVerifyWithExponentialBackoff_GrowthAndCap drives five always-failing
// attempts and checks that the gaps between probe calls double from
// InitialBackoff and then stay at MaxBackoff (10ms, 20ms, 40ms, 40ms).
func TestVerifyWithExponentialBackoff_GrowthAndCap(t *testing.T) {
	cfg := RetryConfig{
		Attempts:       5,
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     40 * time.Millisecond,
	}

	var callTimes []time.Time
	probe := func(ctx context.Context) error {
		callTimes = append(callTimes, time.Now())
		return errors.New("always fail")
	}

	start := time.Now()
	err := VerifyWithExponentialBackoff(context.Background(), cfg, probe)
	totalTime := time.Since(start)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	if len(callTimes) != 5 {
		t.Fatalf("expected 5 calls, got %d", len(callTimes))
	}

	// The bounds are asymmetric on purpose. A timer is not expected to fire
	// before its duration has elapsed, so the lower bound is tight (a small
	// slack covers clock granularity). That tight bound is what proves the
	// gaps grow: a symmetric ±20ms window would accept a 0ms gap where 10ms
	// or 20ms is expected. The upper bound stays loose for scheduler noise,
	// but still rejects an uncapped 80ms fourth gap.
	const (
		earlySlack = 2 * time.Millisecond
		lateSlack  = 20 * time.Millisecond
	)
	expectedGaps := []time.Duration{
		10 * time.Millisecond,
		20 * time.Millisecond,
		40 * time.Millisecond,
		40 * time.Millisecond,
	}
	for i, expected := range expectedGaps {
		gap := callTimes[i+1].Sub(callTimes[i])
		if gap < expected-earlySlack || gap > expected+lateSlack {
			t.Errorf("gap[%d]: expected ~%v, got %v", i, expected, gap)
		}
	}

	// Total wall time should be ~10+20+40+40 = 110ms.
	expectedTotal := 110 * time.Millisecond
	minTotal := expectedTotal - time.Duration(len(expectedGaps))*earlySlack
	maxTotal := expectedTotal + 100*time.Millisecond
	if totalTime < minTotal || totalTime > maxTotal {
		t.Errorf("total time: expected ~%v, got %v", expectedTotal, totalTime)
	}
}
// TestVerifyWithExponentialBackoff_StopsOnFirstSuccess checks that a probe
// succeeding on its second call ends the retry loop: nil is returned, the
// probe is not called a third time, and exactly one backoff was waited.
func TestVerifyWithExponentialBackoff_StopsOnFirstSuccess(t *testing.T) {
	cfg := RetryConfig{
		Attempts:       3,
		InitialBackoff: 10 * time.Millisecond,
		MaxBackoff:     40 * time.Millisecond,
	}

	var callCount int
	probe := func(ctx context.Context) error {
		callCount++
		if callCount == 2 {
			return nil // success on second attempt
		}
		return errors.New("failed")
	}

	start := time.Now()
	err := VerifyWithExponentialBackoff(context.Background(), cfg, probe)
	totalTime := time.Since(start)
	if err != nil {
		t.Fatalf("expected nil, got error: %v", err)
	}
	if callCount != 2 {
		t.Fatalf("expected 2 calls, got %d", callCount)
	}

	// One wait of InitialBackoff separates attempt 1 from attempt 2. The
	// lower bound proves that wait happened. The upper bound is deliberately
	// generous: callCount already proves no third attempt ran, so the ceiling
	// only guards against a grossly wrong wait without flaking on a busy
	// machine.
	const (
		earlySlack = 2 * time.Millisecond
		lateSlack  = 250 * time.Millisecond
	)
	if totalTime < cfg.InitialBackoff-earlySlack {
		t.Errorf("total time: expected at least ~%v, got %v", cfg.InitialBackoff, totalTime)
	}
	if totalTime > cfg.InitialBackoff+lateSlack {
		t.Errorf("total time: expected ~%v, got %v", cfg.InitialBackoff, totalTime)
	}
}
func TestVerifyWithExponentialBackoff_CtxCancellation(t *testing.T) {
cfg := RetryConfig{
Attempts: 5,
InitialBackoff: 100 * time.Millisecond,
MaxBackoff: 1000 * time.Millisecond,
}
var callCount int
probe := func(ctx context.Context) error {
callCount++
return errors.New("always fail")
}
ctx, cancel := context.WithCancel(context.Background())
// Cancel after allowing first attempt + partial wait
go func() {
time.Sleep(20 * time.Millisecond)
cancel()
}()
err := VerifyWithExponentialBackoff(ctx, cfg, probe)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got: %v", err)
}
// Should have completed first attempt, then been cancelled during wait
if callCount != 1 {
t.Fatalf("expected 1 call before cancellation, got %d", callCount)
}
}