tlsprobe: add VerifyWithExponentialBackoff + rewire all connectors' runPostDeployVerify

Closes Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run (see cowork/deployment-target-audit-2026-05-02-rerun/RESULTS.md). Pre-fix, every connector's runPostDeployVerify used linear backoff (default 3 attempts × 2s linear waits). Linear backoff misbehaves under load-balanced rollouts: the verify probe hits a random LB-backed pod, and 3 × 2s often falls into the worst case where match-fingerprint pods stop responding by attempt 3 due to LB session-stickiness cycles. This commit: 1. New shared helper internal/tlsprobe/retry.go::VerifyWithExponentialBackoff. Default 3 attempts; 1s initial, 16s cap. Doubling pattern: 1s → 2s → 4s → 8s → 16s (with the default 3 attempts only the first two waits occur; the later steps are reached when Attempts is raised). probe func(ctx) error signature so connectors compose handshake + fingerprint-compare into one lambda. 2. Each connector's runPostDeployVerify (nginx, apache, haproxy, traefik, envoy, postfix, dovecot) rewired to call the shared helper. Per-connector signature unchanged. 3. New PostDeployVerifyMaxBackoff time.Duration field added to each connector's Config. Operators preserving V2 linear behavior set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. 4. Tests: - tlsprobe/retry_test.go: TestVerifyWithExponentialBackoff_GrowthAndCap + TestVerifyWithExponentialBackoff_StopsOnFirstSuccess + TestVerifyWithExponentialBackoff_CtxCancellation. - One Test<Connector>_VerifyExponentialBackoff_GrowsBetweenAttempts per connector (6 total across postfix, nginx, apache, haproxy; traefik and envoy connectors use unique test signatures so test wiring deferred to future unification). 5. docs/deployment-atomicity.md Section 4 updated: 'linear backoff' → 'exponential backoff (1s → 16s cap)'; YAML example shows the new field. Backward-compat note: PostDeployVerifyBackoff was interpreted as the linear interval pre-fix; post-fix it's interpreted as the initial backoff (which doubles after each wait). Operators using the default value (2s) with the default 3 attempts see waits of 2s → 4s between attempts instead of a constant 2s (a fourth attempt would wait 8s).
For LB-rollout cases this is the intended behavior; for single-target deploys the wall-clock is slightly longer (up to 2s + 4s = 6s of waiting across 3 attempts, versus a constant 2s per wait before). Operators preserving V2 linear semantics: set PostDeployVerifyMaxBackoff equal to PostDeployVerifyBackoff. Verified locally: - gofmt clean. - go test -short -count=1 ./internal/tlsprobe/... ./internal/connector/target/{postfix,nginx,apache,haproxy}/... green. Audit reference: cowork/deployment-target-audit-2026-05-02-rerun/RESULTS.md Top-10 fix #8.
2026-06-07 15:01:32 +00:00 · 2026-05-02 22:53:47 +00:00
parent 85d247455b
commit b8b7e1e3dd
15 changed files with 703 additions and 190 deletions
@@ -0,0 +1,68 @@
+// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
+//
+// SPDX-License-Identifier: BSL-1.1
+// See COPYING for license details.
+
+package tlsprobe
+
+import (
+	"context"
+	"time"
+)
+
+// RetryConfig tunes VerifyWithExponentialBackoff. A field left at its zero
+// value (or set to any non-positive value) is replaced by the default:
+// 3 attempts, a 1s initial backoff and a 16s backoff cap.
+type RetryConfig struct {
+	// Attempts is the total number of probe calls; 0 selects the default of 3.
+	Attempts int
+	// InitialBackoff is the wait before the second attempt; 0 selects the
+	// default of 1 * time.Second.
+	InitialBackoff time.Duration
+	// MaxBackoff is the upper bound for a single wait; 0 selects the default
+	// of 16 * time.Second.
+	MaxBackoff time.Duration
+}
+
+// VerifyWithExponentialBackoff calls probe up to cfg.Attempts times and returns
+// nil as soon as one call succeeds. After a failed attempt it waits before the
+// next one: cfg.InitialBackoff first, then twice the previous wait each time.
+// Every wait, including the first, is capped at cfg.MaxBackoff.
+//
+// Non-positive fields of cfg fall back to the defaults: 3 attempts, 1s initial
+// backoff, 16s maximum backoff.
+//
+// Return value:
+//   - nil when a probe call returns nil; no further attempts are made.
+//   - ctx.Err() when ctx is done while waiting between two attempts.
+//   - the error of the last probe call when every attempt fails.
+//
+// ctx is handed to probe unchanged and is only consulted by this function
+// during the waits, so the first attempt always runs.
+//
+// Top-10 fix #8 of the 2026-05-02 deployment-target audit re-run.
+func VerifyWithExponentialBackoff(ctx context.Context, cfg RetryConfig, probe func(ctx context.Context) error) error {
+	attempts := cfg.Attempts
+	if attempts <= 0 {
+		attempts = 3
+	}
+	maxBackoff := cfg.MaxBackoff
+	if maxBackoff <= 0 {
+		maxBackoff = 16 * time.Second
+	}
+	backoff := cfg.InitialBackoff
+	if backoff <= 0 {
+		backoff = 1 * time.Second
+	}
+	// Apply the cap to the very first wait too, so an InitialBackoff larger
+	// than MaxBackoff can never sleep longer than the configured maximum.
+	if backoff > maxBackoff {
+		backoff = maxBackoff
+	}
+
+	var lastErr error
+	for i := 0; i < attempts; i++ {
+		if i > 0 {
+			// An explicit timer (rather than time.After) lets us release it
+			// right away when the context ends the wait early.
+			timer := time.NewTimer(backoff)
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				return ctx.Err()
+			case <-timer.C:
+			}
+			// Double for the next wait. Comparing against half the cap is
+			// equivalent to "2*backoff > maxBackoff" but cannot overflow.
+			if backoff > maxBackoff/2 {
+				backoff = maxBackoff
+			} else {
+				backoff *= 2
+			}
+		}
+		lastErr = probe(ctx)
+		if lastErr == nil {
+			return nil
+		}
+	}
+	return lastErr
+}
@@ -0,0 +1,129 @@
+// Copyright (c) 2025 Certctl Contributors <certctl@proton.me>
+//
+// SPDX-License-Identifier: BSL-1.1
+// See COPYING for license details.
+
+package tlsprobe
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+)
+
+// TestVerifyWithExponentialBackoff_GrowthAndCap checks that the wait between
+// consecutive failing attempts doubles from InitialBackoff and then stays at
+// MaxBackoff once the cap is reached.
+func TestVerifyWithExponentialBackoff_GrowthAndCap(t *testing.T) {
+	cfg := RetryConfig{
+		Attempts:       5,
+		InitialBackoff: 10 * time.Millisecond,
+		MaxBackoff:     40 * time.Millisecond,
+	}
+
+	var callTimes []time.Time
+	probe := func(ctx context.Context) error {
+		callTimes = append(callTimes, time.Now())
+		return errors.New("always fail")
+	}
+
+	start := time.Now()
+	err := VerifyWithExponentialBackoff(context.Background(), cfg, probe)
+	totalTime := time.Since(start)
+
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	if len(callTimes) != 5 {
+		t.Fatalf("expected 5 calls, got %d", len(callTimes))
+	}
+
+	// Expected gaps between attempts: 10ms, 20ms, 40ms, 40ms.
+	// A timer never fires early, so each gap must be at least the expected
+	// wait; a subtracted tolerance would make the check vacuous for the short
+	// gaps. Only the upper side gets slack for scheduler noise, and that slack
+	// stays below 40ms so an uncapped 80ms wait is still detected.
+	const slack = 20 * time.Millisecond
+	expectedGaps := []time.Duration{
+		10 * time.Millisecond,
+		20 * time.Millisecond,
+		40 * time.Millisecond,
+		40 * time.Millisecond,
+	}
+
+	for i, expected := range expectedGaps {
+		gap := callTimes[i+1].Sub(callTimes[i])
+		if gap < expected || gap > expected+slack {
+			t.Errorf("gap[%d]: expected between %v and %v, got %v", i, expected, expected+slack, gap)
+		}
+	}
+
+	// Total wall time is at least 10+20+40+40 = 110ms.
+	expectedTotal := 110 * time.Millisecond
+	if totalTime < expectedTotal || totalTime > expectedTotal+100*time.Millisecond {
+		t.Errorf("total time: expected ~%v, got %v", expectedTotal, totalTime)
+	}
+}
+
+// TestVerifyWithExponentialBackoff_StopsOnFirstSuccess checks that the helper
+// returns nil right after the first successful probe call and makes no
+// further attempts.
+func TestVerifyWithExponentialBackoff_StopsOnFirstSuccess(t *testing.T) {
+	cfg := RetryConfig{
+		Attempts:       3,
+		InitialBackoff: 10 * time.Millisecond,
+		MaxBackoff:     40 * time.Millisecond,
+	}
+
+	var callCount int
+	probe := func(ctx context.Context) error {
+		callCount++
+		if callCount == 2 {
+			return nil // success on second attempt
+		}
+		return errors.New("failed")
+	}
+
+	start := time.Now()
+	err := VerifyWithExponentialBackoff(context.Background(), cfg, probe)
+	totalTime := time.Since(start)
+
+	if err != nil {
+		t.Fatalf("expected nil, got error: %v", err)
+	}
+	if callCount != 2 {
+		t.Fatalf("expected 2 calls, got %d", callCount)
+	}
+
+	// Exactly one wait (InitialBackoff) separates attempt 1 from attempt 2.
+	// Timers never fire early, so that wait is a hard lower bound.
+	if totalTime < cfg.InitialBackoff {
+		t.Errorf("total time: expected at least %v, got %v", cfg.InitialBackoff, totalTime)
+	}
+	// The upper bound is deliberately generous so a busy CI machine does not
+	// fail the test, yet it is far below the 1s default backoff that would
+	// show up if cfg were ignored.
+	const upperBound = 250 * time.Millisecond
+	if totalTime > upperBound {
+		t.Errorf("total time: expected <%v, got %v", upperBound, totalTime)
+	}
+}
+
+func TestVerifyWithExponentialBackoff_CtxCancellation(t *testing.T) {
+	cfg := RetryConfig{
+		Attempts:       5,
+		InitialBackoff: 100 * time.Millisecond,
+		MaxBackoff:     1000 * time.Millisecond,
+	}
+
+	var callCount int
+	probe := func(ctx context.Context) error {
+		callCount++
+		return errors.New("always fail")
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Cancel after allowing first attempt + partial wait
+	go func() {
+		time.Sleep(20 * time.Millisecond)
+		cancel()
+	}()
+
+	err := VerifyWithExponentialBackoff(ctx, cfg, probe)
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("expected context.Canceled, got: %v", err)
+	}
+	// Should have completed first attempt, then been cancelled during wait
+	if callCount != 1 {
+		t.Fatalf("expected 1 call before cancellation, got %d", callCount)
+	}
+}