mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:11:29 +00:00
711265b652
Phase 1 of the #5 acquisition-readiness fix from the 2026-05-01 issuer coverage audit.

Pre-fix, four async-CA connectors (DigiCert, Sectigo, Entrust, GlobalSign) had GetOrderStatus paths that polled the upstream on every scheduler tick with no exponential backoff, no max-retry cap, and no deadline. The scheduler's tick rate (typically 30s) was the only throttle — an unready order got hit every 30s indefinitely, and a 429 from a rate-limited upstream produced "retry on the next tick" which re-fanned-out the same call.

This commit ships the shared infrastructure (asyncpoll package) and refactors DigiCert as the reference. Sectigo / Entrust / GlobalSign follow the same mechanical pattern; they land in Phase 2.

Phase 1 (this commit):
- internal/connector/issuer/asyncpoll/asyncpoll.go: shared Poller with exponential backoff (5s → 15s → 45s → 2m15s → 5m capped), ±20% jitter, configurable MaxWait deadline (default 10m), and ctx-aware cancellation.
- Result enum: StillPending / Done / Failed. PollFunc returns (Result, err); Poll handles the wait loop, deadline check, and ctx propagation.
- ErrMaxWait sentinel for callers that want to distinguish "deadline exhausted" from "fn errored".
- asyncpoll_test.go: 11 tests covering happy path, transient error keep-polling, Failed terminates immediately, MaxWait timeout, MaxWait+lastErr wrap, ctx cancel, multiplicative backoff, jitter bounds (statistical), pct=0 deterministic, defaults applied.
- DigiCert refactor: GetOrderStatus now wraps pollOrderOnce in asyncpoll.Poll. Status-code triage:
    2xx + parse + status="issued" → Done with cert
    2xx + parse + status="pending" → StillPending
    2xx + parse + status="rejected"/"denied" → Done with status="failed"
    2xx + parse fail → Failed (permanent)
    4xx (not 429) → Failed (404 = order doesn't exist)
    429 / 5xx / network → StillPending
- Config.PollMaxWaitSeconds (env: CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS) exposes the per-call deadline knob; default 600 (10m).
- Test helper buildDigicertConnector + GetOrderStatus_Pending test set PollMaxWaitSeconds=1 so async-pending tests don't block 10 minutes on the production default.

Phase 2 (separate follow-up commit, not in this PR):
- Sectigo refactor (collectNotReady sentinel maps to StillPending).
- Entrust refactor (approval-pending → longer per-issuer MaxWait).
- GlobalSign refactor (serial-tracking; same Poller).
- Per-connector cadence integration tests against fake HTTP servers.
- docs/async-polling.md + docs/connectors.md updates.

Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md Top-10 fix #5 — Phase 1.
217 lines
7.0 KiB
Go
217 lines
7.0 KiB
Go
// Copyright (c) certctl
|
||
// SPDX-License-Identifier: BSL-1.1
|
||
|
||
// Package asyncpoll provides bounded polling for async-CA issuer
// connectors (DigiCert, Sectigo, Entrust, GlobalSign).
//
// Closes the #5 acquisition-readiness blocker from the 2026-05-01
// issuer coverage audit. Pre-fix, each async-CA connector had its own
// GetOrderStatus path that polled the upstream CA on every scheduler
// tick with no exponential backoff, no max-retry cap, and no deadline.
// The scheduler's tick rate (typically 30s) was the only throttle —
// an unready order got hit every 30s indefinitely, and a 429 from a
// rate-limited upstream produced "retry on the next tick" which
// re-fanned-out the same call.
//
// This package consolidates the four implementations behind a single
// Poller with:
//
//   - Exponential backoff: 5s → 15s → 45s → 2m15s → 5m capped (default).
//   - Jitter at every wait (DefaultJitterPct is ±20%; enabled by setting
//     Config.JitterPct) so multiple certctl instances don't synchronize
//     on the upstream CA's rate-limit window.
//   - MaxWait deadline (default 10m) — a hard cap on how long a
//     single Poll call blocks before returning StillPending. The
//     scheduler can re-enqueue the job for a future tick if the
//     operator's policy allows further attempts.
//   - ctx-aware cancellation — propagates the caller's deadline /
//     cancel through every wait.
//
// Issuer-specific HTTP request shapes live in the PollFunc closure
// passed to Poll; the backoff math is shared.
|
||
package asyncpoll
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"math/rand/v2"
|
||
"time"
|
||
)
|
||
|
||
// Result is the outcome of one poll attempt.
type Result int

const (
	// StillPending means the upstream is still working on the order, so
	// polling should continue. The Poller waits, then invokes the
	// PollFunc again (subject to MaxWait). It is the zero value of
	// Result.
	StillPending Result = iota

	// Done means the order succeeded. The Poller returns immediately
	// with the (Done, nil) tuple to the caller.
	Done

	// Failed means permanent failure (rejected, denied, malformed
	// response). The Poller returns immediately with (Failed, err) to
	// the caller; no further polling.
	Failed
)
|
||
|
||
// PollFunc runs ONE poll attempt and reports the outcome.
|
||
//
|
||
// Returning (StillPending, nil) signals "transient; keep polling"
|
||
// and the Poller waits with the backoff schedule.
|
||
//
|
||
// Returning (StillPending, err) ALSO keeps polling — useful when
|
||
// the upstream returned a transient HTTP error (5xx, network blip)
|
||
// that the caller wants logged but not treated as fatal.
|
||
//
|
||
// Returning (Done, nil) signals success.
|
||
//
|
||
// Returning (Failed, err) signals permanent failure; err must be
|
||
// non-nil so the caller can include it in the upstream-facing
|
||
// status message.
|
||
type PollFunc func(ctx context.Context) (Result, error)
|
||
|
||
// Config holds the backoff knobs for Poll. Every field is optional.
// Zero or negative durations are replaced by the package defaults;
// JitterPct is the exception (see its field comment).
type Config struct {
	// MaxWait is the hard cap on total wall-clock time inside Poll.
	// After it expires, Poll returns (StillPending, ErrMaxWait). Zero
	// or negative selects DefaultMaxWait (10 minutes); issuers tune it
	// through their own connector configuration.
	MaxWait time.Duration

	// InitialWait is the first backoff (after the first poll attempt).
	// Zero or negative selects DefaultInitialWait (5 seconds).
	InitialWait time.Duration

	// MaxBackoff is the cap on the per-iteration wait. Zero or negative
	// selects DefaultMaxBackoff (5 minutes). Backoff schedule:
	// InitialWait → 3× → 3× → ... capped at MaxBackoff (so 5s → 15s →
	// 45s → 2m15s → 5m → 5m → ... by default).
	MaxBackoff time.Duration

	// JitterPct is the fractional jitter applied to every wait, ±value
	// (0.2 means ±20%). Zero means NO jitter, which gives deterministic
	// timing in tests; negative values are treated as zero. Poll does
	// not substitute DefaultJitterPct for a zero value, so callers that
	// want the recommended jitter must set JitterPct: DefaultJitterPct
	// explicitly.
	JitterPct float64
}
|
||
|
||
// ErrMaxWait is returned (alongside StillPending) when the total
// wall-clock time inside Poll exceeded Config.MaxWait. Callers can
// errors.Is against this sentinel to distinguish "deadline exhausted"
// from "fn errored".
var ErrMaxWait = errors.New("asyncpoll: MaxWait deadline exceeded")
|
||
|
||
// Defaults. Exported so per-issuer tests can reference the same
// schedule without duplicating constants.
//
// DefaultMaxWait, DefaultInitialWait and DefaultMaxBackoff are what
// Poll substitutes for non-positive Config durations. DefaultJitterPct
// is the recommended jitter fraction; Poll does not apply it on its
// own, so pass it in Config.JitterPct to enable jitter.
const (
	DefaultMaxWait     = 10 * time.Minute
	DefaultInitialWait = 5 * time.Second
	DefaultMaxBackoff  = 5 * time.Minute
	DefaultJitterPct   = 0.2
)
|
||
|
||
// Poll runs fn with exponential backoff + jitter until Done, Failed,
|
||
// MaxWait, or ctx cancellation.
|
||
//
|
||
// On Done — returns (Done, nil). The cert is ready; caller proceeds.
|
||
//
|
||
// On Failed — returns (Failed, fnErr). Permanent; no retry.
|
||
//
|
||
// On MaxWait timeout — returns (StillPending, ErrMaxWait). The
|
||
// upstream isn't done yet but the deadline exhausted. Scheduler
|
||
// can re-enqueue.
|
||
//
|
||
// On ctx cancel — returns (StillPending, ctx.Err()). Caller's
|
||
// deadline / shutdown signal won.
|
||
//
|
||
// On fn returning (StillPending, transientErr) — the err is logged
|
||
// by the closure (not by Poll), and Poll continues with the
|
||
// backoff schedule. The transient err is preserved as the last
|
||
// error in case MaxWait or ctx-cancel later fires.
|
||
func Poll(ctx context.Context, cfg Config, fn PollFunc) (Result, error) {
|
||
if cfg.MaxWait <= 0 {
|
||
cfg.MaxWait = DefaultMaxWait
|
||
}
|
||
if cfg.InitialWait <= 0 {
|
||
cfg.InitialWait = DefaultInitialWait
|
||
}
|
||
if cfg.MaxBackoff <= 0 {
|
||
cfg.MaxBackoff = DefaultMaxBackoff
|
||
}
|
||
if cfg.JitterPct < 0 {
|
||
cfg.JitterPct = 0
|
||
}
|
||
|
||
deadline := time.Now().Add(cfg.MaxWait)
|
||
wait := cfg.InitialWait
|
||
var lastErr error
|
||
|
||
for {
|
||
result, err := fn(ctx)
|
||
switch result {
|
||
case Done:
|
||
return Done, nil
|
||
case Failed:
|
||
return Failed, err
|
||
case StillPending:
|
||
lastErr = err // may be nil (clean keep-polling) or a transient err
|
||
default:
|
||
return Failed, fmt.Errorf("asyncpoll: PollFunc returned unknown Result %d", result)
|
||
}
|
||
|
||
// Compute the next wait with jitter. wait is the cumulative
|
||
// backoff base; jittered is what actually sleeps.
|
||
jittered := jitterDuration(wait, cfg.JitterPct)
|
||
|
||
// If the next wait would push us past the deadline, return
|
||
// StillPending now rather than sleeping uselessly.
|
||
now := time.Now()
|
||
remaining := deadline.Sub(now)
|
||
if remaining <= 0 {
|
||
if lastErr != nil {
|
||
return StillPending, fmt.Errorf("%w (last err: %v)", ErrMaxWait, lastErr)
|
||
}
|
||
return StillPending, ErrMaxWait
|
||
}
|
||
if jittered > remaining {
|
||
jittered = remaining
|
||
}
|
||
|
||
// Sleep, but respect ctx cancellation.
|
||
select {
|
||
case <-ctx.Done():
|
||
if lastErr != nil {
|
||
return StillPending, fmt.Errorf("%w (last err: %v)", ctx.Err(), lastErr)
|
||
}
|
||
return StillPending, ctx.Err()
|
||
case <-time.After(jittered):
|
||
}
|
||
|
||
// Multiplicative backoff (3×) capped at MaxBackoff.
|
||
wait *= 3
|
||
if wait > cfg.MaxBackoff {
|
||
wait = cfg.MaxBackoff
|
||
}
|
||
}
|
||
}
|
||
|
||
// jitterDuration applies ±pct jitter to base. Returned duration is
|
||
// always positive (a base of 0 returns 0 regardless of pct).
|
||
//
|
||
// Visible for testing — the test asserts the bounded envelope rather
|
||
// than the exact value.
|
||
func jitterDuration(base time.Duration, pct float64) time.Duration {
|
||
if base <= 0 || pct <= 0 {
|
||
return base
|
||
}
|
||
// rand/v2's Float64 returns [0, 1); we want [-pct, +pct].
|
||
delta := (rand.Float64()*2 - 1) * pct
|
||
jittered := time.Duration(float64(base) * (1 + delta))
|
||
if jittered < 0 {
|
||
jittered = 0
|
||
}
|
||
return jittered
|
||
}
|