asyncpoll: shared bounded-polling Poller + DigiCert refactor (Phase 1)

Phase 1 of the #5 acquisition-readiness fix from the 2026-05-01 issuer
coverage audit. Pre-fix, four async-CA connectors (DigiCert, Sectigo,
Entrust, GlobalSign) had GetOrderStatus paths that polled the upstream
on every scheduler tick with no exponential backoff, no max-retry cap,
and no deadline. The scheduler's tick rate (typically 30s) was the
only throttle — an unready order got hit every 30s indefinitely, and
a 429 from a rate-limited upstream produced "retry on the next tick"
which re-fanned-out the same call.

This commit ships the shared infrastructure (asyncpoll package) and
refactors DigiCert as the reference. Sectigo / Entrust / GlobalSign
follow the same mechanical pattern; they land in Phase 2.

Phase 1 (this commit):
- internal/connector/issuer/asyncpoll/asyncpoll.go: shared Poller
  with exponential backoff (5s → 15s → 45s → 2m → 5m capped),
  ±20% jitter, configurable MaxWait deadline (default 10m), and
  ctx-aware cancellation.
- Result enum: StillPending / Done / Failed. PollFunc returns
  (Result, err); Poll handles the wait loop, deadline check, and
  ctx propagation.
- ErrMaxWait sentinel for callers that want to distinguish
  "deadline exhausted" from "fn errored".
- asyncpoll_test.go: 11 tests covering happy path, transient error
  keep-polling, Failed terminates immediately, MaxWait timeout,
  MaxWait+lastErr wrap, ctx cancel, multiplicative backoff, jitter
  bounds (statistical), pct=0 deterministic, defaults applied.
- DigiCert refactor: GetOrderStatus now wraps pollOrderOnce in
  asyncpoll.Poll. Status-code triage:
    2xx + parse + status="issued"           → Done with cert
    2xx + parse + status="pending"          → StillPending
    2xx + parse + status="rejected"/"denied" → Done with status="failed"
    2xx + parse fail                        → Failed (permanent)
    4xx (not 429)                           → Failed (404 = order
                                              doesn't exist)
    429 / 5xx / network                     → StillPending
- Config.PollMaxWaitSeconds (env: CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS)
  exposes the per-call deadline knob; default 600 (10m).
- Test helper buildDigicertConnector + GetOrderStatus_Pending test
  set PollMaxWaitSeconds=1 so async-pending tests don't block 10
  minutes on the production default.

Phase 2 (separate follow-up commit, not in this PR):
- Sectigo refactor (collectNotReady sentinel maps to StillPending).
- Entrust refactor (approval-pending → longer per-issuer MaxWait).
- GlobalSign refactor (serial-tracking; same Poller).
- Per-connector cadence integration tests against fake HTTP servers.
- docs/async-polling.md + docs/connectors.md updates.

Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md
Top-10 fix #5 — Phase 1.
This commit is contained in:
shankar0123
2026-05-02 02:18:50 +00:00
parent 74d6b462a4
commit 711265b652
5 changed files with 613 additions and 25 deletions
+112 -21
View File
@@ -38,6 +38,7 @@ import (
"time"
"github.com/shankar0123/certctl/internal/connector/issuer"
"github.com/shankar0123/certctl/internal/connector/issuer/asyncpoll"
)
// Config represents the DigiCert CertCentral issuer connector configuration.
@@ -59,6 +60,25 @@ type Config struct {
// Default: "https://www.digicert.com/services/v2".
// Set via CERTCTL_DIGICERT_BASE_URL environment variable.
BaseURL string `json:"base_url"`
// PollMaxWaitSeconds caps how long GetOrderStatus blocks doing
// internal exponential-backoff polling before returning
// StillPending to the caller. Default 600 (10 minutes); 0 falls
// back to the asyncpoll package default. Bound only on the
// per-call wall-clock; the caller (scheduler) can re-invoke on
// the next tick if its policy allows.
//
// Set via CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS. Audit fix #5.
PollMaxWaitSeconds int `json:"poll_max_wait_seconds,omitempty"`
}
// pollMaxWait returns the configured PollMaxWait as a time.Duration,
// or the asyncpoll package default if unset.
func (c *Config) pollMaxWait() time.Duration {
if c.PollMaxWaitSeconds <= 0 {
return asyncpoll.DefaultMaxWait
}
return time.Duration(c.PollMaxWaitSeconds) * time.Second
}
// Connector implements the issuer.Connector interface for DigiCert CertCentral.
@@ -328,57 +348,124 @@ func (c *Connector) RevokeCertificate(ctx context.Context, request issuer.Revoca
return nil
}
// GetOrderStatus checks the status of a DigiCert certificate order.
// If the order is "issued", downloads the certificate and returns it.
// If still "pending", returns pending status for continued polling.
// GetOrderStatus checks the status of a DigiCert certificate order
// using bounded internal polling (asyncpoll.Poll). One call blocks
// for up to PollMaxWait (default 10m) doing exponential backoff;
// returns Done with the cert, Failed with the rejection reason, or
// StillPending if the deadline expires (caller can re-invoke).
//
// Audit fix #5: previously this method made one HTTP call per
// scheduler tick. Under load that pile-drives the upstream rate
// limit. asyncpoll wraps the one-shot logic with backoff + jitter.
func (c *Connector) GetOrderStatus(ctx context.Context, orderID string) (*issuer.OrderStatus, error) {
c.logger.Debug("checking DigiCert order status", "order_id", orderID)
// Closure-scoped accumulators — Poll passes back only the Result;
// the cert / pending message land here for the wrapper to return.
var done *issuer.OrderStatus
var lastPendingMsg string
cfg := asyncpoll.Config{MaxWait: c.config.pollMaxWait()}
res, err := asyncpoll.Poll(ctx, cfg, func(ctx context.Context) (asyncpoll.Result, error) {
status, result, pollErr := c.pollOrderOnce(ctx, orderID)
if status != nil {
switch result {
case asyncpoll.Done:
done = status
case asyncpoll.StillPending:
if status.Message != nil {
lastPendingMsg = *status.Message
}
}
}
return result, pollErr
})
now := time.Now()
switch res {
case asyncpoll.Done:
return done, nil
case asyncpoll.Failed:
// Permanent failure — surface the upstream's error to the
// caller so handler middleware / scheduler can mark the job
// failed with the actual reason.
return nil, err
default: // StillPending — MaxWait or ctx cancel
msg := lastPendingMsg
if msg == "" {
msg = fmt.Sprintf("order %s still pending after PollMaxWait", orderID)
}
return &issuer.OrderStatus{
OrderID: orderID,
Status: "pending",
Message: &msg,
UpdatedAt: now,
}, nil
}
}
// pollOrderOnce makes one HTTP GET against the DigiCert order-status
// endpoint and translates the response into an asyncpoll.Result plus
// (when applicable) a populated OrderStatus. Used by GetOrderStatus
// as the per-iteration closure for asyncpoll.Poll.
func (c *Connector) pollOrderOnce(ctx context.Context, orderID string) (*issuer.OrderStatus, asyncpoll.Result, error) {
statusURL := fmt.Sprintf("%s/order/certificate/%s", c.config.BaseURL, orderID)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, statusURL, nil)
if err != nil {
return nil, fmt.Errorf("failed to create status request: %w", err)
return nil, asyncpoll.Failed, fmt.Errorf("failed to create status request: %w", err)
}
req.Header.Set("X-DC-DEVKEY", c.config.APIKey)
req.Header.Set("Content-Type", "application/json")
resp, err := c.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("DigiCert status request failed: %w", err)
// Transient network error — keep polling. Caller's MaxWait
// will eventually fire if it persists.
return nil, asyncpoll.StillPending, fmt.Errorf("DigiCert status request failed: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read status response: %w", err)
return nil, asyncpoll.StillPending, fmt.Errorf("failed to read status response: %w", err)
}
// Status-code triage:
// 2xx → fall through to body parse below.
// 429 → StillPending (rate limited; retry with backoff).
// 5xx → StillPending (upstream unhealthy; transient).
// other 4xx → Failed (permanent client error: 400 bad
// request, 401 auth, 403 forbidden, 404 order
// doesn't exist). No amount of polling fixes
// these.
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("DigiCert order status returned %d: %s", resp.StatusCode, string(respBody))
err := fmt.Errorf("DigiCert order status returned %d: %s", resp.StatusCode, string(respBody))
if resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 {
return nil, asyncpoll.StillPending, err
}
return nil, asyncpoll.Failed, err
}
var statusResp orderStatusResponse
if err := json.Unmarshal(respBody, &statusResp); err != nil {
return nil, fmt.Errorf("failed to parse status response: %w", err)
// Parse errors are permanent — the upstream's response shape
// changed or the body is corrupted. Retrying produces the
// same parse error.
return nil, asyncpoll.Failed, fmt.Errorf("failed to parse status response: %w", err)
}
now := time.Now()
switch statusResp.Status {
case "issued":
if statusResp.Certificate.ID == 0 {
return nil, fmt.Errorf("order is issued but certificate_id is missing")
return nil, asyncpoll.Failed, fmt.Errorf("order is issued but certificate_id is missing")
}
certPEM, chainPEM, serial, notBefore, notAfter, err := c.downloadCertificate(ctx, statusResp.Certificate.ID)
if err != nil {
return nil, fmt.Errorf("failed to download certificate: %w", err)
return nil, asyncpoll.Failed, fmt.Errorf("failed to download certificate: %w", err)
}
c.logger.Info("DigiCert order completed",
"order_id", orderID,
"serial", serial)
c.logger.Info("DigiCert order completed", "order_id", orderID, "serial", serial)
return &issuer.OrderStatus{
OrderID: orderID,
Status: "completed",
@@ -388,7 +475,7 @@ func (c *Connector) GetOrderStatus(ctx context.Context, orderID string) (*issuer
NotBefore: &notBefore,
NotAfter: &notAfter,
UpdatedAt: now,
}, nil
}, asyncpoll.Done, nil
case "pending", "processing":
msg := fmt.Sprintf("order %s is %s", orderID, statusResp.Status)
@@ -397,16 +484,20 @@ func (c *Connector) GetOrderStatus(ctx context.Context, orderID string) (*issuer
Status: "pending",
Message: &msg,
UpdatedAt: now,
}, nil
}, asyncpoll.StillPending, nil
case "rejected", "denied":
// Completed-with-negative-answer. NOT a transient failure
// (the order won't un-reject itself), but also not a
// caller-facing Go error — wrap in OrderStatus{Status:"failed"}
// so the scheduler sees a definitive completion.
msg := fmt.Sprintf("order %s was %s", orderID, statusResp.Status)
return &issuer.OrderStatus{
OrderID: orderID,
Status: "failed",
Message: &msg,
UpdatedAt: now,
}, nil
}, asyncpoll.Done, nil
default:
msg := fmt.Sprintf("unknown order status: %s", statusResp.Status)
@@ -415,7 +506,7 @@ func (c *Connector) GetOrderStatus(ctx context.Context, orderID string) (*issuer
Status: "pending",
Message: &msg,
UpdatedAt: now,
}, nil
}, asyncpoll.StillPending, nil
}
}
@@ -19,7 +19,11 @@ import (
func buildDigicertConnector(t *testing.T, baseURL string) *digicert.Connector {
t.Helper()
c := digicert.New(nil, slog.Default())
cfg := digicert.Config{APIKey: "k", OrgID: "1", ProductType: "ssl_basic", BaseURL: baseURL}
// PollMaxWaitSeconds=1 keeps async-pending tests fast — pending
// status returns within ~1s instead of the 10-minute production
// default. Tests that exercise issued/failed/parse-error paths
// don't block on the wait.
cfg := digicert.Config{APIKey: "k", OrgID: "1", ProductType: "ssl_basic", BaseURL: baseURL, PollMaxWaitSeconds: 1}
raw, _ := json.Marshal(cfg)
if err := c.ValidateConfig(context.Background(), raw); err != nil {
t.Fatalf("ValidateConfig: %v", err)
@@ -289,9 +289,10 @@ func TestDigiCertConnector(t *testing.T) {
defer srv.Close()
config := &digicert.Config{
APIKey: "dc-test-key",
OrgID: "12345",
BaseURL: srv.URL,
APIKey: "dc-test-key",
OrgID: "12345",
BaseURL: srv.URL,
PollMaxWaitSeconds: 1, // keep async-pending tests fast
}
connector := digicert.New(config, logger)