mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 14:11:31 +00:00
I-005: notification retry loop + dead-letter queue
Critical alerts can no longer be silently dropped by a transient
notifier failure. Failed notification attempts now ride an exponential
backoff retry loop, with a 5-attempt budget before promotion to the
dead-letter queue for operator intervention.
Schema (migration 000016, idempotent):
- retry_count INTEGER NOT NULL DEFAULT 0
- next_retry_at TIMESTAMPTZ
- last_error TEXT
- idx_notification_events_retry_sweep partial index
(next_retry_at) WHERE status='failed' AND next_retry_at IS NOT NULL
Dead rows clear next_retry_at so the index stops matching them.
Service contract:
- NotificationService.RetryFailedNotifications drives 2^n-minute
exponential backoff capped at 1h (notifRetryBackoffCap) with
5-attempt budget (notifRetryMaxAttempts).
- Exhaustion (RetryCount >= notifRetryMaxAttempts-1) promotes to
status='dead' via MarkAsDead.
- Non-terminal failures record via RecordFailedAttempt.
- Success path promotes to 'sent' without touching retry_count
(audit preserves "delivered on attempt N").
- Missing-notifier branch defensively promotes to 'sent' to avoid
wedging a row on a deleted channel.
- RequeueNotification operator escape hatch atomically resets
retry_count -> 0, next_retry_at -> NULL, last_error -> NULL,
status -> pending via notifRepo.Requeue.
Scheduler:
- New always-on notificationRetryLoop wired into the base loop set at
CERTCTL_NOTIFICATION_RETRY_INTERVAL (default 2m).
- sync/atomic.Bool idempotency guard.
- sync.WaitGroup shutdown drain via WaitForCompletion.
StatsService:
- SetNotifRepo setter pattern preserves 9 pre-existing
NewStatsService call sites (main.go + stats_test.go + 8 digest
tests) without touching the constructor signature.
- DashboardSummary.NotificationsDead populated via
notifRepo.CountByStatus(ctx, "dead") — nil-safe when unwired
(reports zero on systems without a notification repository).
- CountByStatus error is non-fatal (dashboard summary is
best-effort for this field).
- Prometheus certctl_notification_dead_total counter emitted from
the same snapshot.
Handler:
- New POST /api/v1/notifications/{id}/requeue endpoint.
- dead status surfaces to MCP + CLI.
Frontend:
- NotificationsPage gains two-tab toolbar ("All" / "Dead letter")
with queryKey: ['notifications', activeTab] so switching tabs
doesn't serve stale data until the 30s refetch.
- Dead rows surface "Retry {n}/5" + truncated last_error with
full-text title tooltip.
- Requeue mutation wrapped as
mutationFn: (id: string) => requeueNotification(id)
to prevent react-query v5's positional context argument from
leaking into the API client — pinned against future refactors
by strict-match toHaveBeenCalledWith('notif-dead-001') in
NotificationsPage.test.tsx:181.
Closes I-005.
This commit is contained in:
+52
-28
@@ -12,29 +12,29 @@ import (
|
||||
// Config represents the complete application configuration.
|
||||
// All configuration values are read from environment variables with CERTCTL_ prefix.
|
||||
type Config struct {
|
||||
Server ServerConfig
|
||||
Database DatabaseConfig
|
||||
Scheduler SchedulerConfig
|
||||
Log LogConfig
|
||||
Auth AuthConfig
|
||||
RateLimit RateLimitConfig
|
||||
CORS CORSConfig
|
||||
Keygen KeygenConfig
|
||||
CA CAConfig
|
||||
Notifiers NotifierConfig
|
||||
NetworkScan NetworkScanConfig
|
||||
EST ESTConfig
|
||||
SCEP SCEPConfig
|
||||
Verification VerificationConfig
|
||||
ACME ACMEConfig
|
||||
Vault VaultConfig
|
||||
DigiCert DigiCertConfig
|
||||
Sectigo SectigoConfig
|
||||
GoogleCAS GoogleCASConfig
|
||||
AWSACMPCA AWSACMPCAConfig
|
||||
Entrust EntrustConfig
|
||||
GlobalSign GlobalSignConfig
|
||||
EJBCA EJBCAConfig
|
||||
Server ServerConfig
|
||||
Database DatabaseConfig
|
||||
Scheduler SchedulerConfig
|
||||
Log LogConfig
|
||||
Auth AuthConfig
|
||||
RateLimit RateLimitConfig
|
||||
CORS CORSConfig
|
||||
Keygen KeygenConfig
|
||||
CA CAConfig
|
||||
Notifiers NotifierConfig
|
||||
NetworkScan NetworkScanConfig
|
||||
EST ESTConfig
|
||||
SCEP SCEPConfig
|
||||
Verification VerificationConfig
|
||||
ACME ACMEConfig
|
||||
Vault VaultConfig
|
||||
DigiCert DigiCertConfig
|
||||
Sectigo SectigoConfig
|
||||
GoogleCAS GoogleCASConfig
|
||||
AWSACMPCA AWSACMPCAConfig
|
||||
Entrust EntrustConfig
|
||||
GlobalSign GlobalSignConfig
|
||||
EJBCA EJBCAConfig
|
||||
Digest DigestConfig
|
||||
HealthCheck HealthCheckConfig
|
||||
Encryption EncryptionConfig
|
||||
@@ -708,6 +708,17 @@ type SchedulerConfig struct {
|
||||
// Setting: CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL environment variable.
|
||||
NotificationProcessInterval time.Duration
|
||||
|
||||
// NotificationRetryInterval is how often the scheduler retries failed
|
||||
// notifications whose retry_count is below the service-layer 5-attempt
|
||||
// DLQ budget. Default: 2 minutes. Minimum: 1 second. Mirrors the I-001
|
||||
// RetryInterval knob: transitions eligible Failed notifications whose
|
||||
// next_retry_at has arrived back to Pending so the notification processor
|
||||
// picks them up on its next tick (closes coverage gap I-005 — HEAD had
|
||||
// no retry path for transient SMTP/webhook failures and notifications
|
||||
// stayed Failed forever).
|
||||
// Setting: CERTCTL_NOTIFICATION_RETRY_INTERVAL environment variable.
|
||||
NotificationRetryInterval time.Duration
|
||||
|
||||
// RetryInterval is how often the scheduler retries failed jobs whose Attempts
|
||||
// counter is below MaxAttempts. Default: 5 minutes. Minimum: 1 second.
|
||||
// Transitions eligible Failed jobs back to Pending so the job processor can
|
||||
@@ -838,10 +849,16 @@ func Load() (*Config, error) {
|
||||
JobProcessorInterval: getEnvDuration("CERTCTL_SCHEDULER_JOB_PROCESSOR_INTERVAL", 30*time.Second),
|
||||
AgentHealthCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL", 2*time.Minute),
|
||||
NotificationProcessInterval: getEnvDuration("CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL", 1*time.Minute),
|
||||
RetryInterval: getEnvDuration("CERTCTL_SCHEDULER_RETRY_INTERVAL", 5*time.Minute),
|
||||
JobTimeoutInterval: getEnvDuration("CERTCTL_JOB_TIMEOUT_INTERVAL", 10*time.Minute),
|
||||
AwaitingCSRTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_CSR_TIMEOUT", 24*time.Hour),
|
||||
AwaitingApprovalTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT", 168*time.Hour),
|
||||
// I-005: retry sweep for failed notifications. Mirrors RetryInterval
|
||||
// (I-001 job retry) but scoped to the notification DLQ machinery.
|
||||
// Default 2 minutes — fast enough to absorb transient SMTP/webhook
|
||||
// blips, slow enough to respect the service-layer 5-attempt budget
|
||||
// without hammering external notifier endpoints.
|
||||
NotificationRetryInterval: getEnvDuration("CERTCTL_NOTIFICATION_RETRY_INTERVAL", 2*time.Minute),
|
||||
RetryInterval: getEnvDuration("CERTCTL_SCHEDULER_RETRY_INTERVAL", 5*time.Minute),
|
||||
JobTimeoutInterval: getEnvDuration("CERTCTL_JOB_TIMEOUT_INTERVAL", 10*time.Minute),
|
||||
AwaitingCSRTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_CSR_TIMEOUT", 24*time.Hour),
|
||||
AwaitingApprovalTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT", 168*time.Hour),
|
||||
},
|
||||
Log: LogConfig{
|
||||
Level: getEnv("CERTCTL_LOG_LEVEL", "info"),
|
||||
@@ -871,7 +888,7 @@ func Load() (*Config, error) {
|
||||
Notifiers: NotifierConfig{
|
||||
SlackWebhookURL: getEnv("CERTCTL_SLACK_WEBHOOK_URL", ""),
|
||||
SlackChannel: getEnv("CERTCTL_SLACK_CHANNEL", ""),
|
||||
SlackUsername: getEnv("CERTCTL_SLACK_USERNAME", "certctl"),
|
||||
SlackUsername: getEnv("CERTCTL_SLACK_USERNAME", "certctl"),
|
||||
TeamsWebhookURL: getEnv("CERTCTL_TEAMS_WEBHOOK_URL", ""),
|
||||
PagerDutyRoutingKey: getEnv("CERTCTL_PAGERDUTY_ROUTING_KEY", ""),
|
||||
PagerDutySeverity: getEnv("CERTCTL_PAGERDUTY_SEVERITY", "warning"),
|
||||
@@ -1109,6 +1126,13 @@ func (c *Config) Validate() error {
|
||||
return fmt.Errorf("notification process interval must be at least 1 second")
|
||||
}
|
||||
|
||||
// I-005: guard against a misconfigured retry sweep that would either
|
||||
// spin-wait or never fire. Matches the NotificationProcessInterval
|
||||
// minimum (1s) so operators can tune both knobs from the same floor.
|
||||
if c.Scheduler.NotificationRetryInterval < 1*time.Second {
|
||||
return fmt.Errorf("notification retry interval must be at least 1 second")
|
||||
}
|
||||
|
||||
if c.Scheduler.RetryInterval < 1*time.Second {
|
||||
return fmt.Errorf("retry interval must be at least 1 second")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user