I-005: notification retry loop + dead-letter queue

Critical alerts can no longer be silently dropped by a transient notifier failure. Failed notification attempts now ride an exponential backoff retry loop, with a 5-attempt budget before promotion to the dead-letter queue for operator intervention.

Schema (migration 000016, idempotent):
- retry_count INTEGER NOT NULL DEFAULT 0
- next_retry_at TIMESTAMPTZ
- last_error TEXT
- idx_notification_events_retry_sweep partial index (next_retry_at) WHERE status='failed' AND next_retry_at IS NOT NULL

Dead rows clear next_retry_at so the index stops matching them.

Service contract:
- NotificationService.RetryFailedNotifications drives 2^n-minute exponential backoff capped at 1h (notifRetryBackoffCap) with 5-attempt budget (notifRetryMaxAttempts).
- Exhaustion (RetryCount >= notifRetryMaxAttempts-1) promotes to status='dead' via MarkAsDead.
- Non-terminal failures record via RecordFailedAttempt.
- Success path promotes to 'sent' without touching retry_count (audit preserves "delivered on attempt N").
- Missing-notifier branch defensively promotes to 'sent' to avoid wedging a row on a deleted channel.
- RequeueNotification operator escape hatch atomically resets retry_count -> 0, next_retry_at -> NULL, last_error -> NULL, status -> pending via notifRepo.Requeue.

Scheduler:
- New always-on notificationRetryLoop wired into the base loop set at CERTCTL_NOTIFICATION_RETRY_INTERVAL (default 2m).
- sync/atomic.Bool idempotency guard.
- sync.WaitGroup shutdown drain via WaitForCompletion.

StatsService:
- SetNotifRepo setter pattern preserves 9 pre-existing NewStatsService call sites (main.go + stats_test.go + 8 digest tests) without touching the constructor signature.
- DashboardSummary.NotificationsDead populated via notifRepo.CountByStatus(ctx, "dead") — nil-safe when unwired (reports zero on systems without a notification repository).
- CountByStatus error is non-fatal (dashboard summary is best-effort for this field).
- Prometheus certctl_notification_dead_total counter emitted from the same snapshot.

Handler:
- New POST /api/v1/notifications/{id}/requeue endpoint.
- dead status surfaces to MCP + CLI.

Frontend:
- NotificationsPage gains two-tab toolbar ("All" / "Dead letter") with queryKey: ['notifications', activeTab] so switching tabs doesn't serve stale data until the 30s refetch.
- Dead rows surface "Retry {n}/5" + truncated last_error with full-text title tooltip.
- Requeue mutation wrapped as mutationFn: (id: string) => requeueNotification(id) to prevent react-query v5's positional context argument from leaking into the API client — pinned against future refactors by strict-match toHaveBeenCalledWith('notif-dead-001') in NotificationsPage.test.tsx:181.

Closes I-005.
2026-06-13 16:38:51 +00:00 · 2026-04-19 15:17:27 +00:00
parent 707d8de4fb
commit 675b87ba63
33 changed files with 3758 additions and 228 deletions
@@ -34,8 +34,14 @@ type AgentServicer interface {
 }

 // NotificationServicer defines the interface for notification processing used by the scheduler.
+//
+// RetryFailedNotifications was added to close coverage gap I-005: the scheduler
+// calls it from its notification retry loop, on a tick independent of
+// ProcessPendingNotifications (same topology as the I-001 job retry loop). The
+// exponential backoff (1h cap) and 5-attempt dead-letter budget live in the implementation.
 type NotificationServicer interface {
 	ProcessPendingNotifications(ctx context.Context) error
+	RetryFailedNotifications(ctx context.Context) error
 }

 // NetworkScanServicer defines the interface for network scanning used by the scheduler.
@@ -67,44 +73,46 @@ type JobReaperService interface {
 // It runs multiple concurrent loops for renewal checks, job processing, agent health checks,
 // and notification processing.
 type Scheduler struct {
-	renewalService         RenewalServicer
-	jobService             JobServicer
-	agentService           AgentServicer
-	notificationService    NotificationServicer
-	networkScanService     NetworkScanServicer
-	digestService          DigestServicer
-	healthCheckService     HealthCheckServicer
-	cloudDiscoveryService  CloudDiscoveryServicer
-	jobReaper              JobReaperService
-	logger                 *slog.Logger
+	renewalService        RenewalServicer
+	jobService            JobServicer
+	agentService          AgentServicer
+	notificationService   NotificationServicer
+	networkScanService    NetworkScanServicer
+	digestService         DigestServicer
+	healthCheckService    HealthCheckServicer
+	cloudDiscoveryService CloudDiscoveryServicer
+	jobReaper             JobReaperService
+	logger                *slog.Logger

 	// Configurable tick intervals
-	renewalCheckInterval            time.Duration
-	jobProcessorInterval            time.Duration
-	jobRetryInterval                time.Duration
-	agentHealthCheckInterval        time.Duration
-	notificationProcessInterval     time.Duration
-	shortLivedExpiryCheckInterval   time.Duration
-	networkScanInterval             time.Duration
-	digestInterval                  time.Duration
-	healthCheckInterval             time.Duration
-	cloudDiscoveryInterval          time.Duration
-	jobTimeoutInterval              time.Duration
-	awaitingCSRTimeout              time.Duration
-	awaitingApprovalTimeout         time.Duration
+	renewalCheckInterval          time.Duration
+	jobProcessorInterval          time.Duration
+	jobRetryInterval              time.Duration
+	agentHealthCheckInterval      time.Duration
+	notificationProcessInterval   time.Duration
+	notificationRetryInterval     time.Duration
+	shortLivedExpiryCheckInterval time.Duration
+	networkScanInterval           time.Duration
+	digestInterval                time.Duration
+	healthCheckInterval           time.Duration
+	cloudDiscoveryInterval        time.Duration
+	jobTimeoutInterval            time.Duration
+	awaitingCSRTimeout            time.Duration
+	awaitingApprovalTimeout       time.Duration

 	// Idempotency guards: prevent duplicate execution of slow jobs
-	renewalCheckRunning           atomic.Bool
-	jobProcessorRunning           atomic.Bool
-	jobRetryRunning               atomic.Bool
-	agentHealthCheckRunning       atomic.Bool
-	notificationProcessRunning    atomic.Bool
-	shortLivedExpiryCheckRunning  atomic.Bool
-	networkScanRunning            atomic.Bool
-	digestRunning                 atomic.Bool
-	healthCheckRunning            atomic.Bool
-	cloudDiscoveryRunning         atomic.Bool
-	jobTimeoutRunning             atomic.Bool
+	renewalCheckRunning          atomic.Bool
+	jobProcessorRunning          atomic.Bool
+	jobRetryRunning              atomic.Bool
+	agentHealthCheckRunning      atomic.Bool
+	notificationProcessRunning   atomic.Bool
+	notificationRetryRunning     atomic.Bool
+	shortLivedExpiryCheckRunning atomic.Bool
+	networkScanRunning           atomic.Bool
+	digestRunning                atomic.Bool
+	healthCheckRunning           atomic.Bool
+	cloudDiscoveryRunning        atomic.Bool
+	jobTimeoutRunning            atomic.Bool

 	// Graceful shutdown: wait for in-flight work to complete
 	wg sync.WaitGroup
@@ -133,6 +141,7 @@ func NewScheduler(
 		jobRetryInterval:              5 * time.Minute,
 		agentHealthCheckInterval:      2 * time.Minute,
 		notificationProcessInterval:   1 * time.Minute,
+		notificationRetryInterval:     2 * time.Minute,
 		shortLivedExpiryCheckInterval: 30 * time.Second,
 		networkScanInterval:           6 * time.Hour,
 		digestInterval:                24 * time.Hour,
@@ -180,6 +189,13 @@ func (s *Scheduler) SetNotificationProcessInterval(d time.Duration) {
 	s.notificationProcessInterval = d
 }

+// SetNotificationRetryInterval configures the interval for the failed-notification
+// retry sweep (coverage gap I-005). Defaults to 2 minutes; honors CERTCTL_NOTIFICATION_RETRY_INTERVAL
+// when wired from config. d must be > 0: it is handed to time.NewTicker, which panics otherwise.
+func (s *Scheduler) SetNotificationRetryInterval(d time.Duration) {
+	s.notificationRetryInterval = d
+}
+
 // SetNetworkScanInterval configures the interval for network scanning.
 func (s *Scheduler) SetNetworkScanInterval(d time.Duration) {
 	s.networkScanInterval = d
@@ -212,7 +228,6 @@ func (s *Scheduler) SetCloudDiscoveryInterval(d time.Duration) {
 	s.cloudDiscoveryInterval = d
 }

-
 // SetJobReaperService sets the job reaper service (I-003).
 func (s *Scheduler) SetJobReaperService(jr JobReaperService) {
 	s.jobReaper = jr
@@ -232,6 +247,7 @@ func (s *Scheduler) SetAwaitingCSRTimeout(d time.Duration) {
 func (s *Scheduler) SetAwaitingApprovalTimeout(d time.Duration) {
 	s.awaitingApprovalTimeout = d
 }
+
 // Start initiates all background scheduler loops. It returns a channel that signals
 // when the scheduler has started all loops. The scheduler runs until the context is cancelled.
 func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
@@ -242,10 +258,11 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {

 		// Track all loop goroutines in the WaitGroup so WaitForCompletion
 		// blocks until they've fully exited (prevents test races).
-		// Base count is 7: renewal, job processor, job retry (I-001),
-		// job timeout (I-003), agent health, notification, short-lived expiry. Optional loops
-		// (network scan, digest, health check, cloud discovery) add to this.
-		loopCount := 7
+		// Base count is 8: renewal, job processor, job retry (I-001),
+		// job timeout (I-003), agent health, notification, notification retry
+		// (I-005), short-lived expiry. Optional loops (network scan, digest,
+		// health check, cloud discovery) add to this.
+		loopCount := 8
 		if s.networkScanService != nil {
 			loopCount++
 		}
@@ -266,6 +283,7 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
 		go func() { defer s.wg.Done(); s.jobTimeoutLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.agentHealthCheckLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.notificationProcessLoop(ctx) }()
+		go func() { defer s.wg.Done(); s.notificationRetryLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.shortLivedExpiryCheckLoop(ctx) }()
 		if s.networkScanService != nil {
 			go func() { defer s.wg.Done(); s.networkScanLoop(ctx) }()
@@ -597,6 +615,64 @@ func (s *Scheduler) runNotificationProcess(ctx context.Context) {
 	}
 }

+// notificationRetryLoop drives the failed-notification retry sweep (coverage
+// gap I-005). It launches one sweep immediately on start, then one per
+// notificationRetryInterval tick, and returns when ctx is cancelled. Before this
+// loop was wired, NotificationService.RetryFailedNotifications had no runtime
+// caller. The notificationRetryRunning atomic.Bool prevents overlapping sweeps:
+// a tick that finds the previous sweep still running is skipped with a warning.
+// Modeled on the I-001 jobRetryLoop.
+func (s *Scheduler) notificationRetryLoop(ctx context.Context) {
+	ticker := time.NewTicker(s.notificationRetryInterval) // NOTE: NewTicker panics if the interval is <= 0
+	defer ticker.Stop()
+
+	// First sweep runs right away; Store(true) rather than CompareAndSwap because this loop has not launched any other sweep yet.
+	s.notificationRetryRunning.Store(true)
+	s.wg.Add(1) // count the sweep goroutine before it starts
+	go func() {
+		defer s.wg.Done()
+		defer s.notificationRetryRunning.Store(false) // release the guard when the sweep ends
+		s.runNotificationRetry(ctx)
+	}()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return // an in-flight sweep keeps running; it is still tracked by s.wg
+		case <-ticker.C:
+			if !s.notificationRetryRunning.CompareAndSwap(false, true) { // claim the guard atomically, or skip this tick
+				s.logger.Warn("notification retry still running, skipping tick")
+				continue
+			}
+			s.wg.Add(1) // count the sweep goroutine before it starts
+			go func() {
+				defer s.wg.Done()
+				defer s.notificationRetryRunning.Store(false) // release the guard when the sweep ends
+				s.runNotificationRetry(ctx)
+			}()
+		}
+	}
+}
+
+// runNotificationRetry executes a single failed-notification retry cycle under
+// a 2-minute timeout derived from ctx (reportedly the same budget as
+// runJobRetry), sized as headroom for very large failure backlogs. Errors are
+// logged, not propagated. Per the service contract (not visible here; verify),
+// RetryFailedNotifications issues one SELECT, then one UPDATE per eligible row,
+// swallows per-row send errors like ProcessPendingNotifications, and returns
+// only the error from the initial ListRetryEligible call.
+func (s *Scheduler) runNotificationRetry(ctx context.Context) {
+	opCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+	defer cancel()
+	if err := s.notificationService.RetryFailedNotifications(opCtx); err != nil {
+		s.logger.Error("notification retry failed",
+			"error", err,
+			"interval", s.notificationRetryInterval.String())
+	} else {
+		s.logger.Debug("notification retry completed")
+	}
+}
+
 // shortLivedExpiryCheckLoop runs every shortLivedExpiryCheckInterval and marks expired
 // short-lived certificates. For certs with TTL < 1 hour, expiry IS revocation —
 // no CRL/OCSP needed.