I-005: notification retry loop + dead-letter queue

Critical alerts can no longer be silently dropped by a transient notifier failure. Failed notification attempts now ride an exponential backoff retry loop, with a 5-attempt budget before promotion to the dead-letter queue for operator intervention. Schema (migration 000016, idempotent): - retry_count INTEGER NOT NULL DEFAULT 0 - next_retry_at TIMESTAMPTZ - last_error TEXT - idx_notification_events_retry_sweep partial index (next_retry_at) WHERE status='failed' AND next_retry_at IS NOT NULL Dead rows clear next_retry_at so the index stops matching them. Service contract: - NotificationService.RetryFailedNotifications drives 2^n-minute exponential backoff capped at 1h (notifRetryBackoffCap) with 5-attempt budget (notifRetryMaxAttempts). - Exhaustion (RetryCount >= notifRetryMaxAttempts-1) promotes to status='dead' via MarkAsDead. - Non-terminal failures record via RecordFailedAttempt. - Success path promotes to 'sent' without touching retry_count (audit preserves "delivered on attempt N"). - Missing-notifier branch defensively promotes to 'sent' to avoid wedging a row on a deleted channel. - RequeueNotification operator escape hatch atomically resets retry_count -> 0, next_retry_at -> NULL, last_error -> NULL, status -> pending via notifRepo.Requeue. Scheduler: - New always-on notificationRetryLoop wired into the base loop set at CERTCTL_NOTIFICATION_RETRY_INTERVAL (default 2m). - sync/atomic.Bool idempotency guard. - sync.WaitGroup shutdown drain via WaitForCompletion. StatsService: - SetNotifRepo setter pattern preserves 9 pre-existing NewStatsService call sites (main.go + stats_test.go + 8 digest tests) without touching the constructor signature. - DashboardSummary.NotificationsDead populated via notifRepo.CountByStatus(ctx, "dead") — nil-safe when unwired (reports zero on systems without a notification repository). - CountByStatus error is non-fatal (dashboard summary is best-effort for this field). 
- Prometheus certctl_notification_dead_total counter emitted from the same snapshot. Handler: - New POST /api/v1/notifications/{id}/requeue endpoint. - dead status surfaces to MCP + CLI. Frontend: - NotificationsPage gains two-tab toolbar ("All" / "Dead letter") with queryKey: ['notifications', activeTab] so switching tabs doesn't serve stale data until the 30s refetch. - Dead rows surface "Retry {n}/5" + truncated last_error with full-text title tooltip. - Requeue mutation wrapped as mutationFn: (id: string) => requeueNotification(id) to prevent react-query v5's positional context argument from leaking into the API client — pinned against future refactors by strict-match toHaveBeenCalledWith('notif-dead-001') in NotificationsPage.test.tsx:181. Closes I-005.
2026-06-07 13:51:36 +00:00 · 2026-04-19 15:17:27 +00:00
parent 707d8de4fb
commit 675b87ba63
33 changed files with 3758 additions and 228 deletions
@@ -34,8 +34,14 @@ type AgentServicer interface {
 }

 // NotificationServicer defines the interface for notification processing used by the scheduler.
+//
+// RetryFailedNotifications was added to close coverage gap I-005: the retry
+// sweep transitions eligible Failed notifications to Pending on an independent
+// tick, using exponential backoff with a 1h cap and a 5-attempt DLQ budget.
+// Mirrors the I-001 job retry loop topology.
 type NotificationServicer interface {
 	ProcessPendingNotifications(ctx context.Context) error
+	RetryFailedNotifications(ctx context.Context) error
 }

 // NetworkScanServicer defines the interface for network scanning used by the scheduler.
@@ -67,44 +73,46 @@ type JobReaperService interface {
 // It runs multiple concurrent loops for renewal checks, job processing, agent health checks,
 // and notification processing.
 type Scheduler struct {
-	renewalService         RenewalServicer
-	jobService             JobServicer
-	agentService           AgentServicer
-	notificationService    NotificationServicer
-	networkScanService     NetworkScanServicer
-	digestService          DigestServicer
-	healthCheckService     HealthCheckServicer
-	cloudDiscoveryService  CloudDiscoveryServicer
-	jobReaper              JobReaperService
-	logger                 *slog.Logger
+	renewalService        RenewalServicer
+	jobService            JobServicer
+	agentService          AgentServicer
+	notificationService   NotificationServicer
+	networkScanService    NetworkScanServicer
+	digestService         DigestServicer
+	healthCheckService    HealthCheckServicer
+	cloudDiscoveryService CloudDiscoveryServicer
+	jobReaper             JobReaperService
+	logger                *slog.Logger

 	// Configurable tick intervals
-	renewalCheckInterval            time.Duration
-	jobProcessorInterval            time.Duration
-	jobRetryInterval                time.Duration
-	agentHealthCheckInterval        time.Duration
-	notificationProcessInterval     time.Duration
-	shortLivedExpiryCheckInterval   time.Duration
-	networkScanInterval             time.Duration
-	digestInterval                  time.Duration
-	healthCheckInterval             time.Duration
-	cloudDiscoveryInterval          time.Duration
-	jobTimeoutInterval              time.Duration
-	awaitingCSRTimeout              time.Duration
-	awaitingApprovalTimeout         time.Duration
+	renewalCheckInterval          time.Duration
+	jobProcessorInterval          time.Duration
+	jobRetryInterval              time.Duration
+	agentHealthCheckInterval      time.Duration
+	notificationProcessInterval   time.Duration
+	notificationRetryInterval     time.Duration
+	shortLivedExpiryCheckInterval time.Duration
+	networkScanInterval           time.Duration
+	digestInterval                time.Duration
+	healthCheckInterval           time.Duration
+	cloudDiscoveryInterval        time.Duration
+	jobTimeoutInterval            time.Duration
+	awaitingCSRTimeout            time.Duration
+	awaitingApprovalTimeout       time.Duration

 	// Idempotency guards: prevent duplicate execution of slow jobs
-	renewalCheckRunning           atomic.Bool
-	jobProcessorRunning           atomic.Bool
-	jobRetryRunning               atomic.Bool
-	agentHealthCheckRunning       atomic.Bool
-	notificationProcessRunning    atomic.Bool
-	shortLivedExpiryCheckRunning  atomic.Bool
-	networkScanRunning            atomic.Bool
-	digestRunning                 atomic.Bool
-	healthCheckRunning            atomic.Bool
-	cloudDiscoveryRunning         atomic.Bool
-	jobTimeoutRunning             atomic.Bool
+	renewalCheckRunning          atomic.Bool
+	jobProcessorRunning          atomic.Bool
+	jobRetryRunning              atomic.Bool
+	agentHealthCheckRunning      atomic.Bool
+	notificationProcessRunning   atomic.Bool
+	notificationRetryRunning     atomic.Bool
+	shortLivedExpiryCheckRunning atomic.Bool
+	networkScanRunning           atomic.Bool
+	digestRunning                atomic.Bool
+	healthCheckRunning           atomic.Bool
+	cloudDiscoveryRunning        atomic.Bool
+	jobTimeoutRunning            atomic.Bool

 	// Graceful shutdown: wait for in-flight work to complete
 	wg sync.WaitGroup
@@ -133,6 +141,7 @@ func NewScheduler(
 		jobRetryInterval:              5 * time.Minute,
 		agentHealthCheckInterval:      2 * time.Minute,
 		notificationProcessInterval:   1 * time.Minute,
+		notificationRetryInterval:     2 * time.Minute,
 		shortLivedExpiryCheckInterval: 30 * time.Second,
 		networkScanInterval:           6 * time.Hour,
 		digestInterval:                24 * time.Hour,
@@ -180,6 +189,13 @@ func (s *Scheduler) SetNotificationProcessInterval(d time.Duration) {
 	s.notificationProcessInterval = d
 }

+// SetNotificationRetryInterval configures the interval for the failed-notification
+// retry sweep (coverage gap I-005). Defaults to 2 minutes; honors
+// CERTCTL_NOTIFICATION_RETRY_INTERVAL when wired from config.
+func (s *Scheduler) SetNotificationRetryInterval(d time.Duration) {
+	s.notificationRetryInterval = d
+}
+
 // SetNetworkScanInterval configures the interval for network scanning.
 func (s *Scheduler) SetNetworkScanInterval(d time.Duration) {
 	s.networkScanInterval = d
@@ -212,7 +228,6 @@ func (s *Scheduler) SetCloudDiscoveryInterval(d time.Duration) {
 	s.cloudDiscoveryInterval = d
 }

-
 // SetJobReaperService sets the job reaper service (I-003).
 func (s *Scheduler) SetJobReaperService(jr JobReaperService) {
 	s.jobReaper = jr
@@ -232,6 +247,7 @@ func (s *Scheduler) SetAwaitingCSRTimeout(d time.Duration) {
 func (s *Scheduler) SetAwaitingApprovalTimeout(d time.Duration) {
 	s.awaitingApprovalTimeout = d
 }
+
 // Start initiates all background scheduler loops. It returns a channel that signals
 // when the scheduler has started all loops. The scheduler runs until the context is cancelled.
 func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
@@ -242,10 +258,11 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {

 		// Track all loop goroutines in the WaitGroup so WaitForCompletion
 		// blocks until they've fully exited (prevents test races).
-		// Base count is 7: renewal, job processor, job retry (I-001),
-		// job timeout (I-003), agent health, notification, short-lived expiry. Optional loops
-		// (network scan, digest, health check, cloud discovery) add to this.
-		loopCount := 7
+		// Base count is 8: renewal, job processor, job retry (I-001),
+		// job timeout (I-003), agent health, notification, notification retry
+		// (I-005), short-lived expiry. Optional loops (network scan, digest,
+		// health check, cloud discovery) add to this.
+		loopCount := 8
 		if s.networkScanService != nil {
 			loopCount++
 		}
@@ -266,6 +283,7 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
 		go func() { defer s.wg.Done(); s.jobTimeoutLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.agentHealthCheckLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.notificationProcessLoop(ctx) }()
+		go func() { defer s.wg.Done(); s.notificationRetryLoop(ctx) }()
 		go func() { defer s.wg.Done(); s.shortLivedExpiryCheckLoop(ctx) }()
 		if s.networkScanService != nil {
 			go func() { defer s.wg.Done(); s.networkScanLoop(ctx) }()
@@ -597,6 +615,64 @@ func (s *Scheduler) runNotificationProcess(ctx context.Context) {
 	}
 }

+// notificationRetryLoop runs every notificationRetryInterval and transitions
+// eligible Failed notifications back to Pending so the notification processor
+// can pick them up again. Closes coverage gap I-005: before this loop was wired,
+// NotificationService.RetryFailedNotifications had no runtime caller.
+// Runs immediately on start, then every interval.
+// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep
+// is still running. Mirrors the I-001 jobRetryLoop topology.
+func (s *Scheduler) notificationRetryLoop(ctx context.Context) {
+	ticker := time.NewTicker(s.notificationRetryInterval)
+	defer ticker.Stop()
+
+	// Run immediately on start (with idempotency guard)
+	s.notificationRetryRunning.Store(true)
+	s.wg.Add(1)
+	go func() {
+		defer s.wg.Done()
+		defer s.notificationRetryRunning.Store(false)
+		s.runNotificationRetry(ctx)
+	}()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			if !s.notificationRetryRunning.CompareAndSwap(false, true) {
+				s.logger.Warn("notification retry still running, skipping tick")
+				continue
+			}
+			s.wg.Add(1)
+			go func() {
+				defer s.wg.Done()
+				defer s.notificationRetryRunning.Store(false)
+				s.runNotificationRetry(ctx)
+			}()
+		}
+	}
+}
+
+// runNotificationRetry executes a single failed-notification retry cycle,
+// logging any error the service returns. Uses a 2-minute per-tick timeout
+// matching runJobRetry; RetryFailedNotifications issues one SELECT up front
+// (ListRetryEligible) and then one UPDATE per eligible row (cheap), so this
+// headroom covers very large failure backlogs without starving the loop. The
+// service layer swallows per-row send errors (mirrors
+// ProcessPendingNotifications) and only returns the ListRetryEligible error.
+func (s *Scheduler) runNotificationRetry(ctx context.Context) {
+	opCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+	defer cancel()
+	if err := s.notificationService.RetryFailedNotifications(opCtx); err != nil {
+		s.logger.Error("notification retry failed",
+			"error", err,
+			"interval", s.notificationRetryInterval.String())
+	} else {
+		s.logger.Debug("notification retry completed")
+	}
+}
+
 // shortLivedExpiryCheckLoop runs every shortLivedExpiryCheckInterval and marks expired
 // short-lived certificates. For certs with TTL < 1 hour, expiry IS revocation —
 // no CRL/OCSP needed.
@@ -195,12 +195,25 @@ func (m *mockAgentService) MarkStaleAgentsOffline(ctx context.Context, interval
 }

 // mockNotificationService is a mock implementation for testing.
+//
+// Tracks ProcessPendingNotifications and RetryFailedNotifications separately.
+// retrySlowDelay and retryShouldError let tests exercise the retry loop
+// independently of the processor loop without coupling their timing/failure
+// modes (coverage gap I-005 — prior to the notificationRetryLoop being wired,
+// RetryFailedNotifications had no runtime caller).
 type mockNotificationService struct {
 	mu          sync.Mutex
 	callCount   int
 	callTimes   []time.Time
 	slowDelay   time.Duration
 	shouldError bool
+
+	// Retry loop tracking (coverage gap I-005)
+	retryCallCount      int
+	retryCallTimes      []time.Time
+	retrySlowDelay      time.Duration
+	retryShouldError    bool
+	retryCtxHasDeadline bool
 }

 func (m *mockNotificationService) ProcessPendingNotifications(ctx context.Context) error {
@@ -223,6 +236,42 @@ func (m *mockNotificationService) ProcessPendingNotifications(ctx context.Contex
 	return nil
 }

+// RetryFailedNotifications is the scheduler-driven counterpart to
+// ProcessPendingNotifications that closes coverage gap I-005. Prior to the
+// notificationRetryLoop being wired, notifications that hit status='failed'
+// orphaned there forever — no retry, no DLQ, no escalation. The service-layer
+// method exists to sweep failed rows whose next_retry_at has elapsed, but
+// without a scheduler caller the sweep never runs in production.
+//
+// This mock mirrors mockJobService.RetryFailedJobs's shape: a retry-only field
+// cluster so callers can dial retrySlowDelay / retryShouldError without
+// perturbing ProcessPendingNotifications's timing, and retryCtxHasDeadline so
+// the ContextDeadlineRespected test can assert the scheduler is passing a
+// per-tick context.WithTimeout rather than the raw shutdown ctx.
+func (m *mockNotificationService) RetryFailedNotifications(ctx context.Context) error {
+	m.mu.Lock()
+	m.retryCallCount++
+	m.retryCallTimes = append(m.retryCallTimes, time.Now())
+	// Track whether context has a deadline set — the scheduler must wrap each
+	// tick in a bounded context so a hung sweep can't stall shutdown.
+	_, hasDeadline := ctx.Deadline()
+	m.retryCtxHasDeadline = hasDeadline
+	m.mu.Unlock()
+
+	if m.retrySlowDelay > 0 {
+		select {
+		case <-time.After(m.retrySlowDelay):
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+	}
+
+	if m.retryShouldError {
+		return context.Canceled
+	}
+	return nil
+}
+
 // mockNetworkScanService is a mock implementation for testing.
 type mockNetworkScanService struct {
 	mu          sync.Mutex
@@ -1358,3 +1407,221 @@ func TestScheduler_JobTimeoutLoop_ContextDeadlineRespected(t *testing.T) {
 	}
 	t.Log("timeout reaper context deadline verified")
 }
+
+// ─── NotificationRetryLoop tests (coverage gap I-005) ────────────────────────
+//
+// These four tests are the scheduler-level Red half of the I-005 fix. They
+// mirror the I-001 jobRetryLoop triplet (CallsService / IdempotencyGuard /
+// WaitForCompletion) plus the I-003 ContextDeadlineRespected shape.
+//
+// All four use the same "quiet every other loop" pattern so the only tick
+// activity visible on notificationMock is the retry loop under test. JobTimeout
+// is intentionally left unconfigured — SetJobReaperService isn't called, so the
+// timeout loop is dormant (same convention the I-001 tests follow).
+
+// TestScheduler_NotificationRetryLoop_CallsService verifies that the
+// notification retry loop invokes NotificationService.RetryFailedNotifications
+// on each tick. Closes coverage gap I-005 — prior to the loop being wired,
+// RetryFailedNotifications had no runtime caller and failed notification_events
+// rows orphaned at status='failed' forever (no retry, no DLQ, no escalation).
+//
+// Unlike the jobRetryLoop test, there is no maxRetries advisory constant to
+// forward: the max_attempts limit on notification retries lives on the row
+// itself (retry_count column introduced by migration 000016), not in the call
+// signature.
+func TestScheduler_NotificationRetryLoop_CallsService(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
+	renewalMock := &mockRenewalService{}
+	jobMock := &mockJobService{}
+	agentMock := &mockAgentService{}
+	notificationMock := &mockNotificationService{}
+	networkMock := &mockNetworkScanService{}
+
+	sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger)
+	// Quiet every other loop so only the retry loop's calls are visible on notificationMock.
+	sched.SetRenewalCheckInterval(10 * time.Second)
+	sched.SetJobProcessorInterval(10 * time.Second)
+	sched.SetAgentHealthCheckInterval(10 * time.Second)
+	sched.SetNotificationProcessInterval(10 * time.Second)
+	sched.SetNetworkScanInterval(10 * time.Second)
+	sched.SetJobRetryInterval(10 * time.Second)
+	sched.SetNotificationRetryInterval(50 * time.Millisecond)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	startedChan := sched.Start(ctx)
+	<-startedChan
+
+	// Run long enough for the immediate start + at least one tick.
+	time.Sleep(200 * time.Millisecond)
+	cancel()
+	_ = sched.WaitForCompletion(2 * time.Second)
+
+	notificationMock.mu.Lock()
+	retryCount := notificationMock.retryCallCount
+	notificationMock.mu.Unlock()
+
+	if retryCount < 1 {
+		t.Fatalf("expected notification retry service to be called at least once, got %d", retryCount)
+	}
+	t.Logf("notification retry loop called %d times", retryCount)
+}
+
+// TestScheduler_NotificationRetryLoop_IdempotencyGuard verifies that a slow
+// retry sweep does not cause overlapping executions. Mirrors the shape of
+// TestScheduler_JobRetryLoop_IdempotencyGuard.
+//
+// The guard is the atomic.Bool notificationRetryRunning in scheduler.go.
+// Without it, a 100ms tick against a 150ms operation would fire ~4 times in
+// 400ms; with the guard we expect ~2–3 calls. Anything above 3 is logged as a
+// warning (not a hard failure) so CI timing noise doesn't produce flakes.
+func TestScheduler_NotificationRetryLoop_IdempotencyGuard(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
+	renewalMock := &mockRenewalService{}
+	jobMock := &mockJobService{}
+	agentMock := &mockAgentService{}
+	notificationMock := &mockNotificationService{
+		retrySlowDelay: 150 * time.Millisecond, // slower than tick interval
+	}
+	networkMock := &mockNetworkScanService{}
+
+	sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger)
+	sched.SetRenewalCheckInterval(10 * time.Second)
+	sched.SetJobProcessorInterval(10 * time.Second)
+	sched.SetAgentHealthCheckInterval(10 * time.Second)
+	sched.SetNotificationProcessInterval(10 * time.Second)
+	sched.SetNetworkScanInterval(10 * time.Second)
+	sched.SetJobRetryInterval(10 * time.Second)
+	sched.SetNotificationRetryInterval(100 * time.Millisecond)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	startedChan := sched.Start(ctx)
+	<-startedChan
+
+	time.Sleep(400 * time.Millisecond)
+
+	notificationMock.mu.Lock()
+	retryCount := notificationMock.retryCallCount
+	notificationMock.mu.Unlock()
+
+	// With a 150ms sweep and 100ms interval, a functioning guard should yield
+	// roughly 2–3 calls (immediate + any ticks whose previous sweep finished).
+	// Anything above 3 suggests the guard isn't holding.
+	if retryCount > 3 {
+		t.Logf("WARNING: retry called %d times in 400ms with 100ms interval and 150ms sweep — guard may not be working", retryCount)
+	}
+
+	t.Logf("notification retry idempotency guard: %d calls in 400ms (100ms interval, 150ms sweep)", retryCount)
+
+	cancel()
+	if err := sched.WaitForCompletion(2 * time.Second); err != nil {
+		t.Fatalf("WaitForCompletion should succeed: %v", err)
+	}
+}
+
+// TestScheduler_NotificationRetryLoop_WaitForCompletion verifies that a retry
+// sweep still in flight at shutdown is awaited by WaitForCompletion — the same
+// sync.WaitGroup contract every other loop satisfies. The test asserts that
+// WaitForCompletion returns without error and that at least one sweep was
+// started (retryCount >= 1). It does not assert a lower bound on elapsed time:
+// the mock sweep returns as soon as ctx is cancelled, so a sweep goroutine that
+// was never registered on s.wg would not be detected by this test alone.
+func TestScheduler_NotificationRetryLoop_WaitForCompletion(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
+	renewalMock := &mockRenewalService{}
+	jobMock := &mockJobService{}
+	agentMock := &mockAgentService{}
+	notificationMock := &mockNotificationService{
+		retrySlowDelay: 100 * time.Millisecond,
+	}
+	networkMock := &mockNetworkScanService{}
+
+	sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger)
+	sched.SetRenewalCheckInterval(10 * time.Second)
+	sched.SetJobProcessorInterval(10 * time.Second)
+	sched.SetAgentHealthCheckInterval(10 * time.Second)
+	sched.SetNotificationProcessInterval(10 * time.Second)
+	sched.SetNetworkScanInterval(10 * time.Second)
+	sched.SetJobRetryInterval(10 * time.Second)
+	sched.SetNotificationRetryInterval(50 * time.Millisecond)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	startedChan := sched.Start(ctx)
+	<-startedChan
+
+	// Let the immediate-start retry goroutine begin its 100ms sweep.
+	time.Sleep(30 * time.Millisecond)
+
+	// Initiate shutdown mid-sweep.
+	cancel()
+
+	start := time.Now()
+	err := sched.WaitForCompletion(5 * time.Second)
+	elapsed := time.Since(start)
+
+	if err != nil {
+		t.Fatalf("WaitForCompletion should not error: %v", err)
+	}
+	if elapsed > 5*time.Second {
+		t.Fatalf("WaitForCompletion took longer than expected: %v", elapsed)
+	}
+
+	notificationMock.mu.Lock()
+	retryCount := notificationMock.retryCallCount
+	notificationMock.mu.Unlock()
+
+	if retryCount < 1 {
+		t.Fatalf("expected notification retry service to have started at least once before shutdown, got %d", retryCount)
+	}
+	t.Logf("notification retry loop graceful shutdown completed in %v after %d in-flight sweep(s)", elapsed, retryCount)
+}
+
+// TestScheduler_NotificationRetryLoop_ContextDeadlineRespected verifies that
+// the retry loop hands RetryFailedNotifications a context with a deadline set
+// (the mock records this for its most recent call only). Mirrors
+// TestScheduler_JobTimeoutLoop_ContextDeadlineRespected.
+//
+// The per-tick context.WithTimeout exists so a pathologically slow sweep (e.g.
+// a misbehaving DB lock) can't stall the scheduler indefinitely — the wrapping
+// context expires, the sweep returns ctx.Err(), and WaitGroup.Done() fires.
+func TestScheduler_NotificationRetryLoop_ContextDeadlineRespected(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(os.Stderr, nil))
+	renewalMock := &mockRenewalService{}
+	jobMock := &mockJobService{}
+	agentMock := &mockAgentService{}
+	notificationMock := &mockNotificationService{}
+	networkMock := &mockNetworkScanService{}
+
+	sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger)
+	sched.SetRenewalCheckInterval(10 * time.Second)
+	sched.SetJobProcessorInterval(10 * time.Second)
+	sched.SetAgentHealthCheckInterval(10 * time.Second)
+	sched.SetNotificationProcessInterval(10 * time.Second)
+	sched.SetNetworkScanInterval(10 * time.Second)
+	sched.SetJobRetryInterval(10 * time.Second)
+	sched.SetNotificationRetryInterval(50 * time.Millisecond)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	<-sched.Start(ctx)
+	time.Sleep(100 * time.Millisecond)
+	cancel()
+	if err := sched.WaitForCompletion(2 * time.Second); err != nil {
+		t.Fatalf("WaitForCompletion: %v", err)
+	}
+
+	notificationMock.mu.Lock()
+	hasDeadline := notificationMock.retryCtxHasDeadline
+	notificationMock.mu.Unlock()
+
+	if !hasDeadline {
+		t.Fatal("expected notification retry context to have a deadline set, but none found")
+	}
+	t.Log("notification retry context deadline verified")
+}