I-005: notification retry loop + dead-letter queue

Critical alerts can no longer be silently dropped by a transient notifier failure. Failed notification attempts now ride an exponential backoff retry loop, with a 5-attempt budget before promotion to the dead-letter queue for operator intervention.

Schema (migration 000016, idempotent):
- retry_count INTEGER NOT NULL DEFAULT 0
- next_retry_at TIMESTAMPTZ
- last_error TEXT
- idx_notification_events_retry_sweep partial index (next_retry_at) WHERE status='failed' AND next_retry_at IS NOT NULL

Dead rows clear next_retry_at so the index stops matching them.

Service contract:
- NotificationService.RetryFailedNotifications drives 2^n-minute exponential backoff capped at 1h (notifRetryBackoffCap) with 5-attempt budget (notifRetryMaxAttempts).
- Exhaustion (RetryCount >= notifRetryMaxAttempts-1) promotes to status='dead' via MarkAsDead.
- Non-terminal failures record via RecordFailedAttempt.
- Success path promotes to 'sent' without touching retry_count (audit preserves "delivered on attempt N").
- Missing-notifier branch defensively promotes to 'sent' to avoid wedging a row on a deleted channel.
- RequeueNotification operator escape hatch atomically resets retry_count -> 0, next_retry_at -> NULL, last_error -> NULL, status -> pending via notifRepo.Requeue.

Scheduler:
- New always-on notificationRetryLoop wired into the base loop set at CERTCTL_NOTIFICATION_RETRY_INTERVAL (default 2m).
- sync/atomic.Bool idempotency guard.
- sync.WaitGroup shutdown drain via WaitForCompletion.

StatsService:
- SetNotifRepo setter pattern preserves 9 pre-existing NewStatsService call sites (main.go + stats_test.go + 8 digest tests) without touching the constructor signature.
- DashboardSummary.NotificationsDead populated via notifRepo.CountByStatus(ctx, "dead") — nil-safe when unwired (reports zero on systems without a notification repository).
- CountByStatus error is non-fatal (dashboard summary is best-effort for this field).
- Prometheus certctl_notification_dead_total counter emitted from the same snapshot.

Handler:
- New POST /api/v1/notifications/{id}/requeue endpoint.
- dead status surfaces to MCP + CLI.

Frontend:
- NotificationsPage gains two-tab toolbar ("All" / "Dead letter") with queryKey: ['notifications', activeTab] so switching tabs doesn't serve stale data until the 30s refetch.
- Dead rows surface "Retry {n}/5" + truncated last_error with full-text title tooltip.
- Requeue mutation wrapped as mutationFn: (id: string) => requeueNotification(id) to prevent react-query v5's positional context argument from leaking into the API client — pinned against future refactors by strict-match toHaveBeenCalledWith('notif-dead-001') in NotificationsPage.test.tsx:181.

Closes I-005.
2026-08-06 11:07:48 +00:00 · 2026-04-19 15:17:27 +00:00
parent 707d8de4fb
commit 675b87ba63
33 changed files with 3758 additions and 228 deletions
@@ -17,7 +17,6 @@ import (
 	"github.com/shankar0123/certctl/internal/api/middleware"
 	"github.com/shankar0123/certctl/internal/api/router"
 	"github.com/shankar0123/certctl/internal/config"
-	"github.com/shankar0123/certctl/internal/domain"
 	discoveryawssm "github.com/shankar0123/certctl/internal/connector/discovery/awssm"
 	discoveryazurekv "github.com/shankar0123/certctl/internal/connector/discovery/azurekv"
 	discoverygcpsm "github.com/shankar0123/certctl/internal/connector/discovery/gcpsm"
@@ -26,6 +25,7 @@ import (
 	notifypagerduty "github.com/shankar0123/certctl/internal/connector/notifier/pagerduty"
 	notifyslack "github.com/shankar0123/certctl/internal/connector/notifier/slack"
 	notifyteams "github.com/shankar0123/certctl/internal/connector/notifier/teams"
+	"github.com/shankar0123/certctl/internal/domain"
 	"github.com/shankar0123/certctl/internal/repository/postgres"
 	"github.com/shankar0123/certctl/internal/scheduler"
 	"github.com/shankar0123/certctl/internal/service"
@@ -353,6 +353,12 @@ func main() {

 	// Initialize stats and metrics services
 	statsService := service.NewStatsService(certificateRepo, jobRepo, agentRepo)
+	// I-005: wire the notification repository so DashboardSummary.NotificationsDead
+	// is populated, which in turn drives the Prometheus counter
+	// certctl_notification_dead_total in GetPrometheusMetrics. Setter
+	// pattern keeps NewStatsService's nine call sites (main.go + stats_test.go
+	// + 8 digest_test.go sites) untouched.
+	statsService.SetNotifRepo(notificationRepo)
 	logger.Info("initialized stats service")

 	// Initialize API handlers
@@ -447,6 +453,14 @@ func main() {
 	sched.SetJobRetryInterval(cfg.Scheduler.RetryInterval)
 	sched.SetAgentHealthCheckInterval(cfg.Scheduler.AgentHealthCheckInterval)
 	sched.SetNotificationProcessInterval(cfg.Scheduler.NotificationProcessInterval)
+	// I-005: drive the failed-notification retry sweep. Runs every
+	// NotificationRetryInterval (default 2m, CERTCTL_NOTIFICATION_RETRY_INTERVAL)
+	// and transitions eligible Failed notifications whose next_retry_at has
+	// arrived back to Pending so the notification processor picks them up on
+	// its next tick. Kept adjacent to the notification processor setter
+	// because they share the NotificationServicer dependency (same placement
+	// pattern as I-001's SetJobRetryInterval above).
+	sched.SetNotificationRetryInterval(cfg.Scheduler.NotificationRetryInterval)
 	if cfg.NetworkScan.Enabled {
 		sched.SetNetworkScanInterval(cfg.NetworkScan.ScanInterval)
 		logger.Info("network scanning enabled", "interval", cfg.NetworkScan.ScanInterval.String())
@@ -469,7 +483,6 @@ func main() {
 			"sources", cloudDiscoveryService.SourceCount())
 	}

-
 	// Wire job timeout reaper (I-003)
 	sched.SetJobReaperService(jobService)
 	sched.SetJobTimeoutInterval(cfg.Scheduler.JobTimeoutInterval)
@@ -489,28 +502,28 @@ func main() {
 	// Build the API router with all handlers
 	apiRouter := router.New()
 	apiRouter.RegisterHandlers(router.HandlerRegistry{
-		Certificates:  certificateHandler,
-		Issuers:       issuerHandler,
-		Targets:       targetHandler,
-		Agents:        agentHandler,
-		Jobs:          jobHandler,
-		Policies:      policyHandler,
-		Profiles:      profileHandler,
-		Teams:         teamHandler,
-		Owners:        ownerHandler,
-		AgentGroups:   agentGroupHandler,
-		Audit:         auditHandler,
-		Notifications: notificationHandler,
-		Stats:         statsHandler,
-		Metrics:       metricsHandler,
-		Health:        healthHandler,
-		Discovery:     discoveryHandler,
-		NetworkScan:   networkScanHandler,
-		Verification:  verificationHandler,
-		Export:        exportHandler,
-		Digest:        *digestHandler,
-		HealthChecks:     healthCheckHandler,
-		BulkRevocation:   bulkRevocationHandler,
+		Certificates:   certificateHandler,
+		Issuers:        issuerHandler,
+		Targets:        targetHandler,
+		Agents:         agentHandler,
+		Jobs:           jobHandler,
+		Policies:       policyHandler,
+		Profiles:       profileHandler,
+		Teams:          teamHandler,
+		Owners:         ownerHandler,
+		AgentGroups:    agentGroupHandler,
+		Audit:          auditHandler,
+		Notifications:  notificationHandler,
+		Stats:          statsHandler,
+		Metrics:        metricsHandler,
+		Health:         healthHandler,
+		Discovery:      discoveryHandler,
+		NetworkScan:    networkScanHandler,
+		Verification:   verificationHandler,
+		Export:         exportHandler,
+		Digest:         *digestHandler,
+		HealthChecks:   healthCheckHandler,
+		BulkRevocation: bulkRevocationHandler,
 	})
 	// Register EST (RFC 7030) handlers if enabled
 	if cfg.EST.Enabled {
@@ -845,4 +858,3 @@ func preflightSCEPChallengePassword(enabled bool, challengePassword string) erro
 	}
 	return nil
 }
-