mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-11 09:58:52 +00:00
I-005: notification retry loop + dead-letter queue
Critical alerts can no longer be silently dropped by a transient
notifier failure. Failed notification attempts now ride an exponential
backoff retry loop, with a 5-attempt budget before promotion to the
dead-letter queue for operator intervention.
Schema (migration 000016, idempotent):
- retry_count INTEGER NOT NULL DEFAULT 0
- next_retry_at TIMESTAMPTZ
- last_error TEXT
- idx_notification_events_retry_sweep partial index
(next_retry_at) WHERE status='failed' AND next_retry_at IS NOT NULL
Dead rows clear next_retry_at so the index stops matching them.
Service contract:
- NotificationService.RetryFailedNotifications drives 2^n-minute
exponential backoff capped at 1h (notifRetryBackoffCap) with a
5-attempt budget (notifRetryMaxAttempts).
- Exhaustion (RetryCount >= notifRetryMaxAttempts-1) promotes to
status='dead' via MarkAsDead.
- Non-terminal failures record via RecordFailedAttempt.
- Success path promotes to 'sent' without touching retry_count
(audit preserves "delivered on attempt N").
- Missing-notifier branch defensively promotes to 'sent' to avoid
wedging a row on a deleted channel.
- RequeueNotification operator escape hatch atomically resets
retry_count -> 0, next_retry_at -> NULL, last_error -> NULL,
status -> pending via notifRepo.Requeue.
Scheduler:
- New always-on notificationRetryLoop wired into the base loop set at
CERTCTL_NOTIFICATION_RETRY_INTERVAL (default 2m).
- sync/atomic.Bool idempotency guard.
- sync.WaitGroup shutdown drain via WaitForCompletion.
StatsService:
- SetNotifRepo setter pattern preserves 9 pre-existing
NewStatsService call sites (main.go + stats_test.go + 8 digest
tests) without touching the constructor signature.
- DashboardSummary.NotificationsDead populated via
notifRepo.CountByStatus(ctx, "dead") — nil-safe when unwired
(reports zero on systems without a notification repository).
- CountByStatus error is non-fatal (dashboard summary is
best-effort for this field).
- Prometheus certctl_notification_dead_total counter emitted from
the same snapshot.
Handler:
- New POST /api/v1/notifications/{id}/requeue endpoint.
- The 'dead' status is surfaced to MCP + CLI.
Frontend:
- NotificationsPage gains a two-tab toolbar ("All" / "Dead letter")
with queryKey: ['notifications', activeTab] so switching tabs
doesn't serve stale data until the 30s refetch.
- Dead rows surface "Retry {n}/5" + truncated last_error with
full-text title tooltip.
- Requeue mutation wrapped as
mutationFn: (id: string) => requeueNotification(id)
to prevent react-query v5's positional context argument from
leaking into the API client — pinned against future refactors
by strict-match toHaveBeenCalledWith('notif-dead-001') in
NotificationsPage.test.tsx:181.
Closes I-005.
This commit is contained in:
+22
-3
@@ -974,9 +974,13 @@ func registerAuditTools(s *gomcp.Server, c *Client) {
|
||||
func registerNotificationTools(s *gomcp.Server, c *Client) {
|
||||
gomcp.AddTool(s, &gomcp.Tool{
|
||||
Name: "certctl_list_notifications",
|
||||
Description: "List notification events (expiration warnings, renewal/deployment results, policy violations, revocations).",
|
||||
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||
data, err := c.Get("/api/v1/notifications", paginationQuery(input.Page, input.PerPage))
|
||||
Description: "List notification events (expiration warnings, renewal/deployment results, policy violations, revocations). Optional status filter supports the I-005 Dead letter tab (status=dead).",
|
||||
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListNotificationsInput) (*gomcp.CallToolResult, any, error) {
|
||||
q := paginationQuery(input.Page, input.PerPage)
|
||||
if input.Status != "" {
|
||||
q.Set("status", input.Status)
|
||||
}
|
||||
data, err := c.Get("/api/v1/notifications", q)
|
||||
if err != nil {
|
||||
return errorResult(err)
|
||||
}
|
||||
@@ -1004,6 +1008,21 @@ func registerNotificationTools(s *gomcp.Server, c *Client) {
|
||||
}
|
||||
return textResult(data)
|
||||
})
|
||||
|
||||
// I-005: requeue a dead-letter notification. Flips status from 'dead'
|
||||
// back to 'pending' and clears next_retry_at so the retry sweep picks
|
||||
// the notification up on its next tick. Operator-triggered; the tool
|
||||
// is the MCP counterpart of the GUI's Dead letter tab "Requeue" button.
|
||||
gomcp.AddTool(s, &gomcp.Tool{
|
||||
Name: "certctl_requeue_notification",
|
||||
Description: "Requeue a dead notification back to pending so the retry sweep can deliver it again. Used to recover from persistent delivery failures after the underlying issue (SMTP config, webhook endpoint, etc.) has been fixed.",
|
||||
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||
data, err := c.Post("/api/v1/notifications/"+input.ID+"/requeue", nil)
|
||||
if err != nil {
|
||||
return errorResult(err)
|
||||
}
|
||||
return textResult(data)
|
||||
})
|
||||
}
|
||||
|
||||
// ── Stats ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -182,6 +182,16 @@ type RejectJobInput struct {
|
||||
Reason string `json:"reason,omitempty" jsonschema:"Reason for rejection"`
|
||||
}
|
||||
|
||||
// ── Notifications ───────────────────────────────────────────────────
|
||||
|
||||
// ListNotificationsInput adds the I-005 status filter on top of the standard
|
||||
// pagination params. Status="dead" drives the Dead letter tab use case;
|
||||
// empty status preserves the pre-I-005 list-all behavior.
|
||||
type ListNotificationsInput struct {
|
||||
// ListParams is embedded to supply the Page and PerPage pagination fields.
ListParams
|
||||
// Status, when non-empty, is forwarded as the "status" query parameter
// by the certctl_list_notifications tool; empty means no status filter.
Status string `json:"status,omitempty" jsonschema:"Filter by status: pending, sent, failed, dead, read"`
|
||||
}
|
||||
|
||||
// ── Policies ────────────────────────────────────────────────────────
|
||||
|
||||
type CreatePolicyInput struct {
|
||||
|
||||
Reference in New Issue
Block a user