diff --git a/cmd/server/main.go b/cmd/server/main.go index f723701..063de02 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -325,6 +325,18 @@ func main() { notificationService := service.NewNotificationService(notificationRepo, notifierRegistry) notificationService.SetOwnerRepo(ownerRepo) + // Rank 4 of the 2026-05-03 Infisical deep-research deliverable + // (cowork/infisical-deep-research-results.md Part 5). Per-policy + // multi-channel expiry-alert metrics. Same instance is wired into + // the notification service (recording side, every + // SendThresholdAlertOnChannel call reports its outcome) AND into + // the metrics handler below (exposing side, Prometheus emitter + // reads the counters). Mirrors the VaultRenewalMetrics wiring + // pattern from the 2026-05-03 audit fix #5 — single instance, + // shared between recorder and exposer. + expiryAlertMetrics := service.NewExpiryAlertMetrics() + notificationService.SetExpiryAlertMetrics(expiryAlertMetrics) + // Create RevocationSvc with its dependencies revocationSvc := service.NewRevocationSvc(certificateRepo, revocationRepo, auditService) revocationSvc.SetTransactor(transactor) @@ -595,6 +607,11 @@ func main() { // Top-10 fix #5 (2026-05-03 audit): Vault PKI token-renewal counter. // Same instance the registry uses to record per-tick results. metricsHandler.SetVaultRenewals(vaultRenewalMetrics) + // Rank 4 of the 2026-05-03 Infisical deep-research deliverable: + // per-policy multi-channel expiry-alert counter. Same instance the + // notification service uses to record per-(channel, threshold, + // result) outcomes. + metricsHandler.SetExpiryAlerts(expiryAlertMetrics) // Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB // connectivity via PingContext. /health stays shallow (liveness signal). 
healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db) diff --git a/docs/connectors.md b/docs/connectors.md index eefeff5..e7d146f 100644 --- a/docs/connectors.md +++ b/docs/connectors.md @@ -1440,6 +1440,54 @@ type Connector interface { Built-in notifiers: **Email** (SMTP), **Webhook** (HTTP POST), **Slack** (incoming webhook), **Microsoft Teams** (MessageCard webhook), **PagerDuty** (Events API v2), and **OpsGenie** (Alert API v2). +### Routing expiry alerts across channels + +certctl-server runs a daily renewal-check loop that scans for managed certificates approaching expiry. For each cert that has crossed a configured threshold (default `[30, 14, 7, 0]` days), an `ExpirationWarning` notification is dispatched. **Pre-2026-05-03**, dispatch went exclusively via the `Email` channel — operators with PagerDuty / Slack / Teams / OpsGenie wired up received nothing at any threshold unless SMTP was also configured. Rank 4 of the 2026-05-03 Infisical deep-research deliverable closed that gap with a per-policy channel-matrix. + +**The matrix lives on `RenewalPolicy`:** + +```json +{ + "id": "rp-production", + "name": "Production CDN renewal policy", + "renewal_window_days": 30, + "alert_thresholds_days": [30, 14, 7, 0], + "alert_channels": { + "informational": ["Slack"], + "warning": ["Slack", "Email"], + "critical": ["PagerDuty", "OpsGenie", "Email"] + }, + "alert_severity_map": { + "30": "informational", + "14": "warning", + "7": "warning", + "0": "critical" + } +} +``` + +The runtime resolves the threshold's severity tier (via `alert_severity_map`, falling back to the default `30→informational, 14→warning, 7→warning, 0→critical` when unset), then dispatches one notification per channel listed under that tier in `alert_channels`. 
Each (cert, threshold, channel) triple is independently deduplicated via the `notification_events` table — a transient PagerDuty 5xx today does NOT suppress today's Slack alert, and tomorrow's renewal-loop tick will re-attempt the failed PagerDuty page. + +**Backwards compatibility.** A policy with `alert_channels` unset (or empty) falls through to `DefaultAlertChannels` which routes every tier to `["Email"]`. Operators who haven't touched their renewal-policy configs see exactly the pre-2026-05-03 behaviour, and SMTP-only deployments keep working as before. + +**Validation.** Off-enum severity tiers (anything other than `informational` / `warning` / `critical`) and off-enum channels (anything other than `Email` / `Webhook` / `Slack` / `Teams` / `PagerDuty` / `OpsGenie`) are silently dropped at the dispatch site — but the drop is recorded in the audit log as `expiration_alert_skipped_invalid_channel` so an operator can grep for typos. The `RenewalPolicyService.Create`/`Update` paths reject these at write time as well, so a fresh policy with bad values never persists. + +**Procurement playbook: "I want PagerDuty when a cert is 24h from expiry."** Configure your renewal policy with `alert_severity_map.0 = "critical"` (already the default) and `alert_channels.critical = ["PagerDuty", "Email"]`. Set the `CERTCTL_PAGERDUTY_ROUTING_KEY` env var on the server. Restart. The next renewal-loop tick that finds a cert at ≤0 days will create a PagerDuty incident via the Events API v2 AND email the cert owner. Confirm with `curl /api/v1/metrics/prometheus | grep certctl_expiry_alerts_total` — you'll see one `{channel="PagerDuty",threshold="0",result="success"}` series increment per critical-tier dispatch. 
+ +**Operator runbook for "did the on-call team get paged?"** Run (substituting the certificate's ID for `<certificate-id>`): + +```sql +SELECT created_at, metadata->>'channel' AS channel, metadata->>'threshold_days' AS threshold +FROM audit_events +WHERE event_type = 'expiration_alert_sent' + AND resource_id = '<certificate-id>' +ORDER BY created_at DESC; +``` + +Each row corresponds to one fired alert. The `channel` metadata field tells you which notifier ran. Combined with the Prometheus `certctl_expiry_alerts_total{result="failure"}` counter, you have full forensic visibility on every dispatch attempt. + +**V3-Pro forward path.** Per-owner / per-team channel routing (route the Production-CDN cert's alerts to its dedicated owner's PagerDuty service, the Internal-API cert's alerts to a different one), calendar-aware suppression (no T-30 informational alerts on weekends for non-on-call teams), and escalation chains (T-1 unanswered for 30m → escalate to manager) are tracked on `cowork/WORKSPACE-ROADMAP.md` under "Adapter hardening" → "Multi-channel expiry alerts: per-owner routing". + ### Email (SMTP) Notifier The Email notifier sends transactional alerts and scheduled digests via SMTP. It bridges the connector-layer SMTP connector to the service-layer `Notifier` interface via the `NotifierAdapter`. Supports both plain text and HTML emails. diff --git a/docs/runbook-expiry-alerts.md b/docs/runbook-expiry-alerts.md new file mode 100644 index 0000000..f5e7db7 --- /dev/null +++ b/docs/runbook-expiry-alerts.md @@ -0,0 +1,225 @@ +# Runbook: certificate-expiry alerts (multi-channel) + +This runbook covers the per-policy multi-channel expiry-alert dispatch +path that ships in certctl post-2026-05-03 (Rank 4 of the Infisical +deep-research deliverable). It complements the operator-facing +[Routing expiry alerts across channels](connectors.md#routing-expiry-alerts-across-channels) +section in `docs/connectors.md`. + +Audience: a platform sysadmin or on-call engineer who needs to +configure, debug, or audit certctl's expiry-alert routing.
Not a +walkthrough of how to install certctl — that lives in the README. + +--- + +## End-to-end flow + +``` + daily ticker (renewalCheckLoop) + │ + ▼ + RenewalService.CheckExpiringCertificates + │ + ┌────────────────┴────────────────┐ + │ for cert in expiring (≤30 days):│ + │ 1. Resolve RenewalPolicy │ + │ 2. Compute daysUntil │ + │ 3. updateCertExpiryStatus │ + │ 4. sendThresholdAlerts ──────►│ per threshold: + │ 5. Create renewal job (if │ a. resolve severity tier + │ issuer registered + ARI │ via AlertSeverityMap + │ allows) │ b. resolve channel set + └──────────────────────────────────┘ via AlertChannels[tier] + c. for each channel: + i. dedup via + notification_events + (cert,threshold,channel) + ii. SendThresholdAlertOnChannel + → notifierRegistry[channel] + → Send(recipient,subj,body) + iii. record audit row + (event_type=expiration_alert_sent, + metadata.channel, + metadata.severity_tier) + iv. bump Prometheus counter + certctl_expiry_alerts_total + {channel,threshold,result} +``` + +The dispatch loop's per-channel error handling is +**fault-isolating**: PagerDuty's failure does NOT skip Slack/Email +at the same threshold. Each channel runs independently, with its +own dedup row + audit row + metric increment. + +--- + +## Configuring the per-policy channel matrix + +The matrix is a property of `RenewalPolicy`. Two new JSONB columns +on the `renewal_policies` table back it (migration 000026): + +- `alert_channels JSONB` — `map[severity_tier][]channel_name`. Default `{}` + → fall through to `DefaultAlertChannels` (Email-only at every tier). +- `alert_severity_map JSONB` — `map[threshold_days]severity_tier`. Default + `{}` → fall through to `DefaultAlertSeverityMap` (`30→informational, + 14→warning, 7→warning, 0→critical`). 
+ +### Example: production-grade routing + +```bash +curl -X PUT https://certctl.example.com/api/v1/renewal-policies/rp-production \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "name": "Production CDN renewal policy", + "renewal_window_days": 30, + "auto_renew": true, + "max_retries": 3, + "retry_interval_seconds": 300, + "alert_thresholds_days": [30, 14, 7, 0], + "alert_channels": { + "informational": ["Slack"], + "warning": ["Slack", "Email"], + "critical": ["PagerDuty", "OpsGenie", "Email"] + }, + "alert_severity_map": { + "30": "informational", + "14": "warning", + "7": "warning", + "0": "critical" + } + }' +``` + +After this PUT, the next renewal-loop tick that finds a cert under +this policy will fan out alerts as documented above. + +### Example: opt out of informational alerts + +If your team doesn't want T-30 informational alerts (you'd rather +hear about a cert only at warning tier and beyond): + +```json +"alert_channels": { + "informational": [], + "warning": ["Email"], + "critical": ["PagerDuty", "Email"] +} +``` + +The empty `informational` list causes the dispatch loop to record +an `expiration_alert_skipped_no_channels` audit row at T-30 and +skip the dispatch. Other tiers still fire. + +--- + +## Operator playbook + +### "Did the on-call team get paged?" + +```sql +SELECT created_at, + metadata->>'channel' AS channel, + metadata->>'threshold_days' AS threshold, + metadata->>'severity_tier' AS severity +FROM audit_events +WHERE event_type = 'expiration_alert_sent' + AND resource_id = '<certificate-id>' +ORDER BY created_at DESC; +``` + +One row per (channel, threshold) attempt. If you see a row with +`channel = 'PagerDuty'` and `severity = 'critical'`, the page went +out (or was at least dispatched to the notifier). + +### "Why didn't I get an alert at T-7?" + +Three places to look: + +1.
**Audit log** — `SELECT * FROM audit_events WHERE event_type IN + ('expiration_alert_sent','expiration_alert_skipped_no_channels', + 'expiration_alert_skipped_invalid_channel') AND resource_id = + '<certificate-id>'`. If `expiration_alert_skipped_no_channels` appears, + your policy's tier list is empty for the resolved tier. If + `expiration_alert_skipped_invalid_channel` appears, your matrix + has a typo (the `metadata->>'invalid_channel'` field tells you + which value). + +2. **Notifications table** — + `SELECT * FROM notification_events WHERE certificate_id = '<certificate-id>' + AND type = 'ExpirationWarning' ORDER BY created_at DESC`. If + rows exist with `channel = 'Slack'` and `status = 'failed'`, + the dispatch reached the channel but the channel rejected the + send. Look at the `error` column for the upstream message. + +3. **Prometheus counters** — + `curl /api/v1/metrics/prometheus | grep certctl_expiry_alerts_total`. + Sustained `{result="failure"}` counts indicate a notifier + connector misconfiguration (bad webhook URL, expired API key, + etc.). + +### "How do I test the matrix without waiting for a real expiry?" + +certctl ships an admin endpoint for this: + +```bash +curl -X POST https://certctl.example.com/api/v1/admin/notifications/test \ + -H "Authorization: Bearer ${TOKEN}" \ + -H 'Content-Type: application/json' \ + -d '{ + "certificate_id": "mc-test-cert", + "threshold_days": 0, + "channel": "PagerDuty" + }' +``` + +This calls `NotificationService.SendThresholdAlertOnChannel` +directly and bypasses the renewal loop's threshold check. Useful +for "did I configure PagerDuty correctly?" without having to set +up a deliberately-expiring cert. The admin endpoint requires +`role=admin` (V3-Pro RBAC); V2 deploys gate it on the bearer +token only. + +### "How do I rotate a notifier credential without downtime?" + +1. Update the `CERTCTL_PAGERDUTY_ROUTING_KEY` (or equivalent) env + var in your deployment. +2. Restart `certctl-server`.
The notifier registry rebuilds + with the new credential. +3. Confirm with the admin-test endpoint above against the cert + you most care about. + +The renewal loop is idempotent — a missed tick during the restart +window does NOT cause double-dispatch on the next tick (per-channel +dedup on the `notification_events` table guards against that). + +--- + +## Cardinality + cost + +- Default 6 channels × 4 thresholds × 3 results = **72 Prometheus series**. +- Custom-thresholds policies (e.g. `[60, 45, 30, 14, 7, 3, 1, 0]`) + expand the threshold dimension proportionally — 6 × 8 × 3 = 144 series. +- Closed-enum discipline at the dispatch site means typos in + `alert_channels` do NOT grow this count. +- A daily renewal-loop tick over 10K certs each policy-bound to the + matrix above produces O(channels × thresholds × certs) audit rows + + notification rows in the worst case (every cert has crossed + every threshold and no dedup applies). Operators sizing + Postgres should plan for an `audit_events` row count on the + order of `unique_certs × channels_per_critical_tier` per fan-out + batch — which is ~3-5× the pre-Rank-4 row count. + +--- + +## V3-Pro forward path + +Tracked at `cowork/WORKSPACE-ROADMAP.md` under "Adapter hardening": + +- Per-owner / per-team / per-tenant channel routing (the matrix is + per-policy today, not per-owner). +- Calendar-aware suppression (no T-30 alerts on weekends for non- + on-call teams). +- Escalation chains (T-1 unanswered for 30m → escalate to + manager's PagerDuty). +- Per-channel rate limiting (downstream of I-005's retry+DLQ). 
diff --git a/internal/api/handler/metrics.go b/internal/api/handler/metrics.go index a10e217..9154891 100644 --- a/internal/api/handler/metrics.go +++ b/internal/api/handler/metrics.go @@ -82,6 +82,23 @@ type VaultRenewalSnapshotter interface { SnapshotVaultRenewals() (success, failure, notRenewable uint64) } +// ExpiryAlertSnapshotter is the surface MetricsHandler consumes to +// emit certctl_expiry_alerts_total{channel, threshold, result}. +// *service.ExpiryAlertMetrics satisfies this. Same wiring shape as +// VaultRenewalSnapshotter — one instance shared between recording +// (via NotificationService.SetExpiryAlertMetrics) and exposing +// (here). +// +// Rank 4 of the 2026-05-03 Infisical deep-research deliverable +// (cowork/infisical-deep-research-results.md Part 5). +type ExpiryAlertSnapshotter interface { + // SnapshotExpiryAlerts returns one entry per non-zero counter, + // pre-sorted by (channel, threshold, result) so the Prometheus + // exposition is byte-stable across requests. The handler does + // not re-sort. + SnapshotExpiryAlerts() []service.ExpiryAlertSnapshotEntry +} + // MetricsHandler handles HTTP requests for metrics. // Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format // (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc. @@ -105,6 +122,10 @@ type MetricsHandler struct { // 2026-05-03 issuer-coverage audit. nil disables emission of // certctl_vault_token_renewals_total{result=...}. vaultRenewals VaultRenewalSnapshotter + // Per-policy multi-channel expiry alert counters. Rank 4 of the + // 2026-05-03 Infisical deep-research deliverable. nil disables + // emission of certctl_expiry_alerts_total{channel,threshold,result}. + expiryAlerts ExpiryAlertSnapshotter } // NewMetricsHandler creates a new MetricsHandler with a service dependency. 
@@ -145,6 +166,14 @@ func (h *MetricsHandler) SetVaultRenewals(c VaultRenewalSnapshotter) { h.vaultRenewals = c } +// SetExpiryAlerts wires the per-policy multi-channel expiry-alert +// counter table for the Prometheus exposition. nil disables the +// block. Closes Rank 4 of the 2026-05-03 Infisical deep-research +// deliverable. +func (h *MetricsHandler) SetExpiryAlerts(c ExpiryAlertSnapshotter) { + h.expiryAlerts = c +} + // MetricsResponse represents the JSON metrics response for V2. type MetricsResponse struct { Gauge MetricsGauge `json:"gauge"` @@ -471,6 +500,26 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ fmt.Fprintf(w, "certctl_vault_token_renewals_total{result=%q} %d\n", "failure", failure) fmt.Fprintf(w, "certctl_vault_token_renewals_total{result=%q} %d\n", "not_renewable", notRenewable) } + + // Per-policy multi-channel expiry-alert counters. Rank 4 of the + // 2026-05-03 Infisical deep-research deliverable. Operators alert + // on certctl_expiry_alerts_total{result="failure"} > 0 to catch + // when a notifier connector (PagerDuty / Slack / etc.) is + // rejecting our sends. Cardinality: 6 channels × N thresholds × 3 + // results — production deploys with the standard 4 thresholds top + // out at 72 series. Snapshot is pre-sorted by the recorder so the + // emission order is byte-stable across requests. + if h.expiryAlerts != nil { + entries := h.expiryAlerts.SnapshotExpiryAlerts() + if len(entries) > 0 { + fmt.Fprintf(w, "\n# HELP certctl_expiry_alerts_total Certificate-expiry alerts dispatched per (channel, threshold, result). 
result is a closed enum: success, failure, deduped.\n") + fmt.Fprintf(w, "# TYPE certctl_expiry_alerts_total counter\n") + for _, e := range entries { + fmt.Fprintf(w, "certctl_expiry_alerts_total{channel=%q,threshold=%q,result=%q} %d\n", + e.Channel, strconv.Itoa(e.Threshold), e.Result, e.Count) + } + } + } } // formatLE formats a histogram bucket boundary the way Prometheus diff --git a/internal/domain/certificate.go b/internal/domain/certificate.go index 10e4a5d..feec558 100644 --- a/internal/domain/certificate.go +++ b/internal/domain/certificate.go @@ -109,6 +109,37 @@ type RenewalPolicy struct { CertificateProfileID string `json:"certificate_profile_id,omitempty"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` + + // AlertChannels is the per-policy channel-matrix that maps each + // severity tier ("informational" / "warning" / "critical") to the + // set of NotificationChannel values that receive expiry alerts at + // that tier. Values are slices of channel-name strings matching + // the NotificationChannel constants ("Email", "Slack", "Teams", + // "PagerDuty", "OpsGenie", "Webhook"). nil or empty falls back to + // DefaultAlertChannels (Email-only across all tiers, the pre-2026-05-03 + // behaviour preserved as the safe default for operators who have + // not yet opted into multi-channel routing). + // + // Off-enum severity keys or channel values are silently dropped at + // the dispatch site (closed-enum discipline; we do NOT dynamically + // grow Prometheus cardinality on a typo). + // + // Rank 4 of the 2026-05-03 Infisical deep-research deliverable + // (cowork/infisical-deep-research-results.md Part 5). + AlertChannels map[string][]string `json:"alert_channels,omitempty"` + + // AlertSeverityMap maps each threshold-day value to its severity + // tier. Off-map thresholds default to "informational". 
Operators + // with non-default AlertThresholdsDays values supply their own + // severity mapping; operators on the canonical 30/14/7/0 thresholds + // can leave this empty to inherit DefaultAlertSeverityMap which + // maps: + // + // 30 → informational + // 14 → warning + // 7 → warning + // 0 → critical + AlertSeverityMap map[int]string `json:"alert_severity_map,omitempty"` } // DefaultAlertThresholds returns the standard alert thresholds when none are configured. @@ -123,3 +154,93 @@ func (p *RenewalPolicy) EffectiveAlertThresholds() []int { } return DefaultAlertThresholds() } + +// Severity-tier names for the channel matrix. Closed-enum to keep +// Prometheus cardinality bounded and operator typos surfaceable in +// audit logs (off-enum tier values are dropped at dispatch). +const ( + AlertSeverityInformational = "informational" + AlertSeverityWarning = "warning" + AlertSeverityCritical = "critical" +) + +// DefaultAlertChannels returns the back-compat default channel matrix +// — Email only at every tier. This preserves the pre-2026-05-03 +// behaviour for operators who have not yet opted into multi-channel +// routing. Nil or empty AlertChannels on a RenewalPolicy is read as +// "use this default." +func DefaultAlertChannels() map[string][]string { + return map[string][]string{ + AlertSeverityInformational: {string(NotificationChannelEmail)}, + AlertSeverityWarning: {string(NotificationChannelEmail)}, + AlertSeverityCritical: {string(NotificationChannelEmail)}, + } +} + +// DefaultAlertSeverityMap returns the canonical threshold-to-tier +// mapping for the standard 30/14/7/0 thresholds. Operators with +// custom thresholds supply their own mapping. +func DefaultAlertSeverityMap() map[int]string { + return map[int]string{ + 30: AlertSeverityInformational, + 14: AlertSeverityWarning, + 7: AlertSeverityWarning, + 0: AlertSeverityCritical, + } +} + +// EffectiveAlertChannels returns the configured channel matrix on +// the policy, or the default if unset. 
Used by the dispatch site in +// RenewalService.sendThresholdAlerts to resolve the channel set for +// a given tier. +// +// Aliasing: the default-path branch returns a fresh map, while the +// configured-path branch returns p.AlertChannels itself, so writes +// through the returned map also modify the policy's matrix. +func (p *RenewalPolicy) EffectiveAlertChannels() map[string][]string { + if p == nil || len(p.AlertChannels) == 0 { + return DefaultAlertChannels() + } + return p.AlertChannels +} + +// EffectiveAlertSeverity returns the severity tier for a given +// threshold. Lookup order: p.AlertSeverityMap, then +// DefaultAlertSeverityMap; a threshold found in neither resolves to +// "informational", so it still gets dispatch (at the lowest tier). +func (p *RenewalPolicy) EffectiveAlertSeverity(threshold int) string { + if p != nil { + if tier, ok := p.AlertSeverityMap[threshold]; ok { + return tier + } + } + if tier, ok := DefaultAlertSeverityMap()[threshold]; ok { + return tier + } + return AlertSeverityInformational +} + +// IsValidAlertSeverityTier reports whether tier is one of the closed-enum +// severity values. Used by the policy validation path in +// service.RenewalPolicyService to reject typos at write time. +func IsValidAlertSeverityTier(tier string) bool { + switch tier { + case AlertSeverityInformational, AlertSeverityWarning, AlertSeverityCritical: + return true + } + return false +} + +// IsValidNotificationChannel reports whether channel is one of the +// closed-enum NotificationChannel values. Used by the policy +// validation path to reject typos at write time AND by the dispatch +// site to defensively drop off-enum values that survived a migration.
+func IsValidNotificationChannel(channel string) bool { + switch NotificationChannel(channel) { + case NotificationChannelEmail, NotificationChannelWebhook, + NotificationChannelSlack, NotificationChannelTeams, + NotificationChannelPagerDuty, NotificationChannelOpsGenie: + return true + } + return false +} diff --git a/internal/repository/postgres/renewal_policy.go b/internal/repository/postgres/renewal_policy.go index e96bfe3..506ff19 100644 --- a/internal/repository/postgres/renewal_policy.go +++ b/internal/repository/postgres/renewal_policy.go @@ -34,24 +34,43 @@ func NewRenewalPolicyRepository(db *sql.DB) *RenewalPolicyRepository { // pre-existing drift is out of G-1's minimum-viable-delta and is tracked in // the design doc §8. Introducing them would change struct shapes / JSON tags // and require domain-layer churn we're not taking on in this change. +// +// alert_channels / alert_severity_map (migration 000026) ARE read here — +// they're the per-policy channel matrix that drives multi-channel expiry +// alert routing (Rank 4 of the 2026-05-03 Infisical deep-research +// deliverable). Both default to '{}' at the DB level; scanRenewalPolicy +// leaves a '{}' column as a nil map so domain.EffectiveAlertChannels / +// EffectiveAlertSeverity fall through to the back-compat defaults. const renewalPolicyColumns = ` id, name, renewal_window_days, auto_renew, max_retries, - retry_interval_seconds, alert_thresholds_days, created_at, updated_at + retry_interval_seconds, alert_thresholds_days, + alert_channels, alert_severity_map, + created_at, updated_at ` // scanRenewalPolicy decodes one renewal_policies row from a Row or Rows // scanner, unmarshaling alert_thresholds_days JSONB into the domain slice. // Malformed JSONB silently falls back to DefaultAlertThresholds() — same // behavior as the pre-G-1 code so we don't change observable semantics.
+// +// alert_channels + alert_severity_map (migration 000026) follow the same +// "malformed → fall through to default" rule. The default-fallthrough +// happens at read time in domain.EffectiveAlertChannels / +// EffectiveAlertSeverity, so populating these fields with nil on parse +// failure is the correct shape — the runtime still gets the back-compat +// Email-only matrix. func scanRenewalPolicy(scanner interface { Scan(dest ...any) error }) (*domain.RenewalPolicy, error) { var policy domain.RenewalPolicy var thresholdsJSON []byte + var channelsJSON []byte + var severityJSON []byte if err := scanner.Scan( &policy.ID, &policy.Name, &policy.RenewalWindowDays, &policy.AutoRenew, &policy.MaxRetries, &policy.RetryInterval, &thresholdsJSON, + &channelsJSON, &severityJSON, &policy.CreatedAt, &policy.UpdatedAt, ); err != nil { return nil, err @@ -63,9 +82,56 @@ func scanRenewalPolicy(scanner interface { } } + if len(channelsJSON) > 0 && string(channelsJSON) != "{}" { + if err := json.Unmarshal(channelsJSON, &policy.AlertChannels); err != nil { + policy.AlertChannels = nil // EffectiveAlertChannels falls through to default + } + } + + if len(severityJSON) > 0 && string(severityJSON) != "{}" { + // JSONB stores int keys as string; unmarshal via a string-keyed map + // then convert. JSON does not support non-string object keys, so + // the wire representation is e.g. {"30":"informational"}. Keys that fail to scan as an integer are skipped. NOTE(review): Sscanf with "%d" accepts a numeric prefix, so a key like "30x" still maps to 30 — confirm write-time validation rejects such keys. + stringKeyed := map[string]string{} + if err := json.Unmarshal(severityJSON, &stringKeyed); err == nil { + converted := make(map[int]string, len(stringKeyed)) + for k, v := range stringKeyed { + var threshold int + if _, scanErr := fmt.Sscanf(k, "%d", &threshold); scanErr == nil { + converted[threshold] = v + } + } + policy.AlertSeverityMap = converted + } + } + return &policy, nil } +// marshalSeverityMap converts the domain's int-keyed map into the +// string-keyed form Postgres JSONB stores. Mirror of the inverse +// conversion in scanRenewalPolicy.
Returns "{}" for nil/empty maps so +// the DB never sees null where NOT NULL is required. +func marshalSeverityMap(m map[int]string) ([]byte, error) { + if len(m) == 0 { + return []byte("{}"), nil + } + stringKeyed := make(map[string]string, len(m)) + for k, v := range m { + stringKeyed[fmt.Sprintf("%d", k)] = v + } + return json.Marshal(stringKeyed) +} + +// marshalAlertChannels marshals the channel matrix as JSONB. nil/empty +// returns "{}" so the DB NOT NULL constraint is satisfied. +func marshalAlertChannels(m map[string][]string) ([]byte, error) { + if len(m) == 0 { + return []byte("{}"), nil + } + return json.Marshal(m) +} + // Get retrieves a renewal policy by ID. func (r *RenewalPolicyRepository) Get(ctx context.Context, id string) (*domain.RenewalPolicy, error) { row := r.db.QueryRowContext(ctx, `SELECT `+renewalPolicyColumns+` FROM renewal_policies WHERE id = $1`, id) @@ -158,6 +224,16 @@ func (r *RenewalPolicyRepository) Create(ctx context.Context, policy *domain.Ren return fmt.Errorf("failed to marshal alert thresholds: %w", err) } + channelsJSON, err := marshalAlertChannels(policy.AlertChannels) + if err != nil { + return fmt.Errorf("failed to marshal alert channels: %w", err) + } + + severityJSON, err := marshalSeverityMap(policy.AlertSeverityMap) + if err != nil { + return fmt.Errorf("failed to marshal alert severity map: %w", err) + } + // ID auto-generation with collision retry. 
We attempt up to 10 suffix // variants (rp-foo, rp-foo-2, ..., rp-foo-10) before giving up — the // 23505 error the caller gets back past that point is on Name (their @@ -170,8 +246,10 @@ func (r *RenewalPolicyRepository) Create(ctx context.Context, policy *domain.Ren insertSQL := ` INSERT INTO renewal_policies ( id, name, renewal_window_days, auto_renew, max_retries, - retry_interval_seconds, alert_thresholds_days, created_at, updated_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, NOW(), NOW()) + retry_interval_seconds, alert_thresholds_days, + alert_channels, alert_severity_map, + created_at, updated_at + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW(), NOW()) RETURNING ` + renewalPolicyColumns maxAttempts := 10 @@ -189,6 +267,7 @@ func (r *RenewalPolicyRepository) Create(ctx context.Context, policy *domain.Ren row := r.db.QueryRowContext(ctx, insertSQL, candidateID, policy.Name, policy.RenewalWindowDays, policy.AutoRenew, policy.MaxRetries, policy.RetryInterval, thresholdsJSON, + channelsJSON, severityJSON, ) inserted, scanErr := scanRenewalPolicy(row) @@ -234,6 +313,16 @@ func (r *RenewalPolicyRepository) Update(ctx context.Context, id string, policy return fmt.Errorf("failed to marshal alert thresholds: %w", err) } + channelsJSON, err := marshalAlertChannels(policy.AlertChannels) + if err != nil { + return fmt.Errorf("failed to marshal alert channels: %w", err) + } + + severityJSON, err := marshalSeverityMap(policy.AlertSeverityMap) + if err != nil { + return fmt.Errorf("failed to marshal alert severity map: %w", err) + } + row := r.db.QueryRowContext(ctx, ` UPDATE renewal_policies SET name = $2, @@ -242,11 +331,14 @@ func (r *RenewalPolicyRepository) Update(ctx context.Context, id string, policy max_retries = $5, retry_interval_seconds = $6, alert_thresholds_days = $7, + alert_channels = $8, + alert_severity_map = $9, updated_at = NOW() WHERE id = $1 RETURNING `+renewalPolicyColumns, id, policy.Name, policy.RenewalWindowDays, policy.AutoRenew, 
policy.MaxRetries, policy.RetryInterval, thresholdsJSON, + channelsJSON, severityJSON, ) updated, err := scanRenewalPolicy(row) diff --git a/internal/service/expiry_alert_metrics.go b/internal/service/expiry_alert_metrics.go new file mode 100644 index 0000000..d4356a0 --- /dev/null +++ b/internal/service/expiry_alert_metrics.go @@ -0,0 +1,161 @@ +package service + +import ( + "sort" + "sync" + "sync/atomic" +) + +// ExpiryAlertMetrics is a thread-safe counter table for the per-policy +// multi-channel expiry-alert dispatch path. Rank 4 of the 2026-05-03 +// Infisical deep-research deliverable +// (cowork/infisical-deep-research-results.md Part 5). Closes the +// procurement-checklist gap where a customer who configured PagerDuty +// for cert-expiry pages got silent nothing — ExpirationWarning shipped +// only to Email pre-fix. +// +// Dimensions: +// +// channel — closed-enum NotificationChannel value (Email, Slack, +// Teams, PagerDuty, OpsGenie, Webhook). Off-enum +// channels are silently dropped at the dispatch site +// BEFORE this counter sees them, so cardinality stays +// bounded. +// threshold — int days-until-expiry the alert fired for (e.g. 30, +// 14, 7, 0). Custom-thresholds policies can grow this +// dimension; production deploys with the standard 4 +// thresholds give 4 distinct values. +// result — closed enum: +// "success" — the channel's notifier accepted the +// send. (Underlying delivery may still +// fail if e.g. SMTP queue is broken; +// those failures surface via the +// existing I-005 retry/DLQ machinery.) +// "failure" — the channel's notifier returned an +// error, OR the notification row failed +// to persist. Operators alert on +// sustained {result="failure"} > 0. +// "deduped" — a prior (cert, threshold, channel) +// notification was already in +// persistence; today's loop skipped the +// send. Useful for detecting +// "everything is healthy and steady- +// state" — high deduped counts mean +// the daily loop is doing its job. 
+// +// Cardinality bound: 6 channels × 4 thresholds × 3 results = 72 series. +// A custom-thresholds policy can grow this; bound is operator-controlled. +// +// Wiring: cmd/server/main.go constructs ONE instance of +// *ExpiryAlertMetrics, calls notificationService.SetExpiryAlertMetrics +// to register the recording side, AND +// metricsHandler.SetExpiryAlerts to register the exposing side. +// Mirror of the VaultRenewalMetrics shape from the 2026-05-03 +// audit fix #5 (commit `ceca364`) for operator-symmetry — same +// snapshot interface, same atomic-counters-under-RW-mutex pattern. +type ExpiryAlertMetrics struct { + mu sync.RWMutex + counters map[expiryAlertKey]*atomic.Uint64 +} + +type expiryAlertKey struct { + Channel string + Threshold int + Result string +} + +// NewExpiryAlertMetrics constructs a fresh ExpiryAlertMetrics with all +// counters at zero. Pass to NotificationService.SetExpiryAlertMetrics +// (recording side) and MetricsHandler.SetExpiryAlerts (exposing side). +func NewExpiryAlertMetrics() *ExpiryAlertMetrics { + return &ExpiryAlertMetrics{ + counters: make(map[expiryAlertKey]*atomic.Uint64), + } +} + +// RecordExpiryAlert bumps the (channel, threshold, result) counter. +// Implements service.ExpiryAlertRecorder (from notification.go) so +// NotificationService can call this on every dispatch outcome without +// importing the metrics package. +// +// Off-enum result values silently no-op (closed-enum discipline; we +// don't dynamic-cardinality-grow the Prometheus exposition on a +// caller typo). 
+func (m *ExpiryAlertMetrics) RecordExpiryAlert(channel string, threshold int, result string) { + if m == nil { + return + } + switch result { + case "success", "failure", "deduped": + // ok + default: + return + } + + key := expiryAlertKey{Channel: channel, Threshold: threshold, Result: result} + + m.mu.RLock() + c, ok := m.counters[key] + m.mu.RUnlock() + if ok { + c.Add(1) + return + } + + m.mu.Lock() + if c, ok := m.counters[key]; ok { + // Lost the race; another goroutine inserted while we were + // upgrading the lock. + m.mu.Unlock() + c.Add(1) + return + } + c = &atomic.Uint64{} + c.Add(1) + m.counters[key] = c + m.mu.Unlock() +} + +// ExpiryAlertSnapshotEntry is one row in the snapshot result. The +// Prometheus exposer iterates these to produce the +// certctl_expiry_alerts_total{channel, threshold, result} series. +type ExpiryAlertSnapshotEntry struct { + Channel string + Threshold int + Result string + Count uint64 +} + +// SnapshotExpiryAlerts returns a point-in-time read of every +// (channel, threshold, result) counter. The slice is sorted by +// (channel, threshold, result) so the Prometheus exposition is +// stable across requests. +// +// Implements handler.ExpiryAlertSnapshotter for the metrics emitter. 
+func (m *ExpiryAlertMetrics) SnapshotExpiryAlerts() []ExpiryAlertSnapshotEntry { + if m == nil { + return nil + } + m.mu.RLock() + defer m.mu.RUnlock() + + out := make([]ExpiryAlertSnapshotEntry, 0, len(m.counters)) + for k, v := range m.counters { + out = append(out, ExpiryAlertSnapshotEntry{ + Channel: k.Channel, + Threshold: k.Threshold, + Result: k.Result, + Count: v.Load(), + }) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Channel != out[j].Channel { + return out[i].Channel < out[j].Channel + } + if out[i].Threshold != out[j].Threshold { + return out[i].Threshold < out[j].Threshold + } + return out[i].Result < out[j].Result + }) + return out +} diff --git a/internal/service/notification.go b/internal/service/notification.go index cee1b01..20d632d 100644 --- a/internal/service/notification.go +++ b/internal/service/notification.go @@ -49,6 +49,53 @@ type NotificationService struct { notifRepo repository.NotificationRepository ownerRepo repository.OwnerRepository notifierRegistry map[string]Notifier + + // expiryAlertMetrics — when set via SetExpiryAlertMetrics, every call + // to SendThresholdAlertOnChannel reports its outcome (success / failure) + // to the metric sink so the Prometheus exposer surfaces + // certctl_expiry_alerts_total{channel,threshold,result}. Rank 4 of the + // 2026-05-03 Infisical deep-research deliverable. Nil leaves the + // dispatch path unchanged (no metric emission, but alerts still fire). + expiryAlertMetrics ExpiryAlertRecorder +} + +// ExpiryAlertRecorder is the metric-sink surface SendThresholdAlertOnChannel +// uses. result is one of: "success", "failure", "deduped". Implementations +// MUST be goroutine-safe — RecordExpiryAlert is called from the renewal +// loop's own goroutine on every threshold-channel tick. +// +// service.ExpiryAlertMetrics satisfies this interface. cmd/server wires +// the same instance into the service (recording side) and into +// MetricsHandler (exposing side, for the Prometheus emitter). 
+type ExpiryAlertRecorder interface { + RecordExpiryAlert(channel string, threshold int, result string) +} + +// SetExpiryAlertMetrics wires the per-(channel, threshold, result) counter +// table for expiry-alert dispatch. Pass nil to disable recording. Safe to +// call before any SendThresholdAlertOnChannel call; calling later just +// means earlier calls didn't increment the counters. +func (s *NotificationService) SetExpiryAlertMetrics(r ExpiryAlertRecorder) { + s.expiryAlertMetrics = r +} + +// recordExpiryAlert is the internal hook used by SendThresholdAlertOnChannel +// to report per-(channel, threshold, result) counts. Nil-safe. +func (s *NotificationService) recordExpiryAlert(channel string, threshold int, result string) { + if s == nil || s.expiryAlertMetrics == nil { + return + } + s.expiryAlertMetrics.RecordExpiryAlert(channel, threshold, result) +} + +// RecordExpiryAlertDeduped is the public hook RenewalService uses to report +// (channel, threshold, "deduped") — dedup happens before +// SendThresholdAlertOnChannel runs, so the call site is in the caller, not +// the dispatch helper. Kept on NotificationService rather than exposed on +// the recorder directly so callers don't need to know whether the recorder +// is wired. +func (s *NotificationService) RecordExpiryAlertDeduped(channel string, threshold int) { + s.recordExpiryAlert(channel, threshold, "deduped") } // Notifier defines the interface for notification channels (email, Slack, webhooks, etc.). @@ -94,9 +141,48 @@ func (s *NotificationService) SendExpirationWarning(ctx context.Context, cert *d return s.SendThresholdAlert(ctx, cert, daysUntilExpiry, daysUntilExpiry) } -// SendThresholdAlert sends an expiration alert for a specific threshold (e.g., 30-day, 14-day, expired). -// The threshold parameter indicates which configured threshold triggered the alert. +// SendThresholdAlert sends an expiration alert for a specific threshold via +// the Email channel. 
Preserved for backwards-compat with non-policy callers +// (admin "send test alert" surfaces in the GUI, etc.); equivalent to +// SendThresholdAlertOnChannel(ctx, cert, days, threshold, +// domain.NotificationChannelEmail). +// +// Policy-driven dispatch in RenewalService.sendThresholdAlerts uses +// SendThresholdAlertOnChannel directly with the channel resolved from the +// per-policy AlertChannels matrix. Rank 4 of the 2026-05-03 Infisical +// deep-research deliverable. func (s *NotificationService) SendThresholdAlert(ctx context.Context, cert *domain.ManagedCertificate, daysUntilExpiry int, threshold int) error { + return s.SendThresholdAlertOnChannel(ctx, cert, daysUntilExpiry, threshold, domain.NotificationChannelEmail) +} + +// SendThresholdAlertOnChannel sends an expiration alert for a specific +// (cert, threshold, channel) triple. The channel must be one of the +// closed-enum NotificationChannel values; off-enum channels surface as a +// failure metric increment + ERROR log + a wrapped error so the caller can +// react (typically: log and continue with the next channel in the +// policy's tier list — see RenewalService.sendThresholdAlerts). +// +// The notification record is persisted with the channel field set to the +// requested value, and the message body carries the [threshold:N] tag for +// dedup at HasThresholdNotification's substring filter. Combined with the +// repository.NotificationFilter.Channel field, this gives us per-(cert, +// threshold, channel) dedup so a transient PagerDuty 5xx today does NOT +// suppress today's Slack delivery and tomorrow's PagerDuty retry will +// still fire. +// +// Result is reported to expiryAlertMetrics (when wired): "success" on +// successful send, "failure" on send error or persistence error. +// "deduped" results are reported by the caller (sendThresholdAlerts) since +// dedup happens before this method runs. 
+func (s *NotificationService) SendThresholdAlertOnChannel( + ctx context.Context, cert *domain.ManagedCertificate, daysUntilExpiry int, + threshold int, channel domain.NotificationChannel, +) error { + if !domain.IsValidNotificationChannel(string(channel)) { + s.recordExpiryAlert(string(channel), threshold, "failure") + return fmt.Errorf("invalid notification channel %q for threshold %d", channel, threshold) + } + var body string if threshold <= 0 { body = fmt.Sprintf( @@ -110,12 +196,11 @@ func (s *NotificationService) SendThresholdAlert(ctx context.Context, cert *doma ) } - // Create notification record — resolve owner email if possible notif := &domain.NotificationEvent{ ID: generateID("notif"), CertificateID: &cert.ID, Type: domain.NotificationTypeExpirationWarning, - Channel: domain.NotificationChannelEmail, + Channel: channel, Recipient: s.resolveRecipient(ctx, cert.OwnerID), Message: body, Status: "pending", @@ -123,20 +208,52 @@ func (s *NotificationService) SendThresholdAlert(ctx context.Context, cert *doma } if err := s.notifRepo.Create(ctx, notif); err != nil { + s.recordExpiryAlert(string(channel), threshold, "failure") return fmt.Errorf("failed to create notification: %w", err) } - // Attempt immediate send - return s.sendNotification(ctx, notif) + if err := s.sendNotification(ctx, notif); err != nil { + s.recordExpiryAlert(string(channel), threshold, "failure") + return err + } + s.recordExpiryAlert(string(channel), threshold, "success") + return nil } -// HasThresholdNotification checks whether an expiration warning has already been sent -// for a specific certificate and threshold combination. Used for deduplication. +// HasThresholdNotification checks whether an expiration warning has already +// been sent for a specific (cert, threshold) pair via the Email channel. +// Preserved for backwards-compat. Equivalent to +// HasThresholdNotificationOnChannel(ctx, certID, threshold, "Email"). 
+// +// New callers driven by the per-policy channel matrix should use +// HasThresholdNotificationOnChannel directly with the explicit channel — +// see RenewalService.sendThresholdAlerts. func (s *NotificationService) HasThresholdNotification(ctx context.Context, certID string, threshold int) (bool, error) { + return s.HasThresholdNotificationOnChannel(ctx, certID, threshold, domain.NotificationChannelEmail) +} + +// HasThresholdNotificationOnChannel reports whether an ExpirationWarning +// notification has already been persisted for a specific (cert, threshold, +// channel) triple. Used to dedupe per-channel fan-out so a successful +// PagerDuty page today doesn't fire again tomorrow when the renewal loop +// re-checks the same threshold (and so a transient PagerDuty 5xx today +// doesn't suppress tomorrow's successful retry). +// +// The match is on the substring "[threshold:N]" in the stored message body +// (the same dedup pattern used by HasThresholdNotification pre-2026-05-03) +// AND the channel column. Both filters apply; a match requires both. +// +// channel == "" preserves the legacy (cert, threshold) dedup for the same +// reason HasThresholdNotification kept its old shape — admin-surface +// callers still need that behaviour. 
+func (s *NotificationService) HasThresholdNotificationOnChannel( + ctx context.Context, certID string, threshold int, channel domain.NotificationChannel, +) (bool, error) { filter := &repository.NotificationFilter{ CertificateID: certID, Type: string(domain.NotificationTypeExpirationWarning), MessageLike: fmt.Sprintf("%%[threshold:%d]%%", threshold), + Channel: string(channel), PerPage: 1, } diff --git a/internal/service/renewal.go b/internal/service/renewal.go index e52eeef..adba774 100644 --- a/internal/service/renewal.go +++ b/internal/service/renewal.go @@ -200,8 +200,16 @@ func (s *RenewalService) CheckExpiringCertificates(ctx context.Context) error { // Update certificate status based on expiry s.updateCertExpiryStatus(ctx, cert, daysUntil) - // Send threshold-based alerts with deduplication - s.sendThresholdAlerts(ctx, cert, int(daysUntil), thresholds) + // Send threshold-based alerts with per-channel deduplication. The + // policy pointer (nil-safe) drives the per-(threshold) channel + // matrix; nil policy or empty AlertChannels falls through to the + // back-compat Email-only default. Rank 4 of the 2026-05-03 + // Infisical deep-research deliverable. + var policyPtr *domain.RenewalPolicy + if cert.RenewalPolicyID != "" { + policyPtr = policyCache[cert.RenewalPolicyID] + } + s.sendThresholdAlerts(ctx, cert, int(daysUntil), thresholds, policyPtr) // Only create renewal job if an issuer connector is registered for this cert's issuer connector, hasIssuer := s.issuerRegistry.Get(cert.IssuerID) @@ -289,40 +297,138 @@ func (s *RenewalService) CheckExpiringCertificates(ctx context.Context) error { return nil } -// sendThresholdAlerts sends deduplicated expiration notifications based on configured thresholds. -// For each threshold that the certificate has crossed (e.g., ≤30 days, ≤14 days), it checks -// whether a notification for that threshold was already sent. Only new threshold crossings -// trigger notifications. 
-func (s *RenewalService) sendThresholdAlerts(ctx context.Context, cert *domain.ManagedCertificate, daysUntil int, thresholds []int) { +// sendThresholdAlerts sends deduplicated expiration notifications based on +// configured thresholds AND the per-policy channel matrix. For each +// threshold that the certificate has crossed (e.g., ≤30 days, ≤14 days), +// the dispatch loop: +// +// 1. Resolves the threshold's severity tier from the policy's +// AlertSeverityMap (or DefaultAlertSeverityMap if unset / off-map). +// 2. Looks up the channel set for that tier in the policy's AlertChannels +// (or DefaultAlertChannels — Email-only — if unset / empty). +// 3. For each resolved channel, defensively re-validates against the +// closed-enum NotificationChannel set (off-enum values silently drop +// with an audit row so an operator can grep + fix the typo without +// us silently dynamic-cardinality-growing the Prometheus counter). +// 4. Per-(cert, threshold, channel) dedup via +// HasThresholdNotificationOnChannel — a successful PagerDuty page +// yesterday won't fire again today, but a transient PagerDuty 5xx +// today does NOT suppress today's Slack and tomorrow's PagerDuty +// retry will still fire (the failed row stays "failed" in the DB, +// not "sent"). +// 5. SendThresholdAlertOnChannel persists the notification row (channel +// column populated), reports the metric, and dispatches. +// 6. Per-channel audit row so an operator can SQL-grep +// audit_events WHERE event_type='expiration_alert_sent' +// AND metadata->>'channel' = 'PagerDuty' to answer "did the on-call +// team get paged?". +// +// Rank 4 of the 2026-05-03 Infisical deep-research deliverable +// (cowork/infisical-deep-research-results.md Part 5). The policy +// argument is nil-safe — a cert with no RenewalPolicy attached gets the +// back-compat Email-only default matrix. 
+func (s *RenewalService) sendThresholdAlerts( + ctx context.Context, cert *domain.ManagedCertificate, daysUntil int, + thresholds []int, policy *domain.RenewalPolicy, +) { + channelMatrix := domain.DefaultAlertChannels() + if policy != nil { + channelMatrix = policy.EffectiveAlertChannels() + } + for _, threshold := range thresholds { // Only alert if the cert has crossed this threshold (days remaining ≤ threshold) if daysUntil > threshold { continue } - // Check if we already sent a notification for this threshold (deduplication) - alreadySent, err := s.notificationSvc.HasThresholdNotification(ctx, cert.ID, threshold) - if err != nil { - slog.Error("failed to check notification dedup", "cert_id", cert.ID, "threshold", threshold, "error", err) - continue + tier := domain.AlertSeverityInformational + if policy != nil { + tier = policy.EffectiveAlertSeverity(threshold) + } else if t, ok := domain.DefaultAlertSeverityMap()[threshold]; ok { + tier = t } - if alreadySent { + + // Defensive: an unknown tier (operator typo that survived + // validation, or a future tier name added in a later schema) + // drops to "informational" so we still alert on SOMETHING + // rather than silently swallowing the threshold. + if !domain.IsValidAlertSeverityTier(tier) { + tier = domain.AlertSeverityInformational + } + + channels := channelMatrix[tier] + if len(channels) == 0 { + // Operator opted out of this tier (or matrix has no entry + // for the tier). Skip silently — record-empty audit row to + // surface the opt-out in the audit log. 
+ _ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem, + "expiration_alert_skipped_no_channels", "certificate", cert.ID, + map[string]interface{}{ + "threshold_days": threshold, + "days_until_expiry": daysUntil, + "severity_tier": tier, + }) continue } - // Send the threshold alert - if err := s.notificationSvc.SendThresholdAlert(ctx, cert, daysUntil, threshold); err != nil { - slog.Error("failed to send threshold alert for cert", "cert_id", cert.ID, "threshold", threshold, "error", err) - } + for _, ch := range channels { + // Defensive validation: the policy validation path rejects + // off-enum values at write time, but a stored row could + // drift across a schema change. Drop off-enum values here + // rather than letting them through to a dispatch site that + // would either fail the Send call or grow Prometheus + // cardinality. Audit the drop so operators see the typo. + if !domain.IsValidNotificationChannel(ch) { + _ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem, + "expiration_alert_skipped_invalid_channel", "certificate", cert.ID, + map[string]interface{}{ + "threshold_days": threshold, + "days_until_expiry": daysUntil, + "severity_tier": tier, + "invalid_channel": ch, + }) + continue + } - // Record audit event for the alert - if auditErr := s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem, - "expiration_alert_sent", "certificate", cert.ID, - map[string]interface{}{ - "threshold_days": threshold, - "days_until_expiry": daysUntil, - }); auditErr != nil { - slog.Error("failed to record audit event", "error", auditErr) + channel := domain.NotificationChannel(ch) + alreadySent, err := s.notificationSvc.HasThresholdNotificationOnChannel( + ctx, cert.ID, threshold, channel, + ) + if err != nil { + slog.Error("failed to check notification dedup", + "cert_id", cert.ID, "threshold", threshold, + "channel", ch, "error", err) + continue + } + if alreadySent { + s.notificationSvc.RecordExpiryAlertDeduped(ch, 
threshold) + continue + } + + if err := s.notificationSvc.SendThresholdAlertOnChannel( + ctx, cert, daysUntil, threshold, channel, + ); err != nil { + slog.Error("failed to send threshold alert", + "cert_id", cert.ID, "threshold", threshold, + "channel", ch, "error", err) + // continue — other channels still fire + } + + // Per-(cert, threshold, channel) audit row. Operators alert + // on the channel-labelled row to confirm a specific pager + // went out. + if auditErr := s.auditService.RecordEvent(ctx, "system", + domain.ActorTypeSystem, "expiration_alert_sent", + "certificate", cert.ID, + map[string]interface{}{ + "threshold_days": threshold, + "days_until_expiry": daysUntil, + "channel": ch, + "severity_tier": tier, + }); auditErr != nil { + slog.Error("failed to record audit event", "error", auditErr) + } } } } diff --git a/internal/service/renewal_expiry_alerts_test.go b/internal/service/renewal_expiry_alerts_test.go new file mode 100644 index 0000000..e0bd806 --- /dev/null +++ b/internal/service/renewal_expiry_alerts_test.go @@ -0,0 +1,654 @@ +package service + +// Rank 4 of the 2026-05-03 Infisical deep-research deliverable +// (cowork/infisical-deep-research-results.md Part 5). Pins every leg of +// the per-policy multi-channel expiry-alert fan-out matrix: +// +// 1. Default matrix → Email-only at every tier (back-compat). +// 2. Per-tier fan-out — informational/warning/critical each route to +// a different channel set; cert at 0 days remaining crosses all +// four canonical thresholds; assert the exact recipient calls per +// channel. +// 3. Per-(cert, threshold, channel) dedup — second loop tick produces +// zero sends; deduped counter increments instead. +// 4. One-channel fails → others still fire; failure metric increments; +// success metric increments for the channels that succeeded. +// 5. Off-enum channel typo dropped at dispatch + audit-row trail. +// 6. 
Metric counter increments for every (channel, threshold, result) +// combination the loop produces. +// 7. Nil policy → default matrix (cert with no RenewalPolicy +// attached). +// 8. Operator opt-out of a tier (empty list) — that tier fires zero +// alerts; other tiers unaffected. + +import ( + "context" + "errors" + "log/slog" + "sync" + "testing" + "time" + + "github.com/shankar0123/certctl/internal/domain" +) + +// channelMockNotifier records (recipient, subject, body) per Send call. +// Replaces the simple mockNotifier from testutil_test.go for tests that +// need to verify which channel got which message — channelMockNotifier +// stamps every recorded message with its channel name so tests can +// distinguish Slack-vs-PagerDuty-vs-Email after a single fan-out. +type channelMockNotifier struct { + mu sync.Mutex + channel string + messages []channelNotifierMsg + sendErr error +} + +type channelNotifierMsg struct { + Channel string + Recipient string + Subject string + Body string +} + +func newChannelMockNotifier(channel string) *channelMockNotifier { + return &channelMockNotifier{channel: channel} +} + +func (m *channelMockNotifier) Send(ctx context.Context, recipient string, subject string, body string) error { + m.mu.Lock() + defer m.mu.Unlock() + if m.sendErr != nil { + return m.sendErr + } + m.messages = append(m.messages, channelNotifierMsg{ + Channel: m.channel, + Recipient: recipient, + Subject: subject, + Body: body, + }) + return nil +} + +func (m *channelMockNotifier) Channel() string { return m.channel } + +func (m *channelMockNotifier) count() int { + m.mu.Lock() + defer m.mu.Unlock() + return len(m.messages) +} + +// matrixFixture wires the full set of objects each per-tier-matrix test +// needs — six channel-aware notifiers, the metric recorder, the +// notification service, and the renewal service. Tests vary only the +// policy and the cert. 
+type matrixFixture struct { + notifSvc *NotificationService + metrics *ExpiryAlertMetrics + rs *RenewalService + notifs map[string]*channelMockNotifier + notifRepo *mockNotifRepo + policyRepo *mockRenewalPolicyRepo + certRepo *mockCertRepo + auditRepo *mockAuditRepo +} + +func newMatrixFixture(t *testing.T) *matrixFixture { + t.Helper() + + notifs := map[string]*channelMockNotifier{ + "Email": newChannelMockNotifier("Email"), + "Slack": newChannelMockNotifier("Slack"), + "Teams": newChannelMockNotifier("Teams"), + "PagerDuty": newChannelMockNotifier("PagerDuty"), + "OpsGenie": newChannelMockNotifier("OpsGenie"), + "Webhook": newChannelMockNotifier("Webhook"), + } + + registry := map[string]Notifier{} + for k, n := range notifs { + registry[k] = n + } + + notifRepo := newMockNotificationRepository() + notifSvc := NewNotificationService(notifRepo, registry) + metrics := NewExpiryAlertMetrics() + notifSvc.SetExpiryAlertMetrics(metrics) + + certRepo := newMockCertificateRepository() + jobRepo := newMockJobRepository() + policyRepo := newMockRenewalPolicyRepository() + auditRepo := newMockAuditRepository() + auditSvc := NewAuditService(auditRepo) + + issuerRegistry := NewIssuerRegistry(slog.Default()) + issuerRegistry.Set("iss-test", &mockIssuerConnector{}) + + rs := NewRenewalService(certRepo, jobRepo, policyRepo, nil, auditSvc, notifSvc, issuerRegistry, "server") + + return &matrixFixture{ + notifSvc: notifSvc, + metrics: metrics, + rs: rs, + notifs: notifs, + notifRepo: notifRepo, + policyRepo: policyRepo, + certRepo: certRepo, + auditRepo: auditRepo, + } +} + +func newExpiringCert(id string, daysFromNow int, policyID string) *domain.ManagedCertificate { + return &domain.ManagedCertificate{ + ID: id, + Name: "Test Cert " + id, + CommonName: id + ".example.com", + SANs: []string{}, + OwnerID: "owner-1", + TeamID: "team-1", + IssuerID: "iss-test", + RenewalPolicyID: policyID, + Status: domain.CertificateStatusActive, + ExpiresAt: time.Now().AddDate(0, 0, daysFromNow), 
+ Tags: map[string]string{}, + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } +} + +// totalEntries sums Count across every snapshot entry that matches the +// given filter func. Useful for "all-success", "all-failure" assertions +// without listing every (channel, threshold) tuple. +func totalEntries(metrics *ExpiryAlertMetrics, want func(ExpiryAlertSnapshotEntry) bool) uint64 { + var sum uint64 + for _, e := range metrics.SnapshotExpiryAlerts() { + if want(e) { + sum += e.Count + } + } + return sum +} + +// TestExpiryAlerts_DefaultMatrix_EmailOnly pins the back-compat +// contract: a policy with no AlertChannels matrix → the runtime falls +// through to DefaultAlertChannels (Email-only at every tier). +// PagerDuty / Slack / Teams / OpsGenie / Webhook receive ZERO alerts +// regardless of how many thresholds the cert has crossed. +func TestExpiryAlerts_DefaultMatrix_EmailOnly(t *testing.T) { + ctx := context.Background() + f := newMatrixFixture(t) + + // Policy with no AlertChannels — fall through to default. 
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-default-matrix",
+		Name:                "Default Matrix",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{30, 14, 7, 0},
+		// AlertChannels intentionally nil
+		// AlertSeverityMap intentionally nil
+		CreatedAt: time.Now(),
+		UpdatedAt: time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-default", 0, "rp-default-matrix")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	if got := f.notifs["Email"].count(); got != 4 {
+		t.Errorf("expected 4 Email alerts (one per threshold), got %d", got)
+	}
+	for _, ch := range []string{"Slack", "Teams", "PagerDuty", "OpsGenie", "Webhook"} {
+		if got := f.notifs[ch].count(); got != 0 {
+			t.Errorf("expected 0 %s alerts in default-matrix mode, got %d", ch, got)
+		}
+	}
+}
+
+// TestExpiryAlerts_PerTierFanOut pins the operator-supplied matrix:
+//
+//	informational → [Slack]
+//	warning       → [Slack, Email]
+//	critical      → [PagerDuty, OpsGenie, Email]
+//
+// With the canonical 30/14/7/0 thresholds and a cert at 0 days
+// remaining (crosses all four), the dispatch loop should produce:
+//
+//	Slack:     3 (informational T-30, warning T-14, warning T-7)
+//	Email:     3 (warning T-14, warning T-7, critical T-0)
+//	PagerDuty: 1 (critical T-0 only)
+//	OpsGenie:  1 (critical T-0 only)
+//	Teams:     0
+//	Webhook:   0
+func TestExpiryAlerts_PerTierFanOut(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-fanout",
+		Name:                "Fan-out Matrix",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{30, 14, 7, 0},
+		AlertChannels: map[string][]string{
+			domain.AlertSeverityInformational: {"Slack"},
+			domain.AlertSeverityWarning:       {"Slack", "Email"},
+			domain.AlertSeverityCritical:      {"PagerDuty", "OpsGenie", "Email"},
+		},
+		// AlertSeverityMap nil → falls through to DefaultAlertSeverityMap
+		// (30→informational, 14→warning, 7→warning, 0→critical) which is
+		// what we want here.
+		CreatedAt: time.Now(),
+		UpdatedAt: time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-fanout", 0, "rp-fanout")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	// Per-channel totals summed over all four thresholds; these are the
+	// same numbers as the table in this test's doc comment.
+	expected := map[string]int{
+		"Slack":     3,
+		"Email":     3,
+		"PagerDuty": 1,
+		"OpsGenie":  1,
+		"Teams":     0,
+		"Webhook":   0,
+	}
+	for ch, want := range expected {
+		if got := f.notifs[ch].count(); got != want {
+			t.Errorf("channel %s: expected %d alerts, got %d", ch, want, got)
+		}
+	}
+
+	// Spot-check the metric: PagerDuty should have exactly one
+	// {threshold=0, result=success} entry.
+	pdSuccess := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
+		return e.Channel == "PagerDuty" && e.Threshold == 0 && e.Result == "success"
+	})
+	if pdSuccess != 1 {
+		t.Errorf("expected exactly 1 PagerDuty success at threshold=0, got %d", pdSuccess)
+	}
+}
+
+// TestExpiryAlerts_PerChannelDedup pins that running the loop twice in
+// a row at the same daysUntil produces ZERO new sends — every
+// (cert, threshold, channel) row is in persistence already, so each
+// channel deduplicates.
+func TestExpiryAlerts_PerChannelDedup(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-dedup",
+		Name:                "Dedup Test",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{30, 14, 7, 0},
+		AlertChannels: map[string][]string{
+			domain.AlertSeverityInformational: {"Slack"},
+			domain.AlertSeverityWarning:       {"Email"},
+			domain.AlertSeverityCritical:      {"PagerDuty"},
+		},
+		CreatedAt: time.Now(),
+		UpdatedAt: time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-dedup", 0, "rp-dedup")
+	f.certRepo.AddCert(cert)
+
+	// First pass — every threshold should fire.
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("first CheckExpiringCertificates: %v", err)
+	}
+	totalAfterFirst := f.notifs["Slack"].count() + f.notifs["Email"].count() + f.notifs["PagerDuty"].count()
+	if totalAfterFirst == 0 {
+		t.Fatal("first pass produced zero alerts; matrix wiring broken")
+	}
+
+	// Reset the cert's RenewalInProgress status so the second pass
+	// re-evaluates the thresholds (CheckExpiringCertificates skips
+	// RenewalInProgress certs after the first pass).
+	//
+	// The reset must succeed: if it silently failed, the second pass
+	// would skip the cert, the "no new sends" check below would pass
+	// vacuously, and the failure would surface only as a confusing
+	// missing-deduped-counter error.
+	cert.Status = domain.CertificateStatusActive
+	if err := f.certRepo.Update(ctx, cert); err != nil {
+		t.Fatalf("resetting cert status before second pass: %v", err)
+	}
+
+	// Second pass — every (cert, threshold, channel) row already in
+	// persistence; expect ZERO new sends.
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("second CheckExpiringCertificates: %v", err)
+	}
+	totalAfterSecond := f.notifs["Slack"].count() + f.notifs["Email"].count() + f.notifs["PagerDuty"].count()
+	if totalAfterSecond != totalAfterFirst {
+		t.Errorf("dedup failed: total alerts grew from %d to %d on second pass", totalAfterFirst, totalAfterSecond)
+	}
+
+	// Deduped counter should be non-zero.
+	dedupedCount := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
+		return e.Result == "deduped"
+	})
+	if dedupedCount == 0 {
+		t.Errorf("expected deduped counter to increment on second pass; got 0")
+	}
+}
+
+// TestExpiryAlerts_OneChannelFails_OthersStillFire pins that one
+// channel's failure does NOT suppress the others. PagerDuty rejects
+// every send; Slack and Email succeed; the dispatch loop reports a
+// failure-metric increment for PagerDuty, success for the others, and
+// keeps the other channels' deliveries.
+func TestExpiryAlerts_OneChannelFails_OthersStillFire(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	// PagerDuty mock returns error on every Send.
+	f.notifs["PagerDuty"].sendErr = errors.New("pagerduty 503: incident api down")
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-pdfail",
+		Name:                "PagerDuty Fail",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{0},
+		AlertChannels: map[string][]string{
+			domain.AlertSeverityCritical: {"PagerDuty", "Slack", "Email"},
+		},
+		AlertSeverityMap: map[int]string{0: domain.AlertSeverityCritical},
+		CreatedAt:        time.Now(),
+		UpdatedAt:        time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-pdfail", 0, "rp-pdfail")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	// Slack and Email got their messages.
+	if got := f.notifs["Slack"].count(); got != 1 {
+		t.Errorf("Slack expected 1 message even though PagerDuty failed, got %d", got)
+	}
+	if got := f.notifs["Email"].count(); got != 1 {
+		t.Errorf("Email expected 1 message even though PagerDuty failed, got %d", got)
+	}
+	if got := f.notifs["PagerDuty"].count(); got != 0 {
+		t.Errorf("PagerDuty failed; expected 0 stored messages, got %d", got)
+	}
+
+	// Metric: PagerDuty should record failure; Slack + Email success.
+	pdFailure := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
+		return e.Channel == "PagerDuty" && e.Result == "failure"
+	})
+	if pdFailure != 1 {
+		t.Errorf("expected 1 PagerDuty failure metric increment, got %d", pdFailure)
+	}
+	slackSuccess := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
+		return e.Channel == "Slack" && e.Result == "success"
+	})
+	if slackSuccess != 1 {
+		t.Errorf("expected 1 Slack success metric increment, got %d", slackSuccess)
+	}
+	// Email is asserted as well so the metric checks cover every channel
+	// the comment above names, not just Slack.
+	emailSuccess := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
+		return e.Channel == "Email" && e.Result == "success"
+	})
+	if emailSuccess != 1 {
+		t.Errorf("expected 1 Email success metric increment, got %d", emailSuccess)
+	}
+}
+
+// TestExpiryAlerts_OffEnumChannelDropped pins that an off-enum channel
+// (operator typo: "PagerD") is silently dropped at the dispatch site
+// without growing Prometheus cardinality. The drop is recorded in the
+// audit log so an operator can grep + fix.
+func TestExpiryAlerts_OffEnumChannelDropped(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-typo",
+		Name:                "Typo Test",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{0},
+		AlertChannels: map[string][]string{
+			// "PagerD" is a typo — the real channel name is "PagerDuty".
+			// Slack is valid; should still fire.
+			domain.AlertSeverityCritical: {"PagerD", "Slack"},
+		},
+		AlertSeverityMap: map[int]string{0: domain.AlertSeverityCritical},
+		CreatedAt:        time.Now(),
+		UpdatedAt:        time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-typo", 0, "rp-typo")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	// Slack still fires.
+	if got := f.notifs["Slack"].count(); got != 1 {
+		t.Errorf("Slack expected 1 message; off-enum sibling should not block it; got %d", got)
+	}
+
+	// Off-enum value never reached a notifier.
+	if got := f.notifs["PagerDuty"].count(); got != 0 {
+		t.Errorf("PagerDuty should be untouched (typo was 'PagerD'), got %d", got)
+	}
+
+	// The metric does NOT have a "PagerD" entry — closed-enum
+	// discipline keeps cardinality bounded.
+	for _, e := range f.metrics.SnapshotExpiryAlerts() {
+		if e.Channel == "PagerD" {
+			t.Errorf("metric grew on off-enum channel typo: entry=%+v", e)
+		}
+	}
+
+	// Audit log should record the drop. Look for the typed
+	// expiration_alert_skipped_invalid_channel event.
+	foundDropAudit := false
+	for _, ev := range f.auditRepo.Events {
+		if ev.Action == "expiration_alert_skipped_invalid_channel" {
+			foundDropAudit = true
+			break
+		}
+	}
+	if !foundDropAudit {
+		t.Errorf("expected expiration_alert_skipped_invalid_channel audit row for off-enum typo; not found")
+	}
+}
+
+// TestExpiryAlerts_MetricCounterIncrements pins that every
+// (channel, threshold, result) combination the dispatch loop produces
+// shows up in the snapshot. Three tiers fire on a single cert with
+// distinct channel sets per tier — the snapshot should carry one
+// entry per (channel, threshold, "success") triple.
+func TestExpiryAlerts_MetricCounterIncrements(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-metric",
+		Name:                "Metric Test",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{30, 14, 0},
+		AlertChannels: map[string][]string{
+			domain.AlertSeverityInformational: {"Slack"},
+			domain.AlertSeverityWarning:       {"Email"},
+			domain.AlertSeverityCritical:      {"PagerDuty"},
+		},
+		AlertSeverityMap: map[int]string{
+			30: domain.AlertSeverityInformational,
+			14: domain.AlertSeverityWarning,
+			0:  domain.AlertSeverityCritical,
+		},
+		CreatedAt: time.Now(),
+		UpdatedAt: time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-metric", 0, "rp-metric")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	snap := f.metrics.SnapshotExpiryAlerts()
+
+	// Expect three (channel, threshold, success) entries.
+	want := map[string]bool{
+		"Slack/30/success":    false,
+		"Email/14/success":    false,
+		"PagerDuty/0/success": false,
+	}
+	for _, e := range snap {
+		if e.Result != "success" {
+			continue
+		}
+		key := keyFromEntry(e)
+		if _, ok := want[key]; ok {
+			want[key] = true
+		}
+	}
+	for k, found := range want {
+		if !found {
+			t.Errorf("metric snapshot missing expected entry: %s", k)
+		}
+	}
+}
+
+// keyFromEntry renders a snapshot entry as "channel/threshold/result",
+// the key format used by the want-map in
+// TestExpiryAlerts_MetricCounterIncrements.
+func keyFromEntry(e ExpiryAlertSnapshotEntry) string {
+	return e.Channel + "/" + intStr(e.Threshold) + "/" + e.Result
+}
+
+// intStr renders i in base 10 without adding an import to this file.
+//
+// It works on the unsigned magnitude of i: negating the most negative
+// int overflows back to itself, which would leave a signed digit loop
+// with nothing to emit and produce a bare "-". Digits are written
+// right-to-left into a fixed buffer rather than prepended one at a
+// time, so no per-digit reallocation happens.
+func intStr(i int) string {
+	mag := uint64(i)
+	if i < 0 {
+		mag = -mag // unsigned negate yields |i|, including for the minimum int
+	}
+
+	// 20 bytes holds a sign plus the 19 digits of the largest
+	// 64-bit magnitude.
+	var buf [20]byte
+	pos := len(buf)
+	for {
+		pos--
+		buf[pos] = byte('0' + mag%10)
+		mag /= 10
+		if mag == 0 {
+			break
+		}
+	}
+	if i < 0 {
+		pos--
+		buf[pos] = '-'
+	}
+	return string(buf[pos:])
+}
+
+// TestExpiryAlerts_NilPolicy_FallsToDefault pins that a cert with no
+// RenewalPolicy attached (RenewalPolicyID == "") gets the default
+// Email-only matrix at every threshold tier. Same as
+// TestExpiryAlerts_DefaultMatrix_EmailOnly but with a missing policy
+// rather than a policy that has nil AlertChannels.
+func TestExpiryAlerts_NilPolicy_FallsToDefault(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	cert := newExpiringCert("mc-nopolicy", 0, "") // empty RenewalPolicyID
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	if got := f.notifs["Email"].count(); got != 4 {
+		t.Errorf("expected 4 Email alerts (default thresholds, default matrix), got %d", got)
+	}
+	for _, ch := range []string{"Slack", "Teams", "PagerDuty", "OpsGenie", "Webhook"} {
+		if got := f.notifs[ch].count(); got != 0 {
+			t.Errorf("expected 0 %s alerts when policy is missing, got %d", ch, got)
+		}
+	}
+}
+
+// TestExpiryAlerts_OperatorOptOutOfTier pins that an explicit empty
+// list at a tier causes the dispatch loop to fire ZERO alerts for
+// that tier, while other tiers continue to work. Operators use this
+// to opt out of T-30 informational alerts (e.g. "we don't want to
+// hear about a cert until it's a real warning").
+func TestExpiryAlerts_OperatorOptOutOfTier(t *testing.T) {
+	ctx := context.Background()
+	f := newMatrixFixture(t)
+
+	policy := &domain.RenewalPolicy{
+		ID:                  "rp-optout",
+		Name:                "Opt-out Test",
+		RenewalWindowDays:   30,
+		AutoRenew:           true,
+		MaxRetries:          3,
+		RetryInterval:       300,
+		AlertThresholdsDays: []int{30, 14, 0},
+		AlertChannels: map[string][]string{
+			// Operator opted out of informational entirely.
+			domain.AlertSeverityInformational: {},
+			domain.AlertSeverityWarning:       {"Email"},
+			domain.AlertSeverityCritical:      {"PagerDuty", "Email"},
+		},
+		AlertSeverityMap: map[int]string{
+			30: domain.AlertSeverityInformational,
+			14: domain.AlertSeverityWarning,
+			0:  domain.AlertSeverityCritical,
+		},
+		CreatedAt: time.Now(),
+		UpdatedAt: time.Now(),
+	}
+	f.policyRepo.AddPolicy(policy)
+
+	cert := newExpiringCert("mc-optout", 0, "rp-optout")
+	f.certRepo.AddCert(cert)
+
+	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
+		t.Fatalf("CheckExpiringCertificates: %v", err)
+	}
+
+	// Email: 1 warning (T-14) + 1 critical (T-0) = 2.
+	if got := f.notifs["Email"].count(); got != 2 {
+		t.Errorf("Email expected 2 alerts (warning + critical), got %d", got)
+	}
+	// PagerDuty: 1 critical only.
+	if got := f.notifs["PagerDuty"].count(); got != 1 {
+		t.Errorf("PagerDuty expected 1 alert (critical), got %d", got)
+	}
+	// Slack/Teams/OpsGenie/Webhook: 0.
+	for _, ch := range []string{"Slack", "Teams", "OpsGenie", "Webhook"} {
+		if got := f.notifs[ch].count(); got != 0 {
+			t.Errorf("expected 0 %s alerts (informational opt-out), got %d", ch, got)
+		}
+	}
+
+	// Audit row for the opt-out tier (informational @ threshold=30).
+	foundSkipAudit := false
+	for _, ev := range f.auditRepo.Events {
+		if ev.Action == "expiration_alert_skipped_no_channels" {
+			foundSkipAudit = true
+			break
+		}
+	}
+	if !foundSkipAudit {
+		t.Errorf("expected expiration_alert_skipped_no_channels audit row for opted-out tier; not found")
+	}
+}
diff --git a/internal/service/testutil_test.go b/internal/service/testutil_test.go
index 86af28d..e770cac 100644
--- a/internal/service/testutil_test.go
+++ b/internal/service/testutil_test.go
@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"errors"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -504,7 +505,43 @@ func (m *mockNotifRepo) List(ctx context.Context, filter *repository.Notificatio
 	if m.ListErr != nil {
 		return nil, m.ListErr
 	}
-	return m.Notifications, nil
+	if filter == nil {
+		out := make([]*domain.NotificationEvent, len(m.Notifications))
+		copy(out, m.Notifications)
+		return out, nil
+	}
+	// Apply each non-zero filter field. Mirror the postgres notification
+	// repo's WHERE-clause shape (CertificateID, Type, Status, Channel,
+	// MessageLike) so the multi-channel expiry-alert tests
+	// (renewal_expiry_alerts_test.go, Rank 4 of the 2026-05-03 Infisical
+	// deep-research deliverable) get the same per-(cert, threshold,
+	// channel) dedup behaviour they'd see in production. Pre-Rank 4 the
+	// mock returned all rows regardless of filter; legacy callers
+	// happened to work because their assertions were "any notification
+	// fired" rather than "this specific (cert,threshold,channel) one".
+	out := make([]*domain.NotificationEvent, 0, len(m.Notifications))
+	msgSubstring := strings.Trim(filter.MessageLike, "%")
+	for _, n := range m.Notifications {
+		if filter.CertificateID != "" {
+			if n.CertificateID == nil || *n.CertificateID != filter.CertificateID {
+				continue
+			}
+		}
+		if filter.Type != "" && string(n.Type) != filter.Type {
+			continue
+		}
+		if filter.Status != "" && n.Status != filter.Status {
+			continue
+		}
+		if filter.Channel != "" && string(n.Channel) != filter.Channel {
+			continue
+		}
+		if msgSubstring != "" && !strings.Contains(n.Message, msgSubstring) {
+			continue
+		}
+		out = append(out, n)
+	}
+	return out, nil
 }
 
 func (m *mockNotifRepo) UpdateStatus(ctx context.Context, id string, status string, sentAt time.Time) error {
diff --git a/migrations/000026_renewal_policy_channel_matrix.down.sql b/migrations/000026_renewal_policy_channel_matrix.down.sql
new file mode 100644
index 0000000..b2856a7
--- /dev/null
+++ b/migrations/000026_renewal_policy_channel_matrix.down.sql
@@ -0,0 +1,7 @@
+-- Down migration for 000026 — drop the per-policy channel-matrix
+-- columns. IF EXISTS makes this safe to apply on a database that
+-- was never upgraded (no-op).
+
+ALTER TABLE renewal_policies
+    DROP COLUMN IF EXISTS alert_channels,
+    DROP COLUMN IF EXISTS alert_severity_map;
diff --git a/migrations/000026_renewal_policy_channel_matrix.up.sql b/migrations/000026_renewal_policy_channel_matrix.up.sql
new file mode 100644
index 0000000..085cc4b
--- /dev/null
+++ b/migrations/000026_renewal_policy_channel_matrix.up.sql
@@ -0,0 +1,23 @@
+-- Rank 4 of the 2026-05-03 Infisical deep-research deliverable
+-- (cowork/infisical-deep-research-results.md Part 5). Adds the
+-- per-policy channel matrix that the multi-channel expiry-alert
+-- routing reads from. Two JSONB columns:
+--
+--   alert_channels     — map[severity_tier][]channel_name. Default
+--                        is '{}' so the runtime falls through to
+--                        domain.DefaultAlertChannels() (Email-only
+--                        across all tiers, the back-compat
+--                        behaviour).
+--   alert_severity_map — map[threshold_days]severity_tier. Default
+--                        is '{}' so the runtime falls through to
+--                        domain.DefaultAlertSeverityMap() (the
+--                        canonical 30/14/7/0 → informational/warning/
+--                        warning/critical mapping).
+--
+-- Both columns use IF NOT EXISTS so the migration is idempotent —
+-- safe to re-run on every certctl-server boot per the
+-- "Idempotent migrations" architecture decision in CLAUDE.md.
+
+ALTER TABLE renewal_policies
+    ADD COLUMN IF NOT EXISTS alert_channels     JSONB NOT NULL DEFAULT '{}'::jsonb,
+    ADD COLUMN IF NOT EXISTS alert_severity_map JSONB NOT NULL DEFAULT '{}'::jsonb;