Files
certctl/internal/service/renewal_expiry_alerts_test.go
shankar0123 75097909e9
2026-05-05 18:18:29 +00:00

655 lines
21 KiB
Go

package service
// Rank 4 of the 2026-05-03 Infisical deep-research deliverable
// (the project's deep-research deliverable, Part 5). Pins every leg of
// the per-policy multi-channel expiry-alert fan-out matrix:
//
// 1. Default matrix → Email-only at every tier (back-compat).
// 2. Per-tier fan-out — informational/warning/critical each route to
// a different channel set; cert at 0 days remaining crosses all
// four canonical thresholds; assert the exact recipient calls per
// channel.
// 3. Per-(cert, threshold, channel) dedup — second loop tick produces
// zero sends; deduped counter increments instead.
// 4. One-channel fails → others still fire; failure metric increments;
// success metric increments for the channels that succeeded.
// 5. Off-enum channel typo dropped at dispatch + audit-row trail.
// 6. Metric counter increments for every (channel, threshold, result)
// combination the loop produces.
// 7. Nil policy → default matrix (cert with no RenewalPolicy
// attached).
// 8. Operator opt-out of a tier (empty list) — that tier fires zero
// alerts; other tiers unaffected.
import (
	"context"
	"errors"
	"log/slog"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/certctl-io/certctl/internal/domain"
)
// channelNotifierMsg is a single message captured by a
// channelMockNotifier. Channel carries the name of the mock that
// recorded it, so messages gathered from several mocks after one
// fan-out can still be attributed to the channel that received them.
type channelNotifierMsg struct {
	Channel   string
	Recipient string
	Subject   string
	Body      string
}

// channelMockNotifier is a channel-aware notifier test double. Every
// successful Send is appended to messages, stamped with the mock's
// channel name, so a test can verify which channel got which message
// (Slack vs. PagerDuty vs. Email, ...).
//
// Setting sendErr makes every Send return that error without recording
// anything. mu guards messages and sendErr reads inside Send/count.
type channelMockNotifier struct {
	mu       sync.Mutex
	channel  string
	messages []channelNotifierMsg
	sendErr  error
}

// newChannelMockNotifier returns an empty mock bound to the given
// channel name.
func newChannelMockNotifier(channel string) *channelMockNotifier {
	mock := new(channelMockNotifier)
	mock.channel = channel
	return mock
}

// Send records (recipient, subject, body) under the mock's channel
// name and returns nil. When sendErr is set it returns that error
// instead and records nothing. ctx is accepted but not consulted.
func (m *channelMockNotifier) Send(ctx context.Context, recipient string, subject string, body string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if failure := m.sendErr; failure != nil {
		return failure
	}

	captured := channelNotifierMsg{
		Channel:   m.channel,
		Recipient: recipient,
		Subject:   subject,
		Body:      body,
	}
	m.messages = append(m.messages, captured)
	return nil
}

// Channel reports the channel name this mock was created with.
func (m *channelMockNotifier) Channel() string {
	return m.channel
}

// count returns how many messages have been recorded so far.
func (m *channelMockNotifier) count() int {
	m.mu.Lock()
	recorded := len(m.messages)
	m.mu.Unlock()
	return recorded
}
// matrixFixture wires the full set of objects each per-tier-matrix test
// needs — six channel-aware notifiers, the metric recorder, the
// notification service, and the renewal service. Tests vary only the
// policy and the cert.
type matrixFixture struct {
notifSvc *NotificationService
metrics *ExpiryAlertMetrics
rs *RenewalService
notifs map[string]*channelMockNotifier
notifRepo *mockNotifRepo
policyRepo *mockRenewalPolicyRepo
certRepo *mockCertRepo
auditRepo *mockAuditRepo
}
// newMatrixFixture builds a matrixFixture with six channel mocks
// (Email, Slack, Teams, PagerDuty, OpsGenie, Webhook) registered with
// a NotificationService that has an ExpiryAlertMetrics recorder
// attached, plus a RenewalService wired to fresh mock repositories and
// an issuer registry containing a single "iss-test" mock connector.
func newMatrixFixture(t *testing.T) *matrixFixture {
	t.Helper()

	channelNames := []string{"Email", "Slack", "Teams", "PagerDuty", "OpsGenie", "Webhook"}
	mocks := make(map[string]*channelMockNotifier, len(channelNames))
	registry := make(map[string]Notifier, len(channelNames))
	for _, name := range channelNames {
		mock := newChannelMockNotifier(name)
		// The same mock instance is exposed to the test and to the service.
		mocks[name] = mock
		registry[name] = mock
	}

	notificationRepo := newMockNotificationRepository()
	notificationSvc := NewNotificationService(notificationRepo, registry)
	recorder := NewExpiryAlertMetrics()
	notificationSvc.SetExpiryAlertMetrics(recorder)

	certificates := newMockCertificateRepository()
	jobs := newMockJobRepository()
	policies := newMockRenewalPolicyRepository()
	audits := newMockAuditRepository()
	auditSvc := NewAuditService(audits)

	issuers := NewIssuerRegistry(slog.Default())
	issuers.Set("iss-test", &mockIssuerConnector{})

	renewalSvc := NewRenewalService(certificates, jobs, policies, nil, auditSvc, notificationSvc, issuers, "server")

	fixture := &matrixFixture{
		notifSvc:   notificationSvc,
		metrics:    recorder,
		rs:         renewalSvc,
		notifs:     mocks,
		notifRepo:  notificationRepo,
		policyRepo: policies,
		certRepo:   certificates,
		auditRepo:  audits,
	}
	return fixture
}
// newExpiringCert returns an active managed certificate whose
// ExpiresAt lies daysFromNow calendar days from the current time.
// The name and common name are derived from id, the issuer is the
// fixture's "iss-test", and policyID (which may be empty) is stored as
// the RenewalPolicyID.
func newExpiringCert(id string, daysFromNow int, policyID string) *domain.ManagedCertificate {
	cert := new(domain.ManagedCertificate)

	cert.ID = id
	cert.Name = "Test Cert " + id
	cert.CommonName = id + ".example.com"
	cert.SANs = []string{}
	cert.OwnerID = "owner-1"
	cert.TeamID = "team-1"
	cert.IssuerID = "iss-test"
	cert.RenewalPolicyID = policyID
	cert.Status = domain.CertificateStatusActive
	cert.ExpiresAt = time.Now().AddDate(0, 0, daysFromNow)
	cert.Tags = map[string]string{}
	cert.CreatedAt = time.Now()
	cert.UpdatedAt = time.Now()

	return cert
}
// totalEntries adds up Count over every entry of the metrics snapshot
// for which want returns true. It lets a test assert on an aggregate
// ("all successes", "all failures for channel X") without enumerating
// each (channel, threshold) pair.
func totalEntries(metrics *ExpiryAlertMetrics, want func(ExpiryAlertSnapshotEntry) bool) uint64 {
	total := uint64(0)
	for _, entry := range metrics.SnapshotExpiryAlerts() {
		if !want(entry) {
			continue
		}
		total += entry.Count
	}
	return total
}
// TestExpiryAlerts_DefaultMatrix_EmailOnly pins the back-compat
// contract for a policy that carries thresholds but no AlertChannels
// matrix: the test expects the runtime to fall back to the default
// matrix, i.e. Email at every tier. With thresholds 30/14/7/0 and a
// cert expiring today, Email must receive four alerts and every other
// channel must receive none.
func TestExpiryAlerts_DefaultMatrix_EmailOnly(t *testing.T) {
	fx := newMatrixFixture(t)

	// AlertChannels and AlertSeverityMap are deliberately left unset so
	// the defaults apply.
	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-default-matrix",
		Name:                "Default Matrix",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{30, 14, 7, 0},
		CreatedAt:           time.Now(),
		UpdatedAt:           time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-default", 0, "rp-default-matrix"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	emailAlerts := fx.notifs["Email"].count()
	if emailAlerts != 4 {
		t.Errorf("expected 4 Email alerts (one per threshold), got %d", emailAlerts)
	}

	silentChannels := []string{"Slack", "Teams", "PagerDuty", "OpsGenie", "Webhook"}
	for _, name := range silentChannels {
		n := fx.notifs[name].count()
		if n == 0 {
			continue
		}
		t.Errorf("expected 0 %s alerts in default-matrix mode, got %d", name, n)
	}
}
// TestExpiryAlerts_PerTierFanOut pins routing for an operator-supplied
// matrix:
//
//	informational → [Slack]
//	warning       → [Slack, Email]
//	critical      → [PagerDuty, OpsGenie, Email]
//
// The policy leaves AlertSeverityMap unset; the expectations below
// assume the default severity mapping (30→informational, 14→warning,
// 7→warning, 0→critical). A cert at 0 days remaining crosses all four
// thresholds, so the expected per-channel totals are:
//
//	Slack:     3 (T-30, T-14, T-7)
//	Email:     3 (T-14, T-7, T-0)
//	PagerDuty: 1 (T-0)
//	OpsGenie:  1 (T-0)
//	Teams:     0
//	Webhook:   0
func TestExpiryAlerts_PerTierFanOut(t *testing.T) {
	fx := newMatrixFixture(t)

	matrix := map[string][]string{
		domain.AlertSeverityInformational: {"Slack"},
		domain.AlertSeverityWarning:       {"Slack", "Email"},
		domain.AlertSeverityCritical:      {"PagerDuty", "OpsGenie", "Email"},
	}
	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-fanout",
		Name:                "Fan-out Matrix",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{30, 14, 7, 0},
		AlertChannels:       matrix,
		CreatedAt:           time.Now(),
		UpdatedAt:           time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-fanout", 0, "rp-fanout"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	perChannel := []struct {
		channel string
		alerts  int
	}{
		{"Slack", 3},
		{"Email", 3},
		{"PagerDuty", 1},
		{"OpsGenie", 1},
		{"Teams", 0},
		{"Webhook", 0},
	}
	for _, tc := range perChannel {
		got := fx.notifs[tc.channel].count()
		if got != tc.alerts {
			t.Errorf("channel %s: expected %d alerts, got %d", tc.channel, tc.alerts, got)
		}
	}

	// Metric spot-check: exactly one PagerDuty success recorded at
	// threshold 0.
	isPagerDutySuccessAtZero := func(entry ExpiryAlertSnapshotEntry) bool {
		return entry.Channel == "PagerDuty" && entry.Threshold == 0 && entry.Result == "success"
	}
	if n := totalEntries(fx.metrics, isPagerDutySuccessAtZero); n != 1 {
		t.Errorf("expected exactly 1 PagerDuty success at threshold=0, got %d", n)
	}
}
// TestExpiryAlerts_PerChannelDedup pins per-(cert, threshold, channel)
// deduplication: running the expiry check twice in a row at the same
// days-until-expiry must not deliver anything new on the second pass,
// and the second pass must show up in the metrics as at least one
// "deduped" result.
func TestExpiryAlerts_PerChannelDedup(t *testing.T) {
	ctx := context.Background()
	f := newMatrixFixture(t)

	policy := &domain.RenewalPolicy{
		ID:                  "rp-dedup",
		Name:                "Dedup Test",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{30, 14, 7, 0},
		AlertChannels: map[string][]string{
			domain.AlertSeverityInformational: {"Slack"},
			domain.AlertSeverityWarning:       {"Email"},
			domain.AlertSeverityCritical:      {"PagerDuty"},
		},
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
	}
	f.policyRepo.AddPolicy(policy)

	cert := newExpiringCert("mc-dedup", 0, "rp-dedup")
	f.certRepo.AddCert(cert)

	// sentTotal sums deliveries over the three channels the matrix uses.
	sentTotal := func() int {
		return f.notifs["Slack"].count() + f.notifs["Email"].count() + f.notifs["PagerDuty"].count()
	}

	// First pass — every threshold should fire.
	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
		t.Fatalf("first CheckExpiringCertificates: %v", err)
	}
	totalAfterFirst := sentTotal()
	if totalAfterFirst == 0 {
		t.Fatal("first pass produced zero alerts; matrix wiring broken")
	}

	// Put the cert back to Active so the second pass re-evaluates the
	// thresholds (per the original test's note, the check skips certs
	// left in RenewalInProgress by the first pass). A failed reset
	// would invalidate everything asserted below, so stop here rather
	// than report a misleading dedup failure.
	cert.Status = domain.CertificateStatusActive
	if err := f.certRepo.Update(ctx, cert); err != nil {
		t.Fatalf("resetting cert status before second pass: %v", err)
	}

	// Second pass — every (cert, threshold, channel) row is already
	// persisted; expect zero new sends.
	if err := f.rs.CheckExpiringCertificates(ctx); err != nil {
		t.Fatalf("second CheckExpiringCertificates: %v", err)
	}
	if totalAfterSecond := sentTotal(); totalAfterSecond != totalAfterFirst {
		t.Errorf("dedup failed: total alerts grew from %d to %d on second pass", totalAfterFirst, totalAfterSecond)
	}

	// The suppressed sends must be visible as "deduped" metric results.
	dedupedCount := totalEntries(f.metrics, func(e ExpiryAlertSnapshotEntry) bool {
		return e.Result == "deduped"
	})
	if dedupedCount == 0 {
		t.Errorf("expected deduped counter to increment on second pass; got 0")
	}
}
// TestExpiryAlerts_OneChannelFails_OthersStillFire pins failure
// isolation between channels. The PagerDuty mock rejects every send
// while Slack and Email accept; the critical tier routes to all three.
// The test asserts that Slack and Email each still receive their one
// message, that PagerDuty stores nothing, and that the metrics show
// one PagerDuty "failure" and one Slack "success".
func TestExpiryAlerts_OneChannelFails_OthersStillFire(t *testing.T) {
	fx := newMatrixFixture(t)

	// Make every PagerDuty Send return an error.
	fx.notifs["PagerDuty"].sendErr = errors.New("pagerduty 503: incident api down")

	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-pdfail",
		Name:                "PagerDuty Fail",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{0},
		AlertChannels: map[string][]string{
			domain.AlertSeverityCritical: {"PagerDuty", "Slack", "Email"},
		},
		AlertSeverityMap: map[int]string{0: domain.AlertSeverityCritical},
		CreatedAt:        time.Now(),
		UpdatedAt:        time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-pdfail", 0, "rp-pdfail"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	// Deliveries: the healthy channels are unaffected by the failure.
	slackSent := fx.notifs["Slack"].count()
	if slackSent != 1 {
		t.Errorf("Slack expected 1 message even though PagerDuty failed, got %d", slackSent)
	}
	emailSent := fx.notifs["Email"].count()
	if emailSent != 1 {
		t.Errorf("Email expected 1 message even though PagerDuty failed, got %d", emailSent)
	}
	pagerDutySent := fx.notifs["PagerDuty"].count()
	if pagerDutySent != 0 {
		t.Errorf("PagerDuty failed; expected 0 stored messages, got %d", pagerDutySent)
	}

	// Metrics: one failure for PagerDuty, one success for Slack.
	matches := func(channel, result string) func(ExpiryAlertSnapshotEntry) bool {
		return func(entry ExpiryAlertSnapshotEntry) bool {
			return entry.Channel == channel && entry.Result == result
		}
	}
	if n := totalEntries(fx.metrics, matches("PagerDuty", "failure")); n != 1 {
		t.Errorf("expected 1 PagerDuty failure metric increment, got %d", n)
	}
	if n := totalEntries(fx.metrics, matches("Slack", "success")); n != 1 {
		t.Errorf("expected 1 Slack success metric increment, got %d", n)
	}
}
// TestExpiryAlerts_OffEnumChannelDropped pins the handling of a channel
// name outside the known set (an operator typo, "PagerD" instead of
// "PagerDuty"). The test asserts that the valid sibling channel still
// fires, that no notifier is invoked for the typo, that the metrics
// snapshot gains no "PagerD" entry (keeping label cardinality bounded),
// and that an expiration_alert_skipped_invalid_channel audit event is
// recorded so the operator can find and fix the typo.
func TestExpiryAlerts_OffEnumChannelDropped(t *testing.T) {
	fx := newMatrixFixture(t)

	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-typo",
		Name:                "Typo Test",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{0},
		AlertChannels: map[string][]string{
			// "PagerD" is the typo under test; "Slack" is valid and
			// must still be delivered.
			domain.AlertSeverityCritical: {"PagerD", "Slack"},
		},
		AlertSeverityMap: map[int]string{0: domain.AlertSeverityCritical},
		CreatedAt:        time.Now(),
		UpdatedAt:        time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-typo", 0, "rp-typo"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	// The valid channel is delivered.
	slackSent := fx.notifs["Slack"].count()
	if slackSent != 1 {
		t.Errorf("Slack expected 1 message; off-enum sibling should not block it; got %d", slackSent)
	}

	// The typo must not be "corrected" into the real PagerDuty channel.
	pagerDutySent := fx.notifs["PagerDuty"].count()
	if pagerDutySent != 0 {
		t.Errorf("PagerDuty should be untouched (typo was 'PagerD'), got %d", pagerDutySent)
	}

	// No metric entry may be keyed by the off-enum name.
	for _, entry := range fx.metrics.SnapshotExpiryAlerts() {
		if entry.Channel != "PagerD" {
			continue
		}
		t.Errorf("metric grew on off-enum channel typo: entry=%+v", entry)
	}

	// The drop must leave an audit trail.
	var dropAudited bool
	for _, event := range fx.auditRepo.Events {
		if event.Action == "expiration_alert_skipped_invalid_channel" {
			dropAudited = true
			break
		}
	}
	if !dropAudited {
		t.Errorf("expected expiration_alert_skipped_invalid_channel audit row for off-enum typo; not found")
	}
}
// TestExpiryAlerts_MetricCounterIncrements pins that the metrics
// snapshot contains a "success" entry for each (channel, threshold)
// pair the dispatch produces. Three tiers fire on one cert, each routed
// to a different channel, so the snapshot must include
// Slack/30/success, Email/14/success and PagerDuty/0/success.
func TestExpiryAlerts_MetricCounterIncrements(t *testing.T) {
	fx := newMatrixFixture(t)

	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-metric",
		Name:                "Metric Test",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{30, 14, 0},
		AlertChannels: map[string][]string{
			domain.AlertSeverityInformational: {"Slack"},
			domain.AlertSeverityWarning:       {"Email"},
			domain.AlertSeverityCritical:      {"PagerDuty"},
		},
		AlertSeverityMap: map[int]string{
			30: domain.AlertSeverityInformational,
			14: domain.AlertSeverityWarning,
			0:  domain.AlertSeverityCritical,
		},
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-metric", 0, "rp-metric"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	// Collect the key of every successful entry in the snapshot.
	observed := make(map[string]bool)
	for _, entry := range fx.metrics.SnapshotExpiryAlerts() {
		if entry.Result == "success" {
			observed[keyFromEntry(entry)] = true
		}
	}

	// Each expected (channel, threshold, success) triple must be there.
	expectedKeys := []string{
		"Slack/30/success",
		"Email/14/success",
		"PagerDuty/0/success",
	}
	for _, key := range expectedKeys {
		if observed[key] {
			continue
		}
		t.Errorf("metric snapshot missing expected entry: %s", key)
	}
}
// keyFromEntry renders a snapshot entry as "<channel>/<threshold>/<result>",
// the key format the metric tests use to look up expected entries.
func keyFromEntry(e ExpiryAlertSnapshotEntry) string {
	threshold := intStr(e.Threshold)
	key := e.Channel + "/" + threshold
	return key + "/" + e.Result
}
// intStr returns the base-10 representation of i, with a leading "-"
// for negative values.
//
// It delegates to strconv.Itoa. The previous hand-rolled conversion
// negated i before extracting digits, which overflows for the minimum
// int value and made the function return a bare "-" for that input.
func intStr(i int) string {
	return strconv.Itoa(i)
}
// TestExpiryAlerts_NilPolicy_FallsToDefault pins the behavior for a
// cert that has no renewal policy attached (RenewalPolicyID == "").
// No policy is seeded at all; the test expects four Email alerts for a
// cert expiring today and nothing on any other channel — the same
// outcome as TestExpiryAlerts_DefaultMatrix_EmailOnly, reached through
// a missing policy rather than a policy with a nil AlertChannels.
func TestExpiryAlerts_NilPolicy_FallsToDefault(t *testing.T) {
	fx := newMatrixFixture(t)

	// Empty policy ID: the cert references no policy.
	fx.certRepo.AddCert(newExpiringCert("mc-nopolicy", 0, ""))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	emailAlerts := fx.notifs["Email"].count()
	if emailAlerts != 4 {
		t.Errorf("expected 4 Email alerts (default thresholds, default matrix), got %d", emailAlerts)
	}

	silentChannels := []string{"Slack", "Teams", "PagerDuty", "OpsGenie", "Webhook"}
	for _, name := range silentChannels {
		n := fx.notifs[name].count()
		if n == 0 {
			continue
		}
		t.Errorf("expected 0 %s alerts when policy is missing, got %d", name, n)
	}
}
// TestExpiryAlerts_OperatorOptOutOfTier pins tier opt-out: an explicit
// empty channel list for a severity tier means that tier sends nothing,
// while the remaining tiers keep working. Here informational (T-30) is
// opted out — "don't tell us about a cert until it is a real warning" —
// so only the warning (T-14) and critical (T-0) tiers deliver. The
// skipped tier must leave an expiration_alert_skipped_no_channels
// audit event.
func TestExpiryAlerts_OperatorOptOutOfTier(t *testing.T) {
	fx := newMatrixFixture(t)

	fx.policyRepo.AddPolicy(&domain.RenewalPolicy{
		ID:                  "rp-optout",
		Name:                "Opt-out Test",
		RenewalWindowDays:   30,
		AutoRenew:           true,
		MaxRetries:          3,
		RetryInterval:       300,
		AlertThresholdsDays: []int{30, 14, 0},
		AlertChannels: map[string][]string{
			// Empty (not absent) list: informational is opted out.
			domain.AlertSeverityInformational: {},
			domain.AlertSeverityWarning:       {"Email"},
			domain.AlertSeverityCritical:      {"PagerDuty", "Email"},
		},
		AlertSeverityMap: map[int]string{
			30: domain.AlertSeverityInformational,
			14: domain.AlertSeverityWarning,
			0:  domain.AlertSeverityCritical,
		},
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
	})
	fx.certRepo.AddCert(newExpiringCert("mc-optout", 0, "rp-optout"))

	if err := fx.rs.CheckExpiringCertificates(context.Background()); err != nil {
		t.Fatalf("CheckExpiringCertificates: %v", err)
	}

	// Email gets one warning (T-14) and one critical (T-0) alert.
	emailAlerts := fx.notifs["Email"].count()
	if emailAlerts != 2 {
		t.Errorf("Email expected 2 alerts (warning + critical), got %d", emailAlerts)
	}

	// PagerDuty is only on the critical tier.
	pagerDutyAlerts := fx.notifs["PagerDuty"].count()
	if pagerDutyAlerts != 1 {
		t.Errorf("PagerDuty expected 1 alert (critical), got %d", pagerDutyAlerts)
	}

	// Channels that appear in no tier stay silent.
	unusedChannels := []string{"Slack", "Teams", "OpsGenie", "Webhook"}
	for _, name := range unusedChannels {
		n := fx.notifs[name].count()
		if n == 0 {
			continue
		}
		t.Errorf("expected 0 %s alerts (informational opt-out), got %d", name, n)
	}

	// The opted-out tier (informational at threshold 30) must be audited.
	var skipAudited bool
	for _, event := range fx.auditRepo.Events {
		if event.Action == "expiration_alert_skipped_no_channels" {
			skipAudited = true
			break
		}
	}
	if !skipAudited {
		t.Errorf("expected expiration_alert_skipped_no_channels audit row for opted-out tier; not found")
	}
}