Implement M3: expiration threshold alerting with dedup and status transitions

- Add alert_thresholds_days JSONB column to renewal_policies (default [30,14,7,0])
- Add RenewalPolicy.AlertThresholdsDays field + EffectiveAlertThresholds() helper
- Add RenewalPolicyRepository interface + postgres implementation
- Rewrite CheckExpiringCertificates with per-policy threshold alerting
- Add SendThresholdAlert + HasThresholdNotification for deduplication via [threshold:N] tags
- Add Type and MessageLike filters to NotificationFilter + postgres query support
- Auto-transition certs to Expiring (>0 days) or Expired (<=0 days) status
- Record expiration_alert_sent audit events per threshold crossing
- Fix .gitignore: allow SQL migration files, scope server/agent build artifact rules
- Track previously untracked cmd/ and migrations/ directories
- Update docs (README, architecture, demo-advanced) for threshold alerting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-03-15 00:03:43 -04:00
parent ae67b10708
commit 1d1b89c9b5
17 changed files with 1485 additions and 37 deletions
+22 -8
View File
@@ -54,12 +54,26 @@ const (
// RenewalPolicy defines renewal parameters for a managed certificate.
type RenewalPolicy struct {
ID string `json:"id"`
Name string `json:"name"`
RenewalWindowDays int `json:"renewal_window_days"`
AutoRenew bool `json:"auto_renew"`
MaxRetries int `json:"max_retries"`
RetryInterval int `json:"retry_interval_seconds"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
ID string `json:"id"`
Name string `json:"name"`
RenewalWindowDays int `json:"renewal_window_days"`
AutoRenew bool `json:"auto_renew"`
MaxRetries int `json:"max_retries"`
RetryInterval int `json:"retry_interval_seconds"`
AlertThresholdsDays []int `json:"alert_thresholds_days"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// DefaultAlertThresholds returns the alert thresholds, in days before expiry,
// that apply when a policy configures none of its own: 30, 14, 7 and 0.
// Every call yields a newly allocated slice, so callers may modify the result
// without affecting later calls.
func DefaultAlertThresholds() []int {
	defaults := [...]int{30, 14, 7, 0}
	return defaults[:]
}
// EffectiveAlertThresholds reports the thresholds that should drive alerting
// for this policy. A policy with no configured thresholds (nil or empty
// AlertThresholdsDays) gets DefaultAlertThresholds(); otherwise the policy's
// own slice is returned as-is, without copying.
func (p *RenewalPolicy) EffectiveAlertThresholds() []int {
	if len(p.AlertThresholdsDays) == 0 {
		return DefaultAlertThresholds()
	}
	return p.AlertThresholdsDays
}
+2
View File
@@ -37,8 +37,10 @@ type AuditFilter struct {
// NotificationFilter defines filtering criteria for notification queries.
// In the postgres notification repository's List query, a string field left
// empty adds no condition for that field.
type NotificationFilter struct {
CertificateID string // optional: filter by certificate
Type string // optional: exact match on notification type (e.g., "ExpirationWarning")
Status string // optional: exact match, e.g., "pending", "sent", "failed"
Channel string // optional: exact match, e.g., "email", "slack", "webhook"
// MessageLike is optional. It is passed unchanged as the right-hand side of
// a SQL LIKE on the message column, so the caller must include any %
// wildcards itself. The threshold dedup check uses "%[threshold:N]%".
MessageLike string
// NOTE(review): how Page and PerPage are applied is not visible in this
// view — confirm against the repository implementation.
Page int
PerPage int
}
+8
View File
@@ -97,6 +97,14 @@ type JobRepository interface {
GetPendingJobs(ctx context.Context, jobType domain.JobType) ([]*domain.Job, error)
}
// RenewalPolicyRepository defines read-only operations for looking up renewal policies.
type RenewalPolicyRepository interface {
// Get retrieves a renewal policy by ID.
// The postgres implementation returns a nil policy and a non-nil error
// when no policy with that ID exists.
Get(ctx context.Context, id string) (*domain.RenewalPolicy, error)
// List returns all renewal policies.
// The postgres implementation orders the result by policy name.
List(ctx context.Context) ([]*domain.RenewalPolicy, error)
}
// PolicyRepository defines operations for managing compliance policies and violations.
type PolicyRepository interface {
// ListRules returns all policy rules.
@@ -67,11 +67,21 @@ func (r *NotificationRepository) List(ctx context.Context, filter *repository.No
args = append(args, filter.CertificateID)
argCount++
}
if filter.Type != "" {
whereConditions = append(whereConditions, fmt.Sprintf("type = $%d", argCount))
args = append(args, filter.Type)
argCount++
}
if filter.Status != "" {
whereConditions = append(whereConditions, fmt.Sprintf("status = $%d", argCount))
args = append(args, filter.Status)
argCount++
}
if filter.MessageLike != "" {
whereConditions = append(whereConditions, fmt.Sprintf("message LIKE $%d", argCount))
args = append(args, filter.MessageLike)
argCount++
}
if filter.Channel != "" {
whereConditions = append(whereConditions, fmt.Sprintf("channel = $%d", argCount))
args = append(args, filter.Channel)
@@ -0,0 +1,92 @@
package postgres
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"github.com/shankar0123/certctl/internal/domain"
)
// RenewalPolicyRepository implements repository.RenewalPolicyRepository
// on top of a database/sql handle.
type RenewalPolicyRepository struct {
	db *sql.DB // handle every query of this repository runs against
}

// NewRenewalPolicyRepository creates a new RenewalPolicyRepository that issues
// its queries through db. The handle is stored as given; it is not validated
// or pinged here.
func NewRenewalPolicyRepository(db *sql.DB) *RenewalPolicyRepository {
	repo := new(RenewalPolicyRepository)
	repo.db = db
	return repo
}
// Get retrieves a renewal policy by ID.
//
// It returns a nil policy and an error when no row matches id (that error
// does not wrap sql.ErrNoRows) or when the query/scan fails (that error wraps
// the underlying cause). The alert_thresholds_days column is decoded as JSON
// into AlertThresholdsDays; an empty column value leaves the field nil, and a
// value that fails to decode is replaced by domain.DefaultAlertThresholds().
//
// NOTE(review): the column read into RetryInterval is retry_interval_minutes,
// while the domain field's JSON tag is retry_interval_seconds — confirm which
// unit is intended.
func (r *RenewalPolicyRepository) Get(ctx context.Context, id string) (*domain.RenewalPolicy, error) {
	const query = `
SELECT id, name, renewal_window_days, auto_renew, max_retries,
retry_interval_minutes, alert_thresholds_days, created_at, updated_at
FROM renewal_policies
WHERE id = $1
`

	var (
		found         = new(domain.RenewalPolicy)
		rawThresholds []byte
	)

	scanErr := r.db.QueryRowContext(ctx, query, id).Scan(
		&found.ID,
		&found.Name,
		&found.RenewalWindowDays,
		&found.AutoRenew,
		&found.MaxRetries,
		&found.RetryInterval,
		&rawThresholds,
		&found.CreatedAt,
		&found.UpdatedAt,
	)
	switch {
	case scanErr == sql.ErrNoRows:
		return nil, fmt.Errorf("renewal policy not found: %s", id)
	case scanErr != nil:
		return nil, fmt.Errorf("failed to query renewal policy: %w", scanErr)
	}

	// Nothing stored in the JSONB column: leave the thresholds unset.
	if len(rawThresholds) == 0 {
		return found, nil
	}

	// Malformed JSON is tolerated on purpose; the policy falls back to the defaults.
	if decodeErr := json.Unmarshal(rawThresholds, &found.AlertThresholdsDays); decodeErr != nil {
		found.AlertThresholdsDays = domain.DefaultAlertThresholds()
	}
	return found, nil
}
// List returns all renewal policies, ordered by name.
//
// Any query, scan or row-iteration failure aborts the listing and is returned
// wrapped; no partial result is returned in that case. For each row the
// alert_thresholds_days column is decoded as JSON; a value that fails to
// decode is replaced by domain.DefaultAlertThresholds(). When the table has
// no rows the returned slice is nil.
//
// NOTE(review): the column read into RetryInterval is retry_interval_minutes,
// while the domain field's JSON tag is retry_interval_seconds — confirm which
// unit is intended.
func (r *RenewalPolicyRepository) List(ctx context.Context) ([]*domain.RenewalPolicy, error) {
	const query = `
SELECT id, name, renewal_window_days, auto_renew, max_retries,
retry_interval_minutes, alert_thresholds_days, created_at, updated_at
FROM renewal_policies
ORDER BY name
`

	rows, err := r.db.QueryContext(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to query renewal policies: %w", err)
	}
	defer rows.Close()

	var result []*domain.RenewalPolicy
	for rows.Next() {
		var (
			item          = new(domain.RenewalPolicy)
			rawThresholds []byte
		)

		scanErr := rows.Scan(
			&item.ID,
			&item.Name,
			&item.RenewalWindowDays,
			&item.AutoRenew,
			&item.MaxRetries,
			&item.RetryInterval,
			&rawThresholds,
			&item.CreatedAt,
			&item.UpdatedAt,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("failed to scan renewal policy: %w", scanErr)
		}

		// Malformed JSON is tolerated on purpose; the policy falls back to the defaults.
		if len(rawThresholds) > 0 && json.Unmarshal(rawThresholds, &item.AlertThresholdsDays) != nil {
			item.AlertThresholdsDays = domain.DefaultAlertThresholds()
		}

		result = append(result, item)
	}

	if iterErr := rows.Err(); iterErr != nil {
		return nil, fmt.Errorf("error iterating renewal policy rows: %w", iterErr)
	}
	return result, nil
}
+37 -5
View File
@@ -34,12 +34,26 @@ func NewNotificationService(
}
}
// SendExpirationWarning sends a certificate expiration warning.
// SendExpirationWarning sends a certificate expiration warning for a specific threshold.
func (s *NotificationService) SendExpirationWarning(ctx context.Context, cert *domain.ManagedCertificate, daysUntilExpiry int) error {
body := fmt.Sprintf(
"The certificate for %s will expire in %d days (%s).\n\nPlease schedule renewal.",
cert.CommonName, daysUntilExpiry, cert.ExpiresAt.Format("2006-01-02"),
)
return s.SendThresholdAlert(ctx, cert, daysUntilExpiry, daysUntilExpiry)
}
// SendThresholdAlert sends an expiration alert for a specific threshold (e.g., 30-day, 14-day, expired).
// The threshold parameter indicates which configured threshold triggered the alert.
func (s *NotificationService) SendThresholdAlert(ctx context.Context, cert *domain.ManagedCertificate, daysUntilExpiry int, threshold int) error {
var body string
if threshold <= 0 {
body = fmt.Sprintf(
"[EXPIRED] The certificate for %s has expired (%s).\n\nImmediate action required.\n\n[threshold:%d]",
cert.CommonName, cert.ExpiresAt.Format("2006-01-02"), threshold,
)
} else {
body = fmt.Sprintf(
"The certificate for %s will expire in %d days (%s).\n\nPlease schedule renewal.\n\n[threshold:%d]",
cert.CommonName, daysUntilExpiry, cert.ExpiresAt.Format("2006-01-02"), threshold,
)
}
// Create notification record
notif := &domain.NotificationEvent{
@@ -61,6 +75,24 @@ func (s *NotificationService) SendExpirationWarning(ctx context.Context, cert *d
return s.sendNotification(ctx, notif)
}
// HasThresholdNotification reports whether an expiration-warning notification
// tagged for the given threshold already exists for the certificate. It is the
// deduplication check used before sending a threshold alert.
//
// The lookup filters by certificate ID and by the ExpirationWarning type, and
// matches the "[threshold:N]" tag that SendThresholdAlert appends to the
// message body via a LIKE pattern. Status and Channel are left unset in the
// filter, so they do not narrow the match. A repository failure is returned
// wrapped, together with false.
func (s *NotificationService) HasThresholdNotification(ctx context.Context, certID string, threshold int) (bool, error) {
	tagPattern := fmt.Sprintf("%%[threshold:%d]%%", threshold)

	matches, err := s.notifRepo.List(ctx, &repository.NotificationFilter{
		CertificateID: certID,
		Type:          string(domain.NotificationTypeExpirationWarning),
		MessageLike:   tagPattern,
		PerPage:       1, // one hit is enough to know an alert exists
	})
	if err != nil {
		return false, fmt.Errorf("failed to check existing notifications: %w", err)
	}

	found := len(matches) > 0
	return found, nil
}
// SendRenewalNotification sends a renewal success or failure notification.
func (s *NotificationService) SendRenewalNotification(ctx context.Context, cert *domain.ManagedCertificate, success bool, err error) error {
var body string
+111 -16
View File
@@ -18,11 +18,12 @@ import (
// RenewalService manages certificate renewal workflows.
type RenewalService struct {
certRepo repository.CertificateRepository
jobRepo repository.JobRepository
auditService *AuditService
notificationSvc *NotificationService
issuerRegistry map[string]IssuerConnector
certRepo repository.CertificateRepository
jobRepo repository.JobRepository
renewalPolicyRepo repository.RenewalPolicyRepository
auditService *AuditService
notificationSvc *NotificationService
issuerRegistry map[string]IssuerConnector
}
// IssuerConnector defines the service-layer interface for interacting with certificate issuers.
@@ -48,29 +49,37 @@ type IssuanceResult struct {
func NewRenewalService(
certRepo repository.CertificateRepository,
jobRepo repository.JobRepository,
renewalPolicyRepo repository.RenewalPolicyRepository,
auditService *AuditService,
notificationSvc *NotificationService,
issuerRegistry map[string]IssuerConnector,
) *RenewalService {
return &RenewalService{
certRepo: certRepo,
jobRepo: jobRepo,
auditService: auditService,
notificationSvc: notificationSvc,
issuerRegistry: issuerRegistry,
certRepo: certRepo,
jobRepo: jobRepo,
renewalPolicyRepo: renewalPolicyRepo,
auditService: auditService,
notificationSvc: notificationSvc,
issuerRegistry: issuerRegistry,
}
}
// CheckExpiringCertificates identifies certificates needing renewal based on policy windows.
// CheckExpiringCertificates identifies certificates needing renewal and sends threshold-based
// expiration alerts. For each certificate, it looks up the renewal policy's configured alert
// thresholds (default: 30, 14, 7, 0 days) and sends deduplicated notifications at each threshold.
// Certificates are also transitioned to Expiring/Expired status as appropriate.
func (s *RenewalService) CheckExpiringCertificates(ctx context.Context) error {
// Default renewal window: 30 days before expiry
renewalWindow := time.Now().AddDate(0, 0, 30)
// Use the maximum possible threshold window (30 days) plus buffer for query
renewalWindow := time.Now().AddDate(0, 0, 31)
expiring, err := s.certRepo.GetExpiringCertificates(ctx, renewalWindow)
if err != nil {
return fmt.Errorf("failed to fetch expiring certificates: %w", err)
}
// Cache renewal policies to avoid repeated lookups
policyCache := make(map[string]*domain.RenewalPolicy)
for _, cert := range expiring {
// Skip if already renewing or archived
if cert.Status == domain.CertificateStatusRenewalInProgress || cert.Status == domain.CertificateStatusArchived {
@@ -80,11 +89,31 @@ func (s *RenewalService) CheckExpiringCertificates(ctx context.Context) error {
// Calculate days until expiry
daysUntil := time.Until(cert.ExpiresAt).Hours() / 24
// Send expiration warning notification (always, regardless of issuer availability)
if err := s.notificationSvc.SendExpirationWarning(ctx, cert, int(daysUntil)); err != nil {
fmt.Printf("failed to send expiration warning for cert %s: %v\n", cert.ID, err)
// Look up renewal policy for alert thresholds
thresholds := domain.DefaultAlertThresholds()
if cert.RenewalPolicyID != "" {
policy, ok := policyCache[cert.RenewalPolicyID]
if !ok {
policy, err = s.renewalPolicyRepo.Get(ctx, cert.RenewalPolicyID)
if err != nil {
// Log but continue with defaults
fmt.Printf("failed to fetch renewal policy %s for cert %s, using defaults: %v\n",
cert.RenewalPolicyID, cert.ID, err)
} else {
policyCache[cert.RenewalPolicyID] = policy
}
}
if policy != nil {
thresholds = policy.EffectiveAlertThresholds()
}
}
// Update certificate status based on expiry
s.updateCertExpiryStatus(ctx, cert, daysUntil)
// Send threshold-based alerts with deduplication
s.sendThresholdAlerts(ctx, cert, int(daysUntil), thresholds)
// Only create renewal job if an issuer connector is registered for this cert's issuer
if _, hasIssuer := s.issuerRegistry[cert.IssuerID]; !hasIssuer {
continue
@@ -137,6 +166,72 @@ func (s *RenewalService) CheckExpiringCertificates(ctx context.Context) error {
return nil
}
// sendThresholdAlerts sends deduplicated expiration notifications based on the
// configured thresholds.
//
// Every threshold the certificate has crossed (daysUntil <= threshold) is
// considered, in the order given. For each one it asks the notification
// service whether an alert tagged with that threshold already exists, and only
// sends a new alert when none does. An "expiration_alert_sent" audit event is
// recorded only after the alert was sent successfully.
//
// The function is best-effort: failures of the dedup lookup, the send, or the
// audit write are logged and processing moves on to the next threshold.
// Nothing is returned to the caller.
func (s *RenewalService) sendThresholdAlerts(ctx context.Context, cert *domain.ManagedCertificate, daysUntil int, thresholds []int) {
	for _, threshold := range thresholds {
		// Only alert if the cert has crossed this threshold (days remaining <= threshold).
		if daysUntil > threshold {
			continue
		}

		// Check if we already sent a notification for this threshold (deduplication).
		alreadySent, err := s.notificationSvc.HasThresholdNotification(ctx, cert.ID, threshold)
		if err != nil {
			fmt.Printf("failed to check notification dedup for cert %s threshold %d: %v\n",
				cert.ID, threshold, err)
			continue
		}
		if alreadySent {
			continue
		}

		// Send the threshold alert.
		if err := s.notificationSvc.SendThresholdAlert(ctx, cert, daysUntil, threshold); err != nil {
			fmt.Printf("failed to send threshold alert for cert %s at %d days: %v\n",
				cert.ID, threshold, err)
			// The alert did not go out, so the audit trail must not claim it did.
			continue
		}

		// Record the audit event for the alert that was just sent. A failure here
		// is logged rather than dropped silently; the alert itself already went out.
		if err := s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
			"expiration_alert_sent", "certificate", cert.ID,
			map[string]interface{}{
				"threshold_days":    threshold,
				"days_until_expiry": daysUntil,
			}); err != nil {
			fmt.Printf("failed to record expiration alert audit event for cert %s threshold %d: %v\n",
				cert.ID, threshold, err)
		}
	}
}
// updateCertExpiryStatus moves a certificate to Expired when daysUntil is zero
// or negative, and to Expiring for any positive daysUntil. The function itself
// applies no upper bound on daysUntil; it relies on the caller to pass only
// certificates that are inside the alerting window.
//
// No update is made when the certificate already has the target status, or
// when it is in RenewalInProgress, Archived or Revoked. Otherwise Status and
// UpdatedAt are set on cert and the certificate is persisted; a persistence
// failure is logged and not returned, and cert keeps the new in-memory values.
func (s *RenewalService) updateCertExpiryStatus(ctx context.Context, cert *domain.ManagedCertificate, daysUntil float64) {
	var target domain.CertificateStatus
	switch {
	case daysUntil <= 0:
		target = domain.CertificateStatusExpired
	default:
		target = domain.CertificateStatusExpiring
	}

	// Leave the cert alone if nothing would change, or if it is in a state
	// that expiry tracking must not override.
	switch cert.Status {
	case target,
		domain.CertificateStatusRenewalInProgress,
		domain.CertificateStatusArchived,
		domain.CertificateStatusRevoked:
		return
	}

	cert.Status = target
	cert.UpdatedAt = time.Now()

	err := s.certRepo.Update(ctx, cert)
	if err == nil {
		return
	}
	fmt.Printf("failed to update cert %s status to %s: %v\n", cert.ID, target, err)
}
// ProcessRenewalJob executes a renewal job: generate CSR, call issuer, store new version,
// update cert status, and create deployment jobs for targets.
//