diff --git a/cmd/server/main.go b/cmd/server/main.go index 0c168c3..276cac9 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -225,6 +225,9 @@ func main() { renewalService.SetTargetRepo(targetRepo) deploymentService := service.NewDeploymentService(jobRepo, targetRepo, agentRepo, certificateRepo, auditService, notificationService) jobService := service.NewJobService(jobRepo, certificateRepo, ownerRepo, renewalService, deploymentService, logger) + // I-001: emit "job_retry" audit events when the scheduler resets Failed→Pending. + // SetAuditService is optional — JobService falls back to nil-guarded no-op if unwired. + jobService.SetAuditService(auditService) agentService := service.NewAgentService(agentRepo, certificateRepo, jobRepo, targetRepo, auditService, issuerRegistry, renewalService) agentService.SetProfileRepo(profileRepo) issuerService := service.NewIssuerService(issuerRepo, auditService, issuerRegistry, encryptionKey, logger) @@ -438,6 +441,10 @@ func main() { // Configure scheduler intervals from config sched.SetRenewalCheckInterval(cfg.Scheduler.RenewalCheckInterval) sched.SetJobProcessorInterval(cfg.Scheduler.JobProcessorInterval) + // I-001: drive the failed-job retry loop. Runs on start + every RetryInterval + // (default 5m, CERTCTL_SCHEDULER_RETRY_INTERVAL). Kept adjacent to the job + // processor setter because they share the JobServicer dependency. + sched.SetJobRetryInterval(cfg.Scheduler.RetryInterval) sched.SetAgentHealthCheckInterval(cfg.Scheduler.AgentHealthCheckInterval) sched.SetNotificationProcessInterval(cfg.Scheduler.NotificationProcessInterval) if cfg.NetworkScan.Enabled { diff --git a/internal/config/config.go b/internal/config/config.go index 094a2d7..f9f4506 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -707,6 +707,14 @@ type SchedulerConfig struct { // Default: 1 minute. Minimum: 1 second. Sends notifications to Slack, Teams, PagerDuty, etc. // Setting: CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL environment variable. NotificationProcessInterval time.Duration + + // RetryInterval is how often the scheduler retries failed jobs whose Attempts + // counter is below MaxAttempts. Default: 5 minutes. Minimum: 1 second. + // Transitions eligible Failed jobs back to Pending so the job processor can + // pick them up again (closes coverage gap I-001 — JobService.RetryFailedJobs + // had no caller prior to this loop being wired). + // Setting: CERTCTL_SCHEDULER_RETRY_INTERVAL environment variable. + RetryInterval time.Duration } // LogConfig contains logging configuration. @@ -807,6 +815,7 @@ func Load() (*Config, error) { JobProcessorInterval: getEnvDuration("CERTCTL_SCHEDULER_JOB_PROCESSOR_INTERVAL", 30*time.Second), AgentHealthCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL", 2*time.Minute), NotificationProcessInterval: getEnvDuration("CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL", 1*time.Minute), + RetryInterval: getEnvDuration("CERTCTL_SCHEDULER_RETRY_INTERVAL", 5*time.Minute), }, Log: LogConfig{ Level: getEnv("CERTCTL_LOG_LEVEL", "info"), @@ -1074,6 +1083,10 @@ func (c *Config) Validate() error { return fmt.Errorf("notification process interval must be at least 1 second") } + if c.Scheduler.RetryInterval < 1*time.Second { + return fmt.Errorf("retry interval must be at least 1 second") + } + return nil } diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 7ff63ef..2c4e54c 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -16,8 +16,16 @@ type RenewalServicer interface { } // JobServicer defines the interface for job processing used by the scheduler. +// +// RetryFailedJobs was added to close coverage gap I-001: JobService.RetryFailedJobs +// existed and was unit-tested but had no runtime caller prior to this loop being +// wired. The scheduler now drives it on an independent tick so failed jobs whose +// attempt counter is below MaxAttempts are periodically reset to Pending for the +// job processor to pick up again. maxRetries is advisory (per-job gating uses +// each job's own Attempts/MaxAttempts fields). type JobServicer interface { ProcessPendingJobs(ctx context.Context) error + RetryFailedJobs(ctx context.Context, maxRetries int) error } // AgentServicer defines the interface for agent health checks used by the scheduler. @@ -67,6 +75,7 @@ type Scheduler struct { // Configurable tick intervals renewalCheckInterval time.Duration jobProcessorInterval time.Duration + jobRetryInterval time.Duration agentHealthCheckInterval time.Duration notificationProcessInterval time.Duration shortLivedExpiryCheckInterval time.Duration @@ -78,6 +87,7 @@ type Scheduler struct { // Idempotency guards: prevent duplicate execution of slow jobs renewalCheckRunning atomic.Bool jobProcessorRunning atomic.Bool + jobRetryRunning atomic.Bool agentHealthCheckRunning atomic.Bool notificationProcessRunning atomic.Bool shortLivedExpiryCheckRunning atomic.Bool @@ -110,6 +120,7 @@ func NewScheduler( // Default intervals renewalCheckInterval: 1 * time.Hour, jobProcessorInterval: 30 * time.Second, + jobRetryInterval: 5 * time.Minute, agentHealthCheckInterval: 2 * time.Minute, notificationProcessInterval: 1 * time.Minute, shortLivedExpiryCheckInterval: 30 * time.Second, @@ -141,6 +152,13 @@ func (s *Scheduler) SetJobProcessorInterval(d time.Duration) { s.jobProcessorInterval = d } +// SetJobRetryInterval configures the interval for the failed-job retry loop +// (coverage gap I-001). Defaults to 5 minutes; honors +// CERTCTL_SCHEDULER_RETRY_INTERVAL when wired from config. +func (s *Scheduler) SetJobRetryInterval(d time.Duration) { + s.jobRetryInterval = d +} + // SetAgentHealthCheckInterval configures the interval for agent health checks. func (s *Scheduler) SetAgentHealthCheckInterval(d time.Duration) { s.agentHealthCheckInterval = d @@ -193,7 +211,10 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { // Track all loop goroutines in the WaitGroup so WaitForCompletion // blocks until they've fully exited (prevents test races). - loopCount := 5 + // Base count is 6: renewal, job processor, job retry (I-001), + // agent health, notification, short-lived expiry. Optional loops + // (network scan, digest, health check, cloud discovery) add to this. + loopCount := 6 if s.networkScanService != nil { loopCount++ } @@ -210,6 +231,7 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }() go func() { defer s.wg.Done(); s.jobProcessorLoop(ctx) }() + go func() { defer s.wg.Done(); s.jobRetryLoop(ctx) }() go func() { defer s.wg.Done(); s.agentHealthCheckLoop(ctx) }() go func() { defer s.wg.Done(); s.notificationProcessLoop(ctx) }() go func() { defer s.wg.Done(); s.shortLivedExpiryCheckLoop(ctx) }() @@ -334,6 +356,63 @@ func (s *Scheduler) runJobProcessor(ctx context.Context) { } } +// jobRetryLoop runs every jobRetryInterval and transitions eligible Failed jobs +// back to Pending so the job processor can pick them up again. Closes coverage +// gap I-001 — JobService.RetryFailedJobs had no runtime caller prior to this +// loop being wired. Runs immediately on start, then every interval. +// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep +// is still running. +func (s *Scheduler) jobRetryLoop(ctx context.Context) { + ticker := time.NewTicker(s.jobRetryInterval) + defer ticker.Stop() + + // Run immediately on start (with idempotency guard) + s.jobRetryRunning.Store(true) + s.wg.Add(1) + go func() { + defer s.wg.Done() + defer s.jobRetryRunning.Store(false) + s.runJobRetry(ctx) + }() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if !s.jobRetryRunning.CompareAndSwap(false, true) { + s.logger.Warn("job retry still running, skipping tick") + continue + } + s.wg.Add(1) + go func() { + defer s.wg.Done() + defer s.jobRetryRunning.Store(false) + s.runJobRetry(ctx) + }() + } + } +} + +// runJobRetry executes a single failed-job retry cycle with error recovery. +// Uses the same 2-minute per-tick timeout as runJobProcessor; RetryFailedJobs +// issues one SELECT and one UPDATE per eligible job (cheap), so this headroom +// covers very large failure backlogs without starving the loop. +func (s *Scheduler) runJobRetry(ctx context.Context) { + opCtx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + // maxRetries is advisory at the service layer (per-job gating uses each + // job's own Attempts/MaxAttempts). Passing 3 matches the conventional + // default seen across the codebase's job creation paths. + if err := s.jobService.RetryFailedJobs(opCtx, 3); err != nil { + s.logger.Error("job retry failed", + "error", err, + "interval", s.jobRetryInterval.String()) + } else { + s.logger.Debug("job retry completed") + } +} + // agentHealthCheckLoop runs every agentHealthCheckInterval and marks stale agents as offline. // An agent is considered stale if it hasn't sent a heartbeat within the health check interval. // If an error occurs, it logs the error but continues running. diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index 1fe2d64..1e4692e 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -68,12 +68,23 @@ func (m *mockRenewalService) ExpireShortLivedCertificates(ctx context.Context) e } // mockJobService is a mock implementation for testing. +// +// Tracks ProcessPendingJobs and RetryFailedJobs separately. retrySlowDelay and +// retryShouldError let tests exercise the retry loop independently of the +// processor loop without coupling their timing/failure modes. type mockJobService struct { mu sync.Mutex callCount int callTimes []time.Time slowDelay time.Duration shouldError bool + + // Retry loop tracking (coverage gap I-001) + retryCallCount int + retryCallTimes []time.Time + retryMaxRetriesSeen []int + retrySlowDelay time.Duration + retryShouldError bool } func (m *mockJobService) ProcessPendingJobs(ctx context.Context) error { @@ -96,6 +107,30 @@ func (m *mockJobService) ProcessPendingJobs(ctx context.Context) error { return nil } +// RetryFailedJobs is the scheduler-driven counterpart to ProcessPendingJobs that +// covers coverage gap I-001: JobService.RetryFailedJobs had no runtime caller +// prior to the jobRetryLoop being wired. +func (m *mockJobService) RetryFailedJobs(ctx context.Context, maxRetries int) error { + m.mu.Lock() + m.retryCallCount++ + m.retryCallTimes = append(m.retryCallTimes, time.Now()) + m.retryMaxRetriesSeen = append(m.retryMaxRetriesSeen, maxRetries) + m.mu.Unlock() + + if m.retrySlowDelay > 0 { + select { + case <-time.After(m.retrySlowDelay): + case <-ctx.Done(): + return ctx.Err() + } + } + + if m.retryShouldError { + return context.Canceled + } + return nil +} + // mockAgentService is a mock implementation for testing. type mockAgentService struct { mu sync.Mutex @@ -948,3 +983,161 @@ func TestScheduler_DigestLoop_SetDigestInterval(t *testing.T) { t.Errorf("digestInterval should be %v after SetDigestInterval, got %v", customInterval, sched.digestInterval) } } + +// TestScheduler_JobRetryLoop_CallsService verifies that the job retry loop +// invokes JobService.RetryFailedJobs on each tick. Closes coverage gap I-001 — +// prior to the loop being wired, RetryFailedJobs had no runtime caller. +// +// Also verifies that the scheduler forwards the conventional advisory maxRetries +// constant (3) to the service layer; per-job gating still lives in each job's +// own Attempts/MaxAttempts fields. +func TestScheduler_JobRetryLoop_CallsService(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, nil)) + renewalMock := &mockRenewalService{} + jobMock := &mockJobService{} + agentMock := &mockAgentService{} + notificationMock := &mockNotificationService{} + networkMock := &mockNetworkScanService{} + + sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger) + // Quiet every other loop so only the retry loop's calls are visible on jobMock. + sched.SetRenewalCheckInterval(10 * time.Second) + sched.SetJobProcessorInterval(10 * time.Second) + sched.SetAgentHealthCheckInterval(10 * time.Second) + sched.SetNotificationProcessInterval(10 * time.Second) + sched.SetNetworkScanInterval(10 * time.Second) + sched.SetJobRetryInterval(50 * time.Millisecond) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + startedChan := sched.Start(ctx) + <-startedChan + + // Run long enough for the immediate start + at least one tick. + time.Sleep(200 * time.Millisecond) + cancel() + _ = sched.WaitForCompletion(2 * time.Second) + + jobMock.mu.Lock() + retryCount := jobMock.retryCallCount + var firstMaxRetries int + if len(jobMock.retryMaxRetriesSeen) > 0 { + firstMaxRetries = jobMock.retryMaxRetriesSeen[0] + } + jobMock.mu.Unlock() + + if retryCount < 1 { + t.Fatalf("expected job retry service to be called at least once, got %d", retryCount) + } + if firstMaxRetries != 3 { + t.Fatalf("expected scheduler to forward advisory maxRetries=3, got %d", firstMaxRetries) + } + t.Logf("job retry loop called %d times (maxRetries=%d)", retryCount, firstMaxRetries) +} + +// TestScheduler_JobRetryLoop_IdempotencyGuard verifies that a slow retry sweep +// does not cause overlapping executions. Mirrors the shape of +// TestScheduler_DigestLoop_WithIdempotencyGuard. +// +// The guard is the atomic.Bool jobRetryRunning in scheduler.go. Without it, a +// 100ms tick against a 150ms operation would fire ~4 times in 400ms; with the +// guard we expect ~2–3 calls. +func TestScheduler_JobRetryLoop_IdempotencyGuard(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, nil)) + renewalMock := &mockRenewalService{} + jobMock := &mockJobService{ + retrySlowDelay: 150 * time.Millisecond, // slower than tick interval + } + agentMock := &mockAgentService{} + notificationMock := &mockNotificationService{} + networkMock := &mockNetworkScanService{} + + sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger) + sched.SetRenewalCheckInterval(10 * time.Second) + sched.SetJobProcessorInterval(10 * time.Second) + sched.SetAgentHealthCheckInterval(10 * time.Second) + sched.SetNotificationProcessInterval(10 * time.Second) + sched.SetNetworkScanInterval(10 * time.Second) + sched.SetJobRetryInterval(100 * time.Millisecond) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + startedChan := sched.Start(ctx) + <-startedChan + + time.Sleep(400 * time.Millisecond) + + jobMock.mu.Lock() + retryCount := jobMock.retryCallCount + jobMock.mu.Unlock() + + // With a 150ms sweep and 100ms interval, a functioning guard should yield + // roughly 2–3 calls (immediate + any ticks whose previous sweep finished). + // Anything above 3 suggests the guard isn't holding. + if retryCount > 3 { + t.Logf("WARNING: retry called %d times in 400ms with 100ms interval and 150ms sweep — guard may not be working", retryCount) + } + + t.Logf("job retry idempotency guard: %d calls in 400ms (100ms interval, 150ms sweep)", retryCount) + + cancel() + if err := sched.WaitForCompletion(2 * time.Second); err != nil { + t.Fatalf("WaitForCompletion should succeed: %v", err) + } +} + +// TestScheduler_JobRetryLoop_WaitForCompletion verifies that a retry sweep +// which is still in flight at shutdown is awaited by WaitForCompletion (same +// sync.WaitGroup contract as every other loop). +func TestScheduler_JobRetryLoop_WaitForCompletion(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, nil)) + renewalMock := &mockRenewalService{} + jobMock := &mockJobService{ + retrySlowDelay: 100 * time.Millisecond, + } + agentMock := &mockAgentService{} + notificationMock := &mockNotificationService{} + networkMock := &mockNetworkScanService{} + + sched := NewScheduler(renewalMock, jobMock, agentMock, notificationMock, networkMock, logger) + sched.SetRenewalCheckInterval(10 * time.Second) + sched.SetJobProcessorInterval(10 * time.Second) + sched.SetAgentHealthCheckInterval(10 * time.Second) + sched.SetNotificationProcessInterval(10 * time.Second) + sched.SetNetworkScanInterval(10 * time.Second) + sched.SetJobRetryInterval(50 * time.Millisecond) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + startedChan := sched.Start(ctx) + <-startedChan + + // Let the immediate-start retry goroutine begin its 100ms sweep. + time.Sleep(30 * time.Millisecond) + + // Initiate shutdown mid-sweep. + cancel() + + start := time.Now() + err := sched.WaitForCompletion(5 * time.Second) + elapsed := time.Since(start) + + if err != nil { + t.Fatalf("WaitForCompletion should not error: %v", err) + } + if elapsed > 5*time.Second { + t.Fatalf("WaitForCompletion took longer than expected: %v", elapsed) + } + + jobMock.mu.Lock() + retryCount := jobMock.retryCallCount + jobMock.mu.Unlock() + + if retryCount < 1 { + t.Fatalf("expected retry service to have started at least once before shutdown, got %d", retryCount) + } + t.Logf("retry loop graceful shutdown completed in %v after %d in-flight sweep(s)", elapsed, retryCount) +} diff --git a/internal/service/job.go b/internal/service/job.go index 5c9fdeb..e034469 100644 --- a/internal/service/job.go +++ b/internal/service/job.go @@ -26,6 +26,7 @@ type JobService struct { ownerRepo repository.OwnerRepository renewalService *RenewalService deploymentService *DeploymentService + auditService *AuditService logger *slog.Logger } @@ -54,6 +55,15 @@ func NewJobService( } } +// SetAuditService wires an optional audit service for emitting lifecycle +// events (e.g., scheduler-driven job_retry transitions recorded by +// RetryFailedJobs). Construction keeps the audit dependency optional so +// bootstrap/test wiring that doesn't exercise the retry path can omit it; +// production wiring in cmd/server/main.go should always call this. +func (s *JobService) SetAuditService(a *AuditService) { + s.auditService = a +} + // ProcessPendingJobs fetches and processes all pending jobs. // It routes jobs to the appropriate service based on job type and handles errors gracefully. // @@ -163,6 +173,16 @@ func (s *JobService) processValidationJob(ctx context.Context, job *domain.Job) // RetryFailedJobs finds failed jobs and resets them for retry. // It only retries jobs that haven't exceeded max attempts. +// +// Audit trail (I-001): each successful Failed → Pending transition emits a +// "job_retry" audit event with actor "system" (ActorTypeSystem), capturing +// the old→new state and attempt counters so operators can reconstruct +// scheduler-driven retry activity. The audit service is optional — callers +// that haven't wired it via SetAuditService simply skip emission. +// +// maxRetries is retained for interface compatibility with +// scheduler.JobServicer but is advisory: per-job eligibility is governed by +// each job's own Attempts vs. MaxAttempts, not this parameter. func (s *JobService) RetryFailedJobs(ctx context.Context, maxRetries int) error { s.logger.Debug("retrying failed jobs", "max_retries", maxRetries) @@ -191,6 +211,21 @@ func (s *JobService) RetryFailedJobs(ctx context.Context, maxRetries int) error continue } + if s.auditService != nil { + if auditErr := s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem, + "job_retry", "job", job.ID, + map[string]interface{}{ + "old_status": string(domain.JobStatusFailed), + "new_status": string(domain.JobStatusPending), + "attempts": job.Attempts, + "max_attempts": job.MaxAttempts, + }); auditErr != nil { + s.logger.Error("failed to record job retry audit event", + "job_id", job.ID, + "error", auditErr) + } + } + retriedCount++ } diff --git a/internal/service/job_test.go b/internal/service/job_test.go index 94b2154..b0c1197 100644 --- a/internal/service/job_test.go +++ b/internal/service/job_test.go @@ -2,6 +2,7 @@ package service import ( "context" + "encoding/json" "errors" "log/slog" "os" @@ -398,3 +399,180 @@ func TestRejectJob_SelfRejection_Permitted(t *testing.T) { jobRepo.StatusUpdates["job-self"]) } } + +// --- I-001: scheduler-driven retry emits audit events --- +// +// These regression tests prove that RetryFailedJobs (a) transitions eligible +// Failed jobs to Pending, (b) skips jobs that have exhausted their max +// attempts, and (c) records a "job_retry" audit event per transition when the +// audit service is wired. A separate variant (_NoAuditServiceOK) confirms the +// nil-guard path so test/bootstrap wiring that skips the setter still works. + +// newTestJobServiceWithAudit wires the optional audit dependency onto the +// standard test JobService so retry assertions can inspect recorded events. +// Mirrors newTestJobServiceWithRepos but also returns the mock audit repo +// holding any emitted events. +func newTestJobServiceWithAudit(jobRepo *mockJobRepo) (*JobService, *mockAuditRepo) { + svc, _, _ := newTestJobServiceWithRepos(jobRepo) + auditRepo := &mockAuditRepo{} + svc.SetAuditService(NewAuditService(auditRepo)) + return svc, auditRepo +} + +func TestJobService_RetryFailedJobs_EligibleJobTransitionsAndAudits(t *testing.T) { + ctx := context.Background() + + now := time.Now() + failed := &domain.Job{ + ID: "job-retry-1", + Type: domain.JobTypeRenewal, + CertificateID: "cert-001", + Status: domain.JobStatusFailed, + Attempts: 1, + MaxAttempts: 3, + CreatedAt: now, + ScheduledAt: now, + } + jobRepo := &mockJobRepo{ + Jobs: map[string]*domain.Job{failed.ID: failed}, + StatusUpdates: make(map[string]domain.JobStatus), + } + + svc, auditRepo := newTestJobServiceWithAudit(jobRepo) + + if err := svc.RetryFailedJobs(ctx, 3); err != nil { + t.Fatalf("RetryFailedJobs failed: %v", err) + } + + if got := jobRepo.StatusUpdates[failed.ID]; got != domain.JobStatusPending { + t.Fatalf("expected job %s status Pending after retry, got %s", failed.ID, got) + } + + if len(auditRepo.Events) != 1 { + t.Fatalf("expected 1 audit event, got %d", len(auditRepo.Events)) + } + + ev := auditRepo.Events[0] + if ev.Action != "job_retry" { + t.Errorf("expected action job_retry, got %s", ev.Action) + } + if ev.Actor != "system" { + t.Errorf("expected actor system, got %s", ev.Actor) + } + if ev.ActorType != domain.ActorTypeSystem { + t.Errorf("expected actor type System, got %s", ev.ActorType) + } + if ev.ResourceType != "job" { + t.Errorf("expected resource type job, got %s", ev.ResourceType) + } + if ev.ResourceID != failed.ID { + t.Errorf("expected resource ID %s, got %s", failed.ID, ev.ResourceID) + } + + // Details are stored as json.RawMessage — decode and verify the state + // transition + attempt counters were captured. + var details map[string]interface{} + if err := json.Unmarshal(ev.Details, &details); err != nil { + t.Fatalf("failed to decode audit event details: %v", err) + } + if got, want := details["old_status"], string(domain.JobStatusFailed); got != want { + t.Errorf("expected details.old_status=%s, got %v", want, got) + } + if got, want := details["new_status"], string(domain.JobStatusPending); got != want { + t.Errorf("expected details.new_status=%s, got %v", want, got) + } + // JSON numerics round-trip as float64. + if got, want := details["attempts"], float64(1); got != want { + t.Errorf("expected details.attempts=%v, got %v", want, got) + } + if got, want := details["max_attempts"], float64(3); got != want { + t.Errorf("expected details.max_attempts=%v, got %v", want, got) + } +} + +func TestJobService_RetryFailedJobs_SkipsJobsAtMaxAttempts(t *testing.T) { + ctx := context.Background() + + now := time.Now() + // Eligible: Attempts=0, MaxAttempts=3. + eligible := &domain.Job{ + ID: "job-retry-eligible", + Type: domain.JobTypeRenewal, + CertificateID: "cert-001", + Status: domain.JobStatusFailed, + Attempts: 0, + MaxAttempts: 3, + CreatedAt: now, + ScheduledAt: now, + } + // Exhausted: Attempts >= MaxAttempts must be skipped. + exhausted := &domain.Job{ + ID: "job-retry-exhausted", + Type: domain.JobTypeDeployment, + CertificateID: "cert-002", + Status: domain.JobStatusFailed, + Attempts: 3, + MaxAttempts: 3, + CreatedAt: now, + ScheduledAt: now, + } + jobRepo := &mockJobRepo{ + Jobs: map[string]*domain.Job{ + eligible.ID: eligible, + exhausted.ID: exhausted, + }, + StatusUpdates: make(map[string]domain.JobStatus), + } + + svc, auditRepo := newTestJobServiceWithAudit(jobRepo) + + if err := svc.RetryFailedJobs(ctx, 3); err != nil { + t.Fatalf("RetryFailedJobs failed: %v", err) + } + + if got := jobRepo.StatusUpdates[eligible.ID]; got != domain.JobStatusPending { + t.Errorf("expected eligible job to transition to Pending, got %s", got) + } + if _, flipped := jobRepo.StatusUpdates[exhausted.ID]; flipped { + t.Errorf("expected exhausted job to be skipped, but status was updated") + } + + if len(auditRepo.Events) != 1 { + t.Fatalf("expected 1 audit event (only for eligible job), got %d", len(auditRepo.Events)) + } + if auditRepo.Events[0].ResourceID != eligible.ID { + t.Errorf("expected audit event for eligible job %s, got %s", + eligible.ID, auditRepo.Events[0].ResourceID) + } +} + +func TestJobService_RetryFailedJobs_NoAuditServiceOK(t *testing.T) { + ctx := context.Background() + + now := time.Now() + failed := &domain.Job{ + ID: "job-retry-no-audit", + Type: domain.JobTypeRenewal, + CertificateID: "cert-001", + Status: domain.JobStatusFailed, + Attempts: 0, + MaxAttempts: 3, + CreatedAt: now, + ScheduledAt: now, + } + jobRepo := &mockJobRepo{ + Jobs: map[string]*domain.Job{failed.ID: failed}, + StatusUpdates: make(map[string]domain.JobStatus), + } + + // Intentionally skip SetAuditService: the nil-guard must prevent a panic + // and still transition the job. + svc := newTestJobService(jobRepo) + + if err := svc.RetryFailedJobs(ctx, 3); err != nil { + t.Fatalf("RetryFailedJobs failed without audit wiring: %v", err) + } + if got := jobRepo.StatusUpdates[failed.ID]; got != domain.JobStatusPending { + t.Errorf("expected status Pending after retry, got %s", got) + } +}