I-003: job timeout reaper closes AwaitingCSR/AwaitingApproval gap

Add 11th always-on scheduler loop that transitions jobs stuck in
AwaitingCSR (default 24h TTL) or AwaitingApproval (default 168h TTL)
to Failed. I-001's retry loop then auto-promotes eligible Failed jobs
back to Pending. No new status enum, no schema migration.

- JobRepository.ListTimedOutAwaitingJobs with per-status cutoff WHERE
- JobService.ReapTimedOutJobs mirrors RetryFailedJobs structure
- Scheduler jobTimeoutLoop with atomic.Bool idempotency guard, 2m
  per-tick context, WaitGroup shutdown drain
- Config: CERTCTL_JOB_TIMEOUT_INTERVAL (10m), CERTCTL_JOB_AWAITING_CSR_TIMEOUT
  (24h), CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT (168h)
- Audit event per transition: actor=system, actorType=System,
  action=job_timeout, details={old_status, new_status, timeout_reason,
  age_hours}
- 14 new tests: 3 config, 7 service, 4 scheduler
This commit is contained in:
shankar0123
2026-04-19 01:04:56 +00:00
parent 4bc8b3e723
commit 1ee77c89f8
12 changed files with 1020 additions and 14 deletions
+45 -11
View File
@@ -156,17 +156,20 @@ func (m *mockCertRepo) AddCert(cert *domain.ManagedCertificate) {
// mockJobRepo is a test implementation of JobRepository
type mockJobRepo struct {
mu sync.Mutex
Jobs map[string]*domain.Job
StatusUpdates map[string]domain.JobStatus
CreateErr error
UpdateErr error
UpdateStatusErr error
GetErr error
ListErr error
ListByStatusErr error
DeleteErr error
Updated []*domain.Job
mu sync.Mutex
Jobs map[string]*domain.Job
StatusUpdates map[string]domain.JobStatus
CreateErr error
UpdateErr error
UpdateErrorByID map[string]error
UpdateErrorByIDMu sync.Mutex
UpdateStatusErr error
GetErr error
ListErr error
ListByStatusErr error
DeleteErr error
ListTimedOutErr error
Updated []*domain.Job
}
func (m *mockJobRepo) List(ctx context.Context) ([]*domain.Job, error) {
@@ -211,6 +214,13 @@ func (m *mockJobRepo) Update(ctx context.Context, job *domain.Job) error {
if m.UpdateErr != nil {
return m.UpdateErr
}
// Check per-ID error injection
m.UpdateErrorByIDMu.Lock()
idErr, ok := m.UpdateErrorByID[job.ID]
m.UpdateErrorByIDMu.Unlock()
if ok && idErr != nil {
return idErr
}
m.Jobs[job.ID] = job
m.Updated = append(m.Updated, job)
return nil
@@ -352,6 +362,30 @@ func (m *mockJobRepo) ClaimPendingByAgentID(ctx context.Context, agentID string)
return result, nil
}
// ListTimedOutAwaitingJobs returns jobs stuck in AwaitingCSR/AwaitingApproval past the
// respective cutoffs. I-003 coverage-gap closure.
func (m *mockJobRepo) ListTimedOutAwaitingJobs(ctx context.Context, csrCutoff, approvalCutoff time.Time) ([]*domain.Job, error) {
m.mu.Lock()
defer m.mu.Unlock()
if m.ListTimedOutErr != nil {
return nil, m.ListTimedOutErr
}
var jobs []*domain.Job
for _, j := range m.Jobs {
switch j.Status {
case domain.JobStatusAwaitingCSR:
if j.CreatedAt.Before(csrCutoff) {
jobs = append(jobs, j)
}
case domain.JobStatusAwaitingApproval:
if j.CreatedAt.Before(approvalCutoff) {
jobs = append(jobs, j)
}
}
}
return jobs, nil
}
func (m *mockJobRepo) AddJob(job *domain.Job) {
m.mu.Lock()
defer m.mu.Unlock()