mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:01:32 +00:00
I-003: job timeout reaper closes AwaitingCSR/AwaitingApproval gap
Add an 11th always-on scheduler loop that transitions jobs stuck in
AwaitingCSR (default 24h TTL) or AwaitingApproval (default 168h TTL)
to Failed. I-001's retry loop then auto-promotes eligible Failed jobs
back to Pending. No new status enum, no schema migration.
- JobRepository.ListTimedOutAwaitingJobs with a per-status cutoff WHERE clause
- JobService.ReapTimedOutJobs mirrors RetryFailedJobs structure
- Scheduler jobTimeoutLoop with atomic.Bool idempotency guard, 2m
per-tick context, WaitGroup shutdown drain
- Config: CERTCTL_JOB_TIMEOUT_INTERVAL (10m), CERTCTL_JOB_AWAITING_CSR_TIMEOUT
(24h), CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT (168h)
- Audit event per transition: actor=system, actorType=System,
action=job_timeout, details={old_status, new_status, timeout_reason,
age_hours}
- 14 new tests: 3 config, 7 service, 4 scheduler
This commit is contained in:
@@ -156,17 +156,20 @@ func (m *mockCertRepo) AddCert(cert *domain.ManagedCertificate) {
|
||||
|
||||
// mockJobRepo is a test implementation of JobRepository
|
||||
type mockJobRepo struct {
|
||||
mu sync.Mutex
|
||||
Jobs map[string]*domain.Job
|
||||
StatusUpdates map[string]domain.JobStatus
|
||||
CreateErr error
|
||||
UpdateErr error
|
||||
UpdateStatusErr error
|
||||
GetErr error
|
||||
ListErr error
|
||||
ListByStatusErr error
|
||||
DeleteErr error
|
||||
Updated []*domain.Job
|
||||
mu sync.Mutex
|
||||
Jobs map[string]*domain.Job
|
||||
StatusUpdates map[string]domain.JobStatus
|
||||
CreateErr error
|
||||
UpdateErr error
|
||||
UpdateErrorByID map[string]error
|
||||
UpdateErrorByIDMu sync.Mutex
|
||||
UpdateStatusErr error
|
||||
GetErr error
|
||||
ListErr error
|
||||
ListByStatusErr error
|
||||
DeleteErr error
|
||||
ListTimedOutErr error
|
||||
Updated []*domain.Job
|
||||
}
|
||||
|
||||
func (m *mockJobRepo) List(ctx context.Context) ([]*domain.Job, error) {
|
||||
@@ -211,6 +214,13 @@ func (m *mockJobRepo) Update(ctx context.Context, job *domain.Job) error {
|
||||
if m.UpdateErr != nil {
|
||||
return m.UpdateErr
|
||||
}
|
||||
// Check per-ID error injection
|
||||
m.UpdateErrorByIDMu.Lock()
|
||||
idErr, ok := m.UpdateErrorByID[job.ID]
|
||||
m.UpdateErrorByIDMu.Unlock()
|
||||
if ok && idErr != nil {
|
||||
return idErr
|
||||
}
|
||||
m.Jobs[job.ID] = job
|
||||
m.Updated = append(m.Updated, job)
|
||||
return nil
|
||||
@@ -352,6 +362,30 @@ func (m *mockJobRepo) ClaimPendingByAgentID(ctx context.Context, agentID string)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// ListTimedOutAwaitingJobs returns jobs stuck in AwaitingCSR/AwaitingApproval past the
|
||||
// respective cutoffs. I-003 coverage-gap closure.
|
||||
func (m *mockJobRepo) ListTimedOutAwaitingJobs(ctx context.Context, csrCutoff, approvalCutoff time.Time) ([]*domain.Job, error) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if m.ListTimedOutErr != nil {
|
||||
return nil, m.ListTimedOutErr
|
||||
}
|
||||
var jobs []*domain.Job
|
||||
for _, j := range m.Jobs {
|
||||
switch j.Status {
|
||||
case domain.JobStatusAwaitingCSR:
|
||||
if j.CreatedAt.Before(csrCutoff) {
|
||||
jobs = append(jobs, j)
|
||||
}
|
||||
case domain.JobStatusAwaitingApproval:
|
||||
if j.CreatedAt.Before(approvalCutoff) {
|
||||
jobs = append(jobs, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
func (m *mockJobRepo) AddJob(job *domain.Job) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
Reference in New Issue
Block a user