I-003: job timeout reaper closes AwaitingCSR/AwaitingApproval gap

Add 11th always-on scheduler loop that transitions jobs stuck in
AwaitingCSR (default 24h TTL) or AwaitingApproval (default 168h TTL)
to Failed. I-001's retry loop then auto-promotes eligible Failed jobs
back to Pending. No new status enum, no schema migration.

- JobRepository.ListTimedOutAwaitingJobs with per-status cutoff WHERE
- JobService.ReapTimedOutJobs mirrors RetryFailedJobs structure
- Scheduler jobTimeoutLoop with atomic.Bool idempotency guard, 2m
  per-tick context, WaitGroup shutdown drain
- Config: CERTCTL_JOB_TIMEOUT_INTERVAL (10m), CERTCTL_JOB_AWAITING_CSR_TIMEOUT
  (24h), CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT (168h)
- Audit event per transition: actor=system, actorType=System,
  action=job_timeout, details={old_status, new_status, timeout_reason,
  age_hours}
- 14 new tests: 3 config, 7 service, 4 scheduler
This commit is contained in:
Shankar
2026-04-19 01:04:56 +00:00
parent 0d7d933e91
commit c17ea577e7
12 changed files with 1020 additions and 14 deletions
+36
View File
@@ -4,6 +4,7 @@ import (
"context"
"database/sql"
"fmt"
"time"
"github.com/google/uuid"
"github.com/shankar0123/certctl/internal/domain"
@@ -570,6 +571,41 @@ func (r *JobRepository) ClaimPendingByAgentID(ctx context.Context, agentID strin
return append(pendingJobs, csrJobs...), nil
}
// ListTimedOutAwaitingJobs returns jobs stuck in AwaitingCSR or AwaitingApproval past
// their respective cutoff timestamps (created_at < cutoff). The reaper loop transitions
// them to Failed; I-001's retry loop then auto-promotes eligible Failed jobs back to
// Pending. I-003 coverage-gap closure.
func (r *JobRepository) ListTimedOutAwaitingJobs(ctx context.Context, csrCutoff, approvalCutoff time.Time) ([]*domain.Job, error) {
rows, err := r.db.QueryContext(ctx, `
SELECT id, type, certificate_id, target_id, agent_id, status, attempts, max_attempts,
last_error, scheduled_at, started_at, completed_at, created_at
FROM jobs
WHERE (status = $1 AND created_at < $2)
OR (status = $3 AND created_at < $4)
ORDER BY created_at ASC
`, domain.JobStatusAwaitingCSR, csrCutoff, domain.JobStatusAwaitingApproval, approvalCutoff)
if err != nil {
return nil, fmt.Errorf("failed to query timed-out awaiting jobs: %w", err)
}
defer rows.Close()
var jobs []*domain.Job
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
return nil, err
}
jobs = append(jobs, job)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("error iterating timed-out job rows: %w", err)
}
return jobs, nil
}
// scanJob scans a job from a row or rows
func scanJob(scanner interface {
Scan(...interface{}) error