I-003: job timeout reaper closes AwaitingCSR/AwaitingApproval gap

Add 11th always-on scheduler loop that transitions jobs stuck in
AwaitingCSR (default 24h TTL) or AwaitingApproval (default 168h TTL)
to Failed. I-001's retry loop then auto-promotes eligible Failed jobs
back to Pending. No new status enum, no schema migration.

- JobRepository.ListTimedOutAwaitingJobs with per-status cutoff WHERE
- JobService.ReapTimedOutJobs mirrors RetryFailedJobs structure
- Scheduler jobTimeoutLoop with atomic.Bool idempotency guard, 2m
  per-tick context, WaitGroup shutdown drain
- Config: CERTCTL_JOB_TIMEOUT_INTERVAL (10m), CERTCTL_JOB_AWAITING_CSR_TIMEOUT
  (24h), CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT (168h)
- Audit event per transition: actor=system, actorType=System,
  action=job_timeout, details={old_status, new_status, timeout_reason,
  age_hours}
- 14 new tests: 3 config, 7 service, 4 scheduler
This commit is contained in:
Shankar
2026-04-19 01:04:56 +00:00
parent 0d7d933e91
commit c17ea577e7
12 changed files with 1020 additions and 14 deletions
+38
View File
@@ -715,6 +715,29 @@ type SchedulerConfig struct {
// had no caller prior to this loop being wired).
// Setting: CERTCTL_SCHEDULER_RETRY_INTERVAL environment variable.
RetryInterval time.Duration
// JobTimeoutInterval is how often the reaper loop sweeps AwaitingCSR and
// AwaitingApproval jobs for TTL expiration. Default: 10 minutes. Minimum: 1
// second. Timed-out jobs are transitioned to Failed with a descriptive error
// message; I-001's retry loop then auto-promotes eligible Failed jobs back
// to Pending (closes coverage gap I-003).
// Setting: CERTCTL_JOB_TIMEOUT_INTERVAL environment variable.
JobTimeoutInterval time.Duration
// AwaitingCSRTimeout is the maximum age an AwaitingCSR job can remain in
// that state before the reaper transitions it to Failed. Default: 24 hours.
// An agent that hasn't submitted a CSR within this window is presumed
// unreachable. Minimum: 1 second.
// Setting: CERTCTL_JOB_AWAITING_CSR_TIMEOUT environment variable.
AwaitingCSRTimeout time.Duration
// AwaitingApprovalTimeout is the maximum age an AwaitingApproval job can
// remain in that state before the reaper transitions it to Failed. Default:
// 168 hours (7 days). Reviewers who haven't approved within this window
// force the renewal to fail loudly rather than silently stall. Minimum: 1
// second.
// Setting: CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT environment variable.
AwaitingApprovalTimeout time.Duration
}
// LogConfig contains logging configuration.
@@ -816,6 +839,9 @@ func Load() (*Config, error) {
AgentHealthCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL", 2*time.Minute),
NotificationProcessInterval: getEnvDuration("CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL", 1*time.Minute),
RetryInterval: getEnvDuration("CERTCTL_SCHEDULER_RETRY_INTERVAL", 5*time.Minute),
JobTimeoutInterval: getEnvDuration("CERTCTL_JOB_TIMEOUT_INTERVAL", 10*time.Minute),
AwaitingCSRTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_CSR_TIMEOUT", 24*time.Hour),
AwaitingApprovalTimeout: getEnvDuration("CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT", 168*time.Hour),
},
Log: LogConfig{
Level: getEnv("CERTCTL_LOG_LEVEL", "info"),
@@ -1087,6 +1113,18 @@ func (c *Config) Validate() error {
return fmt.Errorf("retry interval must be at least 1 second")
}
if c.Scheduler.JobTimeoutInterval < 1*time.Second {
return fmt.Errorf("job timeout interval must be at least 1 second")
}
if c.Scheduler.AwaitingCSRTimeout < 1*time.Second {
return fmt.Errorf("awaiting CSR timeout must be at least 1 second")
}
if c.Scheduler.AwaitingApprovalTimeout < 1*time.Second {
return fmt.Errorf("awaiting approval timeout must be at least 1 second")
}
return nil
}
+124
View File
@@ -4,6 +4,7 @@ import (
"log/slog"
"os"
"testing"
"strings"
"time"
)
@@ -329,6 +330,9 @@ func TestValidate_ValidConfig(t *testing.T) {
AgentHealthCheckInterval: 2 * time.Minute,
NotificationProcessInterval: 1 * time.Minute,
RetryInterval: 5 * time.Minute,
JobTimeoutInterval: 10 * time.Minute,
AwaitingCSRTimeout: 24 * time.Hour,
AwaitingApprovalTimeout: 168 * time.Hour,
},
}
if err := cfg.Validate(); err != nil {
@@ -349,6 +353,9 @@ func TestValidate_AuthTypeNone(t *testing.T) {
AgentHealthCheckInterval: 2 * time.Minute,
NotificationProcessInterval: 1 * time.Minute,
RetryInterval: 5 * time.Minute,
JobTimeoutInterval: 10 * time.Minute,
AwaitingCSRTimeout: 24 * time.Hour,
AwaitingApprovalTimeout: 168 * time.Hour,
},
}
if err := cfg.Validate(); err != nil {
@@ -708,3 +715,120 @@ func TestGetEnvBool(t *testing.T) {
})
}
}
// I-003: Job timeout reaper configuration tests
func TestConfig_Scheduler_JobTimeoutDefaults(t *testing.T) {
clearCertctlEnv(t)
setMinimalValidEnv(t)
// Explicitly unset the three I-003 env vars to exercise the default path.
t.Setenv("CERTCTL_JOB_TIMEOUT_INTERVAL", "")
t.Setenv("CERTCTL_JOB_AWAITING_CSR_TIMEOUT", "")
t.Setenv("CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT", "")
cfg, err := Load()
if err != nil {
t.Fatalf("Load() error: %v", err)
}
if cfg.Scheduler.JobTimeoutInterval != 10*time.Minute {
t.Errorf("JobTimeoutInterval = %v, want 10m", cfg.Scheduler.JobTimeoutInterval)
}
if cfg.Scheduler.AwaitingCSRTimeout != 24*time.Hour {
t.Errorf("AwaitingCSRTimeout = %v, want 24h", cfg.Scheduler.AwaitingCSRTimeout)
}
if cfg.Scheduler.AwaitingApprovalTimeout != 168*time.Hour {
t.Errorf("AwaitingApprovalTimeout = %v, want 168h", cfg.Scheduler.AwaitingApprovalTimeout)
}
}
func TestConfig_Scheduler_JobTimeoutEnvOverride(t *testing.T) {
clearCertctlEnv(t)
setMinimalValidEnv(t)
t.Setenv("CERTCTL_JOB_TIMEOUT_INTERVAL", "15m")
t.Setenv("CERTCTL_JOB_AWAITING_CSR_TIMEOUT", "48h")
t.Setenv("CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT", "336h")
cfg, err := Load()
if err != nil {
t.Fatalf("Load() error: %v", err)
}
if cfg.Scheduler.JobTimeoutInterval != 15*time.Minute {
t.Errorf("JobTimeoutInterval = %v, want 15m", cfg.Scheduler.JobTimeoutInterval)
}
if cfg.Scheduler.AwaitingCSRTimeout != 48*time.Hour {
t.Errorf("AwaitingCSRTimeout = %v, want 48h", cfg.Scheduler.AwaitingCSRTimeout)
}
if cfg.Scheduler.AwaitingApprovalTimeout != 336*time.Hour {
t.Errorf("AwaitingApprovalTimeout = %v, want 336h", cfg.Scheduler.AwaitingApprovalTimeout)
}
}
func TestConfig_Scheduler_JobTimeoutValidation(t *testing.T) {
tests := []struct {
name string
field string
value time.Duration
wantErrMsg string
}{
{
"JobTimeoutInterval too small",
"JobTimeoutInterval",
500 * time.Millisecond,
"job timeout interval must be at least 1 second",
},
{
"AwaitingCSRTimeout too small",
"AwaitingCSRTimeout",
500 * time.Millisecond,
"awaiting CSR timeout must be at least 1 second",
},
{
"AwaitingApprovalTimeout too small",
"AwaitingApprovalTimeout",
500 * time.Millisecond,
"awaiting approval timeout must be at least 1 second",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Start from a fully valid config so the I-003 timeout checks
// are the only potential failure point.
cfg := &Config{
Server: ServerConfig{Port: 8080},
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
Log: LogConfig{Level: "info", Format: "json"},
Auth: AuthConfig{Type: "api-key", Secret: "test-secret"},
Keygen: KeygenConfig{Mode: "agent"},
Scheduler: SchedulerConfig{
RenewalCheckInterval: 1 * time.Minute,
JobProcessorInterval: 1 * time.Minute,
AgentHealthCheckInterval: 1 * time.Minute,
NotificationProcessInterval: 1 * time.Minute,
RetryInterval: 1 * time.Minute,
JobTimeoutInterval: 10 * time.Minute,
AwaitingCSRTimeout: 24 * time.Hour,
AwaitingApprovalTimeout: 168 * time.Hour,
},
}
// Override the specific field under test
switch tt.field {
case "JobTimeoutInterval":
cfg.Scheduler.JobTimeoutInterval = tt.value
case "AwaitingCSRTimeout":
cfg.Scheduler.AwaitingCSRTimeout = tt.value
case "AwaitingApprovalTimeout":
cfg.Scheduler.AwaitingApprovalTimeout = tt.value
}
err := cfg.Validate()
if err == nil {
t.Fatalf("Validate() = nil, want error containing %q", tt.wantErrMsg)
}
if !strings.Contains(err.Error(), tt.wantErrMsg) {
t.Errorf("Validate() error = %q, want to contain %q", err.Error(), tt.wantErrMsg)
}
})
}
}