From 15fedbaa0614a9a1417af1117a5ae5790799543f Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sat, 16 May 2026 04:15:51 +0000 Subject: [PATCH] =?UTF-8?q?test(scheduler):=20SCALE-001=20=E2=80=94=20asse?= =?UTF-8?q?rt=20claim=20cap=20via=20non-Pending=20count,=20not=20Running?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sprint 2's TestProcessPendingJobs_RespectsClaimLimit asserted that exactly 3 jobs sat in JobStatusRunning after a 10-row ProcessPendingJobs sweep with SetClaimLimit(3). The CI run failed with 'running-job count = 0; want 3.' Root cause: the mock's ClaimPendingJobs flips Pending → Running on the 3 claimed rows (atomic-claim semantics). processJob then calls renewalService.ProcessRenewalJob, which fails on the mock cert-repo's not-found error and calls failJob, which transitions the row from Running → Failed. By the time the test assertion runs, no row is still in Running. The load-bearing SCALE-001 invariant is 'the cap STOPPED at 3.' Whether the 3 claimed rows ended up Running, Failed, or Completed is irrelevant to the cap — what matters is that 7 rows STAYED in Pending for the next tick. Fix: count non-Pending (= claimed) and still-Pending (= 10 minus claimed) separately. Assert claimed=3 and stillPending=7. The LastClaimLimit=3 assertion (already passing in the failed run) also stays as the seam-propagation pin. This is a test-only fix — the SCALE-001 production behavior landed correctly in 037876f and is proven by the CI log line 'count=3 claim_limit=3'.
--- internal/service/job_test.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/internal/service/job_test.go b/internal/service/job_test.go index beeb581..f5f3785 100644 --- a/internal/service/job_test.go +++ b/internal/service/job_test.go @@ -982,18 +982,27 @@ func TestProcessPendingJobs_RespectsClaimLimit(t *testing.T) { t.Errorf("LastClaimLimit = %d; want 3 (SetClaimLimit must propagate)", jobRepo.LastClaimLimit) } - // Count how many transitioned from Pending → Running via the mock's - // atomic-claim behaviour. + // The mock's ClaimPendingJobs flips claimed rows Pending → Running. + // processJob then runs and (since these rows reference cert IDs the + // mock cert-repo doesn't know about) transitions them to Failed. + // The load-bearing invariant for SCALE-001 is: the claim cap STOPPED + // at 3, so exactly 3 rows left the Pending pool and the remaining 7 + // stayed Pending for the next tick. jobRepo.mu.Lock() defer jobRepo.mu.Unlock() - var running int + var stillPending, claimed int for _, j := range jobRepo.Jobs { - if j.Status == domain.JobStatusRunning { - running++ + if j.Status == domain.JobStatusPending { + stillPending++ + } else { + claimed++ } } - if running != 3 { - t.Errorf("running-job count = %d; want 3 (claim cap should have stopped after 3)", running) + if claimed != 3 { + t.Errorf("claimed (non-Pending) count = %d; want 3 (claim cap should have stopped after 3)", claimed) + } + if stillPending != 7 { + t.Errorf("still-Pending count = %d; want 7 (the cap should have left the rest untouched)", stillPending) } }