Files
certctl/internal/service/verification_test.go
T
shankar0123 89b910a8f1 security: atomic pending-job claim with FOR UPDATE SKIP LOCKED (H-6)
Fixes H-6 (CWE-362) — GetPendingJobs returned pending rows without row
locks, so two scheduler replicas in an HA deployment could both read the
same row, both decide it was theirs, and race on UpdateStatus, producing
duplicate Running jobs and duplicate certificate issuances.

Remediation: a claim-style repository API that selects + transitions
Pending -> Running in one transaction with SELECT ... FOR UPDATE SKIP
LOCKED. Concurrent claimants observe disjoint row sets; no worker ever
sees another worker's claimed row.

Repository changes (internal/repository/postgres/job.go):
  - New ClaimPendingJobs(ctx, jobType, limit): BEGIN; SELECT id,...
    FROM jobs WHERE status='Pending' (optional type filter, optional
    LIMIT) FOR UPDATE SKIP LOCKED; UPDATE jobs SET status='Running',
    updated_at=NOW() WHERE id = ANY($ids); COMMIT. Returns the claimed
    rows with status already flipped.
  - New ClaimPendingByAgentID(ctx, agentID): mirrors M31 UNION ALL
    semantics (direct agent_id match, target->agent JOIN fallback,
    certificate->target->agent chain for AwaitingCSR) but wraps each
    branch in FOR UPDATE SKIP LOCKED and flips Deployment/Renewal rows
    to Running. AwaitingCSR rows are returned in place (state
    transition deferred until SubmitCSR, consistent with M8 semantics).
  - Existing GetPendingJobs / ListPendingByAgentID retained for legacy
    compatibility; their godoc now directs production callers to the
    Claim* variants.

Production caller switches:
  - internal/service/job.go ProcessPendingJobs: ListByStatus(Pending)
    -> ClaimPendingJobs(ctx, "", 0). Eliminates the real scheduler
    race between two replicas tick-firing simultaneously.
  - internal/service/agent.go GetPendingWork: ListPendingByAgentID ->
    ClaimPendingByAgentID. Eliminates the race between two pollers
    for the same agent (e.g. brief network blip causing duplicate
    poll) and between a scheduler tick and an agent poll.

Safety argument for pre-flipping Pending -> Running inside the claim
transaction: ProcessRenewalJob and ProcessDeploymentJob both call
UpdateStatus(Running) unconditionally on entry, so an early flip is
idempotent. On panic, the scheduler's panic recovery leaves the job
in Running which the existing stale-running reaper handles.

Tests (internal/repository/postgres/repo_test.go, skipped in -short):
  - TestJobRepository_ClaimPendingJobs_FlipsToRunning: seed 5 Pending,
    claim once, assert all 5 returned + DB rows Running, residual
    claim returns 0.
  - TestJobRepository_ClaimPendingJobs_ConcurrentDisjoint: seed M=40
    Pending Renewals, spawn N=8 goroutines each calling
    ClaimPendingJobs(_, JobTypeRenewal, 1) in a loop. Invariants:
    (a) no job ID claimed by more than one worker, (b) sum of claims
    == 40, (c) all 40 rows in Running state in the DB. Bounded
    empty-streak guard (20 iterations) covers SKIP LOCKED transient
    zeros under contention.
  - TestJobRepository_ClaimPendingByAgentID_TransitionsDeployments:
    seeds 2 Pending Deployment + 1 AwaitingCSR for agent A plus 1
    Pending Renewal for agent B (scope check). Asserts deployments
    flip to Running, AwaitingCSR is returned but preserved, agent B's
    renewal never appears.

Mock updates: testutil_test.go, lifecycle_test.go, verification_test.go
gained ClaimPendingJobs/ClaimPendingByAgentID on their mock job repos
mirroring the real Pending -> Running semantics. Mocks intentionally
do NOT write to StatusUpdates (that map tracks UpdateStatus() call
history specifically; the real claim path uses a bulk UPDATE, not
UpdateStatus).

Verification (CI-scope):
  - go build ./cmd/...: ok
  - go vet ./...: ok
  - go test -race -short on service, api/handler, api/middleware,
    scheduler, connector/..., domain, validation, tlsprobe: ok
  - Coverage gates: service 67.6% (>=55), handler 78.6% (>=60),
    middleware 80.0% (>=30), domain 92.7% (>=40). All hold.
  - golangci-lint 2.11.4: 0 issues
  - govulncheck: no vulnerabilities in call graph
  - Frontend: tsc clean, 218 vitest tests pass, vite build ok
  - helm lint + helm template: ok
  - Invariant sweeps: FOR UPDATE SKIP LOCKED present in job.go;
    H-1 through H-5 fixtures unchanged.

Refs: H-6 in certctl-audit-report.md
2026-04-17 02:34:56 +00:00

281 lines
7.7 KiB
Go

package service
import (
"context"
"errors"
"log/slog"
"testing"
"time"
"github.com/shankar0123/certctl/internal/domain"
)
// mockVerificationJobRepo is a test double for JobRepository used by verification tests.
type mockVerificationJobRepo struct {
jobs map[string]*domain.Job
err error
}
func (m *mockVerificationJobRepo) Get(ctx context.Context, id string) (*domain.Job, error) {
if m.err != nil {
return nil, m.err
}
job, ok := m.jobs[id]
if !ok {
return nil, errors.New("job not found")
}
return job, nil
}
func (m *mockVerificationJobRepo) Create(ctx context.Context, job *domain.Job) error {
m.jobs[job.ID] = job
return nil
}
func (m *mockVerificationJobRepo) Update(ctx context.Context, job *domain.Job) error {
if m.err != nil {
return m.err
}
m.jobs[job.ID] = job
return nil
}
func (m *mockVerificationJobRepo) List(ctx context.Context) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) Delete(ctx context.Context, id string) error {
delete(m.jobs, id)
return nil
}
func (m *mockVerificationJobRepo) ListByStatus(ctx context.Context, status domain.JobStatus) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) ListByCertificate(ctx context.Context, certID string) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) UpdateStatus(ctx context.Context, id string, status domain.JobStatus, errMsg string) error {
return nil
}
func (m *mockVerificationJobRepo) GetPendingJobs(ctx context.Context, jobType domain.JobType) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) ListPendingByAgentID(ctx context.Context, agentID string) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) ClaimPendingJobs(ctx context.Context, jobType domain.JobType, limit int) ([]*domain.Job, error) {
return nil, nil
}
func (m *mockVerificationJobRepo) ClaimPendingByAgentID(ctx context.Context, agentID string) ([]*domain.Job, error) {
return nil, nil
}
// newVerificationTestService creates a VerificationService wired with test doubles.
func newVerificationTestService(jobs map[string]*domain.Job, jobRepoErr error) (*VerificationService, *mockVerificationJobRepo, *mockAuditRepo) {
jobRepo := &mockVerificationJobRepo{jobs: jobs, err: jobRepoErr}
auditRepo := newMockAuditRepository()
auditService := NewAuditService(auditRepo)
svc := NewVerificationService(jobRepo, auditService, slog.Default())
return svc, jobRepo, auditRepo
}
func TestVerificationService_RecordVerificationResult_Success(t *testing.T) {
ctx := context.Background()
jobs := map[string]*domain.Job{
"j-test1": {
ID: "j-test1",
Status: domain.JobStatusCompleted,
},
}
svc, jobRepo, auditRepo := newVerificationTestService(jobs, nil)
result := &domain.VerificationResult{
JobID: "j-test1",
TargetID: "t-nginx1",
ExpectedFingerprint: "abc123",
ActualFingerprint: "abc123",
Verified: true,
VerifiedAt: time.Now().UTC(),
}
err := svc.RecordVerificationResult(ctx, result)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
// Check job was updated
job, _ := jobRepo.Get(ctx, "j-test1")
if job.VerificationStatus != domain.VerificationSuccess {
t.Errorf("expected VerificationSuccess, got %s", job.VerificationStatus)
}
if job.VerifiedAt == nil {
t.Error("expected verified_at to be set")
}
// Check audit event was recorded
if len(auditRepo.Events) == 0 {
t.Error("expected at least 1 audit event")
}
}
func TestVerificationService_RecordVerificationResult_Failed(t *testing.T) {
ctx := context.Background()
jobs := map[string]*domain.Job{
"j-test2": {
ID: "j-test2",
Status: domain.JobStatusCompleted,
},
}
svc, jobRepo, _ := newVerificationTestService(jobs, nil)
result := &domain.VerificationResult{
JobID: "j-test2",
TargetID: "t-apache1",
ExpectedFingerprint: "aaa111",
ActualFingerprint: "bbb222",
Verified: false,
VerifiedAt: time.Now().UTC(),
}
err := svc.RecordVerificationResult(ctx, result)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
job, _ := jobRepo.Get(ctx, "j-test2")
if job.VerificationStatus != domain.VerificationFailed {
t.Errorf("expected VerificationFailed, got %s", job.VerificationStatus)
}
}
func TestVerificationService_RecordVerificationResult_WithError(t *testing.T) {
ctx := context.Background()
jobs := map[string]*domain.Job{
"j-test3": {
ID: "j-test3",
Status: domain.JobStatusCompleted,
},
}
svc, jobRepo, _ := newVerificationTestService(jobs, nil)
result := &domain.VerificationResult{
JobID: "j-test3",
TargetID: "t-haproxy1",
VerifiedAt: time.Now().UTC(),
Error: "connection refused",
}
err := svc.RecordVerificationResult(ctx, result)
if err != nil {
t.Errorf("unexpected error: %v", err)
}
job, _ := jobRepo.Get(ctx, "j-test3")
if job.VerificationStatus != domain.VerificationFailed {
t.Errorf("expected VerificationFailed, got %s", job.VerificationStatus)
}
if job.VerificationError == nil || *job.VerificationError != "connection refused" {
t.Error("expected verification error to be set")
}
}
func TestVerificationService_RecordVerificationResult_JobNotFound(t *testing.T) {
ctx := context.Background()
svc, _, _ := newVerificationTestService(map[string]*domain.Job{}, nil)
result := &domain.VerificationResult{
JobID: "j-nonexistent",
TargetID: "t-nginx1",
VerifiedAt: time.Now().UTC(),
}
err := svc.RecordVerificationResult(ctx, result)
if err == nil {
t.Error("expected error for nonexistent job")
}
}
func TestVerificationService_RecordVerificationResult_MissingJobID(t *testing.T) {
ctx := context.Background()
svc, _, _ := newVerificationTestService(map[string]*domain.Job{}, nil)
result := &domain.VerificationResult{
TargetID: "t-nginx1",
VerifiedAt: time.Now().UTC(),
}
err := svc.RecordVerificationResult(ctx, result)
if err == nil {
t.Error("expected error for missing job ID")
}
}
func TestVerificationService_RecordVerificationResult_NilResult(t *testing.T) {
ctx := context.Background()
svc, _, _ := newVerificationTestService(map[string]*domain.Job{}, nil)
err := svc.RecordVerificationResult(ctx, nil)
if err == nil {
t.Error("expected error for nil result")
}
}
func TestVerificationService_GetVerificationResult_Success(t *testing.T) {
ctx := context.Background()
now := time.Now().UTC()
targetID := "t-nginx1"
fp := "abc123"
jobs := map[string]*domain.Job{
"j-test1": {
ID: "j-test1",
TargetID: &targetID,
VerificationStatus: domain.VerificationSuccess,
VerifiedAt: &now,
VerificationFp: &fp,
},
}
svc, _, _ := newVerificationTestService(jobs, nil)
result, err := svc.GetVerificationResult(ctx, "j-test1")
if err != nil {
t.Errorf("unexpected error: %v", err)
}
if result.JobID != "j-test1" {
t.Errorf("expected job ID j-test1, got %s", result.JobID)
}
if !result.Verified {
t.Error("expected Verified to be true")
}
if result.ActualFingerprint != "abc123" {
t.Errorf("expected fingerprint abc123, got %s", result.ActualFingerprint)
}
}
func TestVerificationService_GetVerificationResult_NotFound(t *testing.T) {
ctx := context.Background()
svc, _, _ := newVerificationTestService(map[string]*domain.Job{}, nil)
_, err := svc.GetVerificationResult(ctx, "j-nonexistent")
if err == nil {
t.Error("expected error for nonexistent job")
}
}
func TestVerificationService_GetVerificationResult_EmptyJobID(t *testing.T) {
ctx := context.Background()
svc, _, _ := newVerificationTestService(map[string]*domain.Job{}, nil)
_, err := svc.GetVerificationResult(ctx, "")
if err == nil {
t.Error("expected error for empty job ID")
}
}