Files
certctl/internal/service/job.go
T
shankar0123 ccd89c348f fix(m2-pr-d): thread ctx through Job/Notification/Audit services
Collapse CancelJobWithContext into CancelJob; eliminate 10 context.Background()
hits across the Job+Notification+Audit service cluster by threading ctx
through their handler-facing service interfaces.

Services (ctx-first):
- service/job.go: ListJobs, GetJob, CancelJob, ApproveJob, RejectJob now
  accept ctx; the CancelJobWithContext wrapper is removed (handler callers
  continue to invoke CancelJob, now ctx-aware).
- service/notification.go: ListNotifications, GetNotification, MarkAsRead
  accept ctx.
- service/audit.go: ListAuditEvents, GetAuditEvent accept ctx.

Handlers (interface + callsites):
- handler/jobs.go, handler/notifications.go, handler/audit.go: local
  service interfaces updated, r.Context() threaded at every callsite.

Tests:
- Mock services updated to match the new interfaces (ctx accepted and
  ignored via '_ context.Context' first parameter; Fn closure fields
  unchanged).
- job_test.go / notification_test.go callsites thread context.Background()
  to match production shape.

Verification:
  go build ./...                 ok
  go vet ./...                   ok
  go test -short ./...           ok
  go test -race -short ./...     ok
  golangci-lint run ./...        0 issues

Locked decisions from the M-2 plan:
  D-1 ctx-only signatures (no dual forms)
  D-4 preserve handler method names facing the router
  D-5 domain types stay ctx-free

Audit complete. Commit: 1f6cf0eafa. Sections: 12. Findings: 2/7/10/4/6.
2026-04-18 01:20:46 +00:00

309 lines
9.5 KiB
Go

package service
import (
"context"
"fmt"
"log/slog"
"github.com/shankar0123/certctl/internal/domain"
"github.com/shankar0123/certctl/internal/repository"
)
// JobService manages job processing and status tracking.
// It coordinates between the scheduler and various job-specific services:
// renewal and issuance jobs are delegated to the RenewalService, deployment
// jobs to the DeploymentService, and all job state is read and written
// through the JobRepository.
type JobService struct {
// jobRepo is the persistence layer used to claim, list, fetch and update jobs.
jobRepo repository.JobRepository
// renewalService processes renewal jobs and, by reuse, issuance jobs.
renewalService *RenewalService
// deploymentService processes deployment jobs handled server-side.
deploymentService *DeploymentService
// logger receives structured diagnostics for every job operation.
logger *slog.Logger
}
// NewJobService builds a JobService from its collaborators.
//
// The arguments are stored as-is; no validation or copying is performed, so
// the caller is responsible for passing usable (non-nil) dependencies.
func NewJobService(
	jobRepo repository.JobRepository,
	renewalService *RenewalService,
	deploymentService *DeploymentService,
	logger *slog.Logger,
) *JobService {
	svc := new(JobService)
	svc.jobRepo = jobRepo
	svc.renewalService = renewalService
	svc.deploymentService = deploymentService
	svc.logger = logger
	return svc
}
// ProcessPendingJobs claims every pending job and dispatches each one to the
// service responsible for its type. A failure of an individual job is logged
// and counted but never aborts the batch; the only error returned is a failure
// to claim jobs in the first place.
//
// Concurrency (H-6 CWE-362): jobs are claimed via ClaimPendingJobs which uses
// SELECT ... FOR UPDATE SKIP LOCKED and flips Pending → Running atomically. Concurrent
// scheduler replicas in HA deployments will therefore never observe the same Pending row,
// and the subsequent UpdateStatus(Running) calls inside the downstream service methods are
// idempotent against the pre-flipped state.
func (s *JobService) ProcessPendingJobs(ctx context.Context) error {
	// H-6 remediation: claim atomically instead of the old lock-free ListByStatus.
	// An empty job type matches every type and a zero limit means "no limit",
	// which keeps the semantics of the previous implementation.
	claimed, err := s.jobRepo.ClaimPendingJobs(ctx, "", 0)
	if err != nil {
		return fmt.Errorf("failed to claim pending jobs: %w", err)
	}

	total := len(claimed)
	if total == 0 {
		s.logger.Debug("no pending jobs to process")
		return nil
	}
	s.logger.Info("processing pending jobs", "count", total)

	succeeded, failed := 0, 0
	for _, job := range claimed {
		// Deployment jobs carrying an agent_id are meant to be picked up by that
		// agent through GetPendingWork(); the server only handles deployment jobs
		// without an agent (legacy/serverless targets).
		// NOTE(review): these jobs were already claimed above — confirm that
		// ClaimPendingJobs does not leave skipped agent-routed jobs stuck in Running.
		agentRouted := job.Type == domain.JobTypeDeployment && job.AgentID != nil && *job.AgentID != ""
		if agentRouted {
			s.logger.Debug("skipping agent-routed deployment job",
				"job_id", job.ID,
				"agent_id", *job.AgentID)
			continue
		}

		procErr := s.processJob(ctx, job)
		if procErr == nil {
			succeeded++
			continue
		}
		s.logger.Error("failed to process job",
			"job_id", job.ID,
			"job_type", job.Type,
			"error", procErr)
		failed++
	}

	// Skipped agent-routed jobs are part of "total" but of neither counter.
	s.logger.Info("job processing completed",
		"processed", succeeded,
		"failed", failed,
		"total", total)
	return nil
}
// processJob dispatches one job to the handler matching its type and returns
// that handler's error. Job types without a handler yield an
// "unknown job type" error.
func (s *JobService) processJob(ctx context.Context, job *domain.Job) error {
	s.logger.Debug("processing job",
		"job_id", job.ID,
		"job_type", job.Type,
		"certificate_id", job.CertificateID)

	var err error
	switch kind := job.Type; kind {
	case domain.JobTypeIssuance:
		err = s.processIssuanceJob(ctx, job)
	case domain.JobTypeValidation:
		err = s.processValidationJob(ctx, job)
	case domain.JobTypeRenewal:
		err = s.renewalService.ProcessRenewalJob(ctx, job)
	case domain.JobTypeDeployment:
		err = s.deploymentService.ProcessDeploymentJob(ctx, job)
	default:
		err = fmt.Errorf("unknown job type: %s", job.Type)
	}
	return err
}
// processIssuanceJob runs a certificate issuance job.
//
// Issuance is delegated to the renewal service's ProcessRenewalJob because the
// mechanics are the same for the Local CA: generate a server-side key and CSR,
// sign via the issuer, store the certificate version, then create deployment
// jobs. Only the meaning differs (new certificate vs. renewed certificate).
func (s *JobService) processIssuanceJob(ctx context.Context, job *domain.Job) error {
	s.logger.Debug("processing issuance job", "job_id", job.ID)

	err := s.renewalService.ProcessRenewalJob(ctx, job)
	return err
}
// processValidationJob is the placeholder handler for certificate validation
// jobs. It logs the attempt and always returns a "not yet implemented" error.
//
// TODO: Implement validation job processing. The intended flow is:
//  1. Fetch the certificate
//  2. For each target, call target connector ValidateDeployment
//  3. Aggregate results
//  4. Update job status based on results
//  5. Send notification if any validation fails
func (s *JobService) processValidationJob(ctx context.Context, job *domain.Job) error {
	s.logger.Debug("processing validation job", "job_id", job.ID)

	notImplemented := fmt.Errorf("validation job processing not yet implemented")
	return notImplemented
}
// RetryFailedJobs moves failed jobs back to Pending so they are picked up again.
//
// A job is eligible only while its own Attempts counter is below its own
// MaxAttempts. A job whose status cannot be reset is logged and skipped; the
// only error returned is a failure to list the failed jobs.
//
// NOTE(review): maxRetries is currently only written to the debug log — the
// retry limit actually enforced is each job's MaxAttempts. Confirm whether
// maxRetries is meant to act as an additional cap.
func (s *JobService) RetryFailedJobs(ctx context.Context, maxRetries int) error {
	s.logger.Debug("retrying failed jobs", "max_retries", maxRetries)

	candidates, err := s.jobRepo.ListByStatus(ctx, domain.JobStatusFailed)
	if err != nil {
		return fmt.Errorf("failed to fetch failed jobs: %w", err)
	}

	requeued := 0
	for _, job := range candidates {
		if job.Attempts < job.MaxAttempts {
			// Attempts remain: put the job back in the pending queue with a cleared message.
			resetErr := s.jobRepo.UpdateStatus(ctx, job.ID, domain.JobStatusPending, "")
			if resetErr != nil {
				s.logger.Error("failed to reset job status for retry",
					"job_id", job.ID,
					"error", resetErr)
				continue
			}
			requeued++
			continue
		}

		// Retry budget exhausted: leave the job in its failed state.
		s.logger.Debug("job exceeded max retries",
			"job_id", job.ID,
			"attempts", job.Attempts,
			"max_attempts", job.MaxAttempts)
	}

	s.logger.Info("failed jobs retry completed",
		"retried", requeued,
		"total_failed", len(candidates))
	return nil
}
// GetJobStatus loads the job identified by jobID from the repository.
// On failure it returns a nil job and the repository error wrapped with
// "failed to fetch job".
func (s *JobService) GetJobStatus(ctx context.Context, jobID string) (*domain.Job, error) {
	found, err := s.jobRepo.Get(ctx, jobID)
	if err == nil {
		return found, nil
	}
	return nil, fmt.Errorf("failed to fetch job: %w", err)
}
// CancelJob cancels a pending or running job (handler interface method).
//
// The job is fetched first; if its status is anything other than Pending or
// Running an error is returned and nothing is written. Otherwise the job is
// moved to Cancelled with the message "cancelled by user".
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
	current, err := s.jobRepo.Get(ctx, jobID)
	if err != nil {
		return fmt.Errorf("failed to fetch job: %w", err)
	}

	switch current.Status {
	case domain.JobStatusPending, domain.JobStatusRunning:
		// These are the only states a job can be cancelled from.
	default:
		return fmt.Errorf("cannot cancel job with status %s", current.Status)
	}

	updateErr := s.jobRepo.UpdateStatus(ctx, jobID, domain.JobStatusCancelled, "cancelled by user")
	if updateErr != nil {
		return fmt.Errorf("failed to cancel job: %w", updateErr)
	}

	s.logger.Info("job cancelled", "job_id", jobID)
	return nil
}
// ListJobs returns one page of jobs with optional filtering (handler interface method).
//
// Parameters:
//   - status, jobType: when non-empty, only jobs whose Status / Type string
//     equals the given value are kept; an empty string disables that filter.
//   - page: 1-based page number; values below 1 are treated as 1.
//   - perPage: page size; values below 1 fall back to 50.
//
// Returns the jobs on the requested page, the total number of jobs matching
// the filters (across all pages), and an error if the repository listing
// fails. A page past the end of the result set yields a nil slice together
// with the correct total.
//
// All jobs are loaded from the repository and filtered in memory.
func (s *JobService) ListJobs(ctx context.Context, status, jobType string, page, perPage int) ([]domain.Job, int64, error) {
	if page < 1 {
		page = 1
	}
	if perPage < 1 {
		perPage = 50
	}

	allJobs, err := s.jobRepo.List(ctx)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to list jobs: %w", err)
	}

	// Filter jobs in memory based on status and jobType; nil entries are dropped.
	var filtered []*domain.Job
	for _, job := range allJobs {
		if job == nil {
			continue
		}
		if status != "" && string(job.Status) != status {
			continue
		}
		if jobType != "" && string(job.Type) != jobType {
			continue
		}
		filtered = append(filtered, job)
	}

	total := len(filtered)

	// Decide whether the page is out of range by comparing page indexes rather
	// than by computing (page-1)*perPage first: for very large caller-supplied
	// page/perPage values that product can overflow int, become negative, and
	// make the slice expression below panic.
	if total == 0 || page-1 > (total-1)/perPage {
		return nil, int64(total), nil
	}

	// Safe now: start is at most total-1, so the multiplication cannot overflow.
	start := (page - 1) * perPage
	end := total
	// Compare against the remaining length instead of computing start+perPage
	// unconditionally, which could overflow for a huge perPage.
	if perPage < total-start {
		end = start + perPage
	}

	// filtered holds no nil entries, so every element can be dereferenced.
	result := make([]domain.Job, 0, end-start)
	for _, job := range filtered[start:end] {
		result = append(result, *job)
	}
	return result, int64(total), nil
}
// GetJob looks up a single job by id (handler interface method).
// The repository's result and error are passed through unchanged.
func (s *JobService) GetJob(ctx context.Context, id string) (*domain.Job, error) {
	job, err := s.jobRepo.Get(ctx, id)
	return job, err
}
// ApproveJob approves a renewal job that is awaiting approval.
//
// The job must currently be in AwaitingApproval; it is then moved to Pending
// so the scheduler picks it up. Any other status, a failed lookup, or a failed
// status update results in an error.
func (s *JobService) ApproveJob(ctx context.Context, id string) error {
	pending, err := s.jobRepo.Get(ctx, id)
	if err != nil {
		return fmt.Errorf("job not found: %w", err)
	}

	if awaiting := pending.Status == domain.JobStatusAwaitingApproval; !awaiting {
		return fmt.Errorf("cannot approve job with status %s (must be AwaitingApproval)", pending.Status)
	}

	updateErr := s.jobRepo.UpdateStatus(ctx, id, domain.JobStatusPending, "")
	if updateErr != nil {
		return fmt.Errorf("failed to approve job: %w", updateErr)
	}

	s.logger.Info("renewal job approved", "job_id", id, "certificate_id", pending.CertificateID)
	return nil
}
// RejectJob rejects a renewal job that is awaiting approval.
//
// The job must currently be in AwaitingApproval; it is then moved to Cancelled.
// The stored message is "rejected: <reason>" when a reason is supplied and
// "rejected by user" otherwise. Any other status, a failed lookup, or a failed
// status update results in an error.
func (s *JobService) RejectJob(ctx context.Context, id string, reason string) error {
	pending, err := s.jobRepo.Get(ctx, id)
	if err != nil {
		return fmt.Errorf("job not found: %w", err)
	}

	if awaiting := pending.Status == domain.JobStatusAwaitingApproval; !awaiting {
		return fmt.Errorf("cannot reject job with status %s (must be AwaitingApproval)", pending.Status)
	}

	var msg string
	switch reason {
	case "":
		msg = "rejected by user"
	default:
		msg = "rejected: " + reason
	}

	updateErr := s.jobRepo.UpdateStatus(ctx, id, domain.JobStatusCancelled, msg)
	if updateErr != nil {
		return fmt.Errorf("failed to reject job: %w", updateErr)
	}

	s.logger.Info("renewal job rejected", "job_id", id, "certificate_id", pending.CertificateID, "reason", reason)
	return nil
}