From df23294476effaa32dfc1c94d49d2c70068824ab Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Mon, 4 May 2026 01:01:53 +0000 Subject: [PATCH] service: ApprovalService + ApprovalMetrics + 8 table-driven tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rank 7 of the 2026-05-03 Infisical deep-research deliverable, commit 2 of 4 (cowork/rank-7-approval-workflow-primitive-prompt.md). Builds on the foundation in commit b4d1ad1 — wires the service layer that drives the approval workflow. Still no handler / integration wiring; commits 3-4 land that. Files added: internal/service/approval.go - ApprovalService struct + 6 methods: RequestApproval, Approve, Reject, ListPending, List, Get, ExpireStale. Same-actor RBAC check (ErrApproveBySameActor) at both Approve and Reject; the load-bearing two-person integrity gate. Bypass mode short-circuits via approveInternal(outcome= "bypassed", actorType=System). Audit + metric emission per decision via shared recordAudit helper. Tolerates nil AuditService for tests. Service depends on a narrow JobStatusUpdater interface (single-method) rather than the full repository.JobRepository — production wiring satisfies it implicitly via postgres' existing UpdateStatus. internal/service/approval_metrics.go - ApprovalMetrics: thread-safe counter table (decisions counter dimensioned by outcome × profile_id) + a custom durationHistogram for pending-age (le buckets: 60, 300, 1800, 3600, 21600, 86400, +Inf — 1m, 5m, 30m, 1h, 6h, 24h, beyond). Snapshot* methods return the Prometheus exposer's input shapes. Mirrors the ExpiryAlertMetrics + VaultRenewalMetrics pattern from prior ranks. internal/service/approval_test.go - 8 table-driven tests with tight in-package fakes (fakeApprovalRepo + fakeJobStateRepo): TestApproval_RequestCreatesPendingRow_BypassDisabled TestApproval_BypassMode_AutoApprovesWithSystemBypassActor TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled TestApproval_Approve_RejectsSameActor ↑ THE LOAD-BEARING TWO-PERSON INTEGRITY TEST. PCI-DSS 6.4.5 / NIST 800-53 SA-15 / SOC 2 CC6.1 compliance auditors pattern-match against this. Pins same-actor rejection on both Approve and Reject paths; pins success when a different actor approves. TestApproval_Approve_RejectsAlreadyDecided TestApproval_ExpireStale_TransitionsPendingToExpired_AndCancelsJob TestApproval_MetricCounterIncrements Verified: gofmt: clean. go vet ./internal/service/...: exit 0. go test -short -count=1 -run TestApproval ./internal/service/...: ok 0.005s — all 8 tests green. Out of scope for this commit (lands in commits 3-4): - api/handler/approval.go (5 endpoints + handler-side RBAC). - api/openapi.yaml extensions. - Integration into CertificateService.TriggerRenewal + RenewalService.CheckExpiringCertificates + Scheduler.ReapTimedOutJobs. - cmd/server/main.go wiring of ApprovalService + ApprovalMetrics. - Config.Approval.BypassEnabled + CERTCTL_APPROVAL_BYPASS env var. - docs/connectors.md row + docs/approval-workflow.md runbook. Reference: cowork/rank-7-approval-workflow-primitive-prompt.md. --- internal/service/approval.go | 372 ++++++++++++++++++++++++++ internal/service/approval_metrics.go | 212 +++++++++++++++ internal/service/approval_test.go | 386 +++++++++++++++++++++++++++ 3 files changed, 970 insertions(+) create mode 100644 internal/service/approval.go create mode 100644 internal/service/approval_metrics.go create mode 100644 internal/service/approval_test.go diff --git a/internal/service/approval.go b/internal/service/approval.go new file mode 100644 index 0000000..d12a994 --- /dev/null +++ b/internal/service/approval.go @@ -0,0 +1,372 @@ +package service + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/certctl-io/certctl/internal/domain" + "github.com/certctl-io/certctl/internal/repository" +) + +// ApprovalService manages the issuance approval-workflow primitive. +// Rank 7 of the 2026-05-03 Infisical deep-research deliverable. +// +// Lifecycle: a profile with RequiresApproval=true causes the renewal +// entry points (TriggerRenewal + CheckExpiringCertificates) to call +// RequestApproval; the resulting Job is created at +// JobStatusAwaitingApproval; the scheduler does NOT dispatch until +// Approve transitions the job to Pending. +// +// RBAC contract: the requester cannot approve their own request. +// Approve checks decidedBy != request.RequestedBy and rejects with +// ErrApproveBySameActor otherwise. This is the load-bearing two- +// person integrity check; compliance auditors pattern-match against +// it. +// +// Bypass mode: if CERTCTL_APPROVAL_BYPASS=true at boot, every +// RequestApproval call immediately auto-approves with +// decidedBy="system-bypass". Used by dev / CI to keep renewal- +// scheduler tests fast without standing up an approver. Production +// deploys MUST leave this unset; the bypass emits an audit row with +// ActorType=System so a downstream auditor can grep for +// "system-bypass" approvals and confirm none happened in production. +type ApprovalService struct { + approvalRepo repository.ApprovalRepository + jobRepo JobStatusUpdater + auditService *AuditService + metrics *ApprovalMetrics + + bypassEnabled bool +} + +// JobStatusUpdater is the narrow interface ApprovalService depends on +// from JobRepository. Accepting the small interface (rather than the +// full repository.JobRepository) keeps the test mock surface tiny — +// real JobRepository implementations (postgres + any future) satisfy +// it implicitly because they implement UpdateStatus already. +type JobStatusUpdater interface { + UpdateStatus(ctx context.Context, id string, status domain.JobStatus, errMsg string) error +} + +// NewApprovalService constructs an ApprovalService. metrics may be nil +// for tests that don't need Prometheus integration; auditService should +// not be nil in production but is tolerated for unit tests that don't +// care about audit-row emission. +func NewApprovalService( + approvalRepo repository.ApprovalRepository, + jobRepo JobStatusUpdater, + auditService *AuditService, + metrics *ApprovalMetrics, + bypassEnabled bool, +) *ApprovalService { + return &ApprovalService{ + approvalRepo: approvalRepo, + jobRepo: jobRepo, + auditService: auditService, + metrics: metrics, + bypassEnabled: bypassEnabled, + } +} + +// Sentinels for handler-side dispatch via errors.Is. +var ( + // ErrApprovalNotFound is returned when the request ID does not exist. + // Handlers map to HTTP 404. + ErrApprovalNotFound = errors.New("approval request not found") + + // ErrApprovalAlreadyDecided is returned when Approve / Reject is called + // on a request whose State is already terminal. Handlers map to HTTP 409. + ErrApprovalAlreadyDecided = errors.New("approval request already decided") + + // ErrApproveBySameActor is the load-bearing two-person integrity check. + // Returned when the supplied decidedBy equals request.RequestedBy. + // Handlers map to HTTP 403. + ErrApproveBySameActor = errors.New("approver cannot be the same as requester (two-person integrity)") +) + +// RequestApproval creates a pending ApprovalRequest row and is invoked +// from the renewal entry points after they have created the Job at +// Status=AwaitingApproval. Returns the request ID for handler / +// caller use. +// +// If bypassEnabled is true, this method synchronously calls Approve +// internally with decidedBy=ApprovalActorSystemBypass and returns the +// resulting (now-approved) request ID. The audit row records +// ActorType=System so a downstream auditor can confirm bypass-mode +// was off in production via a single SQL query. +func (s *ApprovalService) RequestApproval( + ctx context.Context, + cert *domain.ManagedCertificate, + jobID, profileID, requestedBy string, + metadata map[string]string, +) (string, error) { + if cert == nil { + return "", fmt.Errorf("approval: nil certificate") + } + if jobID == "" || profileID == "" || requestedBy == "" { + return "", fmt.Errorf("approval: jobID, profileID, requestedBy required") + } + + now := time.Now().UTC() + req := &domain.ApprovalRequest{ + CertificateID: cert.ID, + JobID: jobID, + ProfileID: profileID, + RequestedBy: requestedBy, + State: domain.ApprovalStatePending, + Metadata: metadata, + CreatedAt: now, + UpdatedAt: now, + } + if err := s.approvalRepo.Create(ctx, req); err != nil { + return "", fmt.Errorf("approval: create request: %w", err) + } + + // Audit the request creation. Bypass-mode logs both the request and + // the auto-approval as separate rows so the timeline is honest. + s.recordAudit(ctx, requestedBy, domain.ActorTypeUser, "approval_requested", req, nil) + + if s.bypassEnabled { + if err := s.approveInternal(ctx, req.ID, domain.ApprovalActorSystemBypass, + "auto-approved by CERTCTL_APPROVAL_BYPASS — dev/CI mode", + domain.ApprovalOutcomeBypassed, domain.ActorTypeSystem); err != nil { + return req.ID, fmt.Errorf("approval: bypass auto-approve: %w", err) + } + } + + return req.ID, nil +} + +// Approve transitions a pending request to approved AND the linked Job +// from AwaitingApproval to Pending so the job processor picks it up. +// RBAC: rejects if decidedBy == request.RequestedBy. +func (s *ApprovalService) Approve(ctx context.Context, requestID, decidedBy, note string) error { + req, err := s.approvalRepo.Get(ctx, requestID) + if err != nil { + if errors.Is(err, repository.ErrNotFound) { + return ErrApprovalNotFound + } + return fmt.Errorf("approval: get for approve: %w", err) + } + if req.State.IsTerminal() { + return ErrApprovalAlreadyDecided + } + if decidedBy == req.RequestedBy { + return ErrApproveBySameActor + } + return s.approveInternal(ctx, requestID, decidedBy, note, + domain.ApprovalOutcomeApproved, domain.ActorTypeUser) +} + +// approveInternal is the shared transition path for both human-Approve +// and bypass-mode auto-approve. Same DB transition + audit + metric +// recording, but the outcome label + actorType differ. +func (s *ApprovalService) approveInternal( + ctx context.Context, requestID, decidedBy, note, outcome string, + actorType domain.ActorType, +) error { + now := time.Now().UTC() + + // Re-fetch the request after the state-transition guards in Approve so + // we can stamp the metric's pending-age + transition the job. For the + // bypass path, this is the first read. + req, err := s.approvalRepo.Get(ctx, requestID) + if err != nil { + if errors.Is(err, repository.ErrNotFound) { + return ErrApprovalNotFound + } + return fmt.Errorf("approval: get for transition: %w", err) + } + if req.State.IsTerminal() { + return ErrApprovalAlreadyDecided + } + + if err := s.approvalRepo.UpdateState(ctx, requestID, + domain.ApprovalStateApproved, decidedBy, now, note); err != nil { + if errors.Is(err, repository.ErrNotFound) { + return ErrApprovalNotFound + } + if errors.Is(err, repository.ErrAlreadyExists) { + return ErrApprovalAlreadyDecided + } + return fmt.Errorf("approval: update state to approved: %w", err) + } + + // Transition the linked Job from AwaitingApproval to Pending so the + // scheduler picks it up. Best-effort — if the Job has already been + // cancelled or otherwise mutated externally, log via audit and move on. + if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusPending, ""); err != nil { + s.recordAudit(ctx, decidedBy, actorType, "approval_job_transition_failed", req, + map[string]interface{}{"target_status": string(domain.JobStatusPending), "error": err.Error()}) + return fmt.Errorf("approval: transition job to Pending: %w", err) + } + + s.recordAudit(ctx, decidedBy, actorType, "approval_"+outcome, req, + map[string]interface{}{"note": note, "outcome": outcome}) + if s.metrics != nil { + s.metrics.RecordDecision(outcome, req.ProfileID) + s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds()) + } + return nil +} + +// Reject transitions a pending request to rejected AND the linked Job +// from AwaitingApproval to Cancelled. RBAC: same-actor check applies. +func (s *ApprovalService) Reject(ctx context.Context, requestID, decidedBy, note string) error { + req, err := s.approvalRepo.Get(ctx, requestID) + if err != nil { + if errors.Is(err, repository.ErrNotFound) { + return ErrApprovalNotFound + } + return fmt.Errorf("approval: get for reject: %w", err) + } + if req.State.IsTerminal() { + return ErrApprovalAlreadyDecided + } + if decidedBy == req.RequestedBy { + return ErrApproveBySameActor + } + + now := time.Now().UTC() + if err := s.approvalRepo.UpdateState(ctx, requestID, + domain.ApprovalStateRejected, decidedBy, now, note); err != nil { + if errors.Is(err, repository.ErrNotFound) { + return ErrApprovalNotFound + } + if errors.Is(err, repository.ErrAlreadyExists) { + return ErrApprovalAlreadyDecided + } + return fmt.Errorf("approval: update state to rejected: %w", err) + } + + if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusCancelled, + "approval rejected: "+note); err != nil { + s.recordAudit(ctx, decidedBy, domain.ActorTypeUser, "approval_job_transition_failed", req, + map[string]interface{}{"target_status": string(domain.JobStatusCancelled), "error": err.Error()}) + return fmt.Errorf("approval: transition job to Cancelled: %w", err) + } + + s.recordAudit(ctx, decidedBy, domain.ActorTypeUser, "approval_rejected", req, + map[string]interface{}{"note": note, "outcome": domain.ApprovalOutcomeRejected}) + if s.metrics != nil { + s.metrics.RecordDecision(domain.ApprovalOutcomeRejected, req.ProfileID) + s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds()) + } + return nil +} + +// ListPending returns approval requests in state=pending, paginated. +// Operators reading the dashboard call this on every page load. +func (s *ApprovalService) ListPending(ctx context.Context, page, perPage int) ([]*domain.ApprovalRequest, error) { + return s.approvalRepo.List(ctx, &repository.ApprovalFilter{ + State: string(domain.ApprovalStatePending), + Page: page, + PerPage: perPage, + }) +} + +// List returns approval requests filtered by the supplied filter. Used +// by handler GET /api/v1/approvals with arbitrary state. +func (s *ApprovalService) List(ctx context.Context, filter *repository.ApprovalFilter) ([]*domain.ApprovalRequest, error) { + return s.approvalRepo.List(ctx, filter) +} + +// Get returns a single approval request by ID, or ErrApprovalNotFound. +func (s *ApprovalService) Get(ctx context.Context, id string) (*domain.ApprovalRequest, error) { + req, err := s.approvalRepo.Get(ctx, id) + if err != nil { + if errors.Is(err, repository.ErrNotFound) { + return nil, ErrApprovalNotFound + } + return nil, err + } + return req, nil +} + +// ExpireStale runs from the scheduler's reaper loop. Calls the +// repository's ExpireStale (bulk pending→expired transition) + +// transitions matching jobs from AwaitingApproval to Cancelled. +// Records one audit row per expiry. Returns the count expired. +// +// Operators alert when this is non-zero — it means an approval +// request timed out without human review. +func (s *ApprovalService) ExpireStale(ctx context.Context, before time.Time) (int, error) { + // Find pending requests older than `before` so we can record the + // audit + metric per expiry. ExpireStale on the repo bulk-mutates + // the rows; we read first to capture the per-row metadata for + // auditing, then call the repo's bulk update. + pending, err := s.approvalRepo.List(ctx, &repository.ApprovalFilter{ + State: string(domain.ApprovalStatePending), + PerPage: 500, + }) + if err != nil { + return 0, fmt.Errorf("approval: list pending for expiry: %w", err) + } + + var stale []*domain.ApprovalRequest + for _, req := range pending { + if req.CreatedAt.Before(before) || req.CreatedAt.Equal(before) { + stale = append(stale, req) + } + } + if len(stale) == 0 { + return 0, nil + } + + count, err := s.approvalRepo.ExpireStale(ctx, before) + if err != nil { + return 0, fmt.Errorf("approval: bulk expire: %w", err) + } + + now := time.Now().UTC() + for _, req := range stale { + // Cancel the linked job — best-effort. The scheduler's existing + // ReapTimedOutJobs already handles AwaitingApproval timeouts on + // the job side; this is a defensive double-cancel that's + // idempotent if the scheduler already ran. + if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusCancelled, + "approval expired: timed out without review"); err != nil { + // Log via audit and continue — don't fail the whole sweep on + // one bad job. + s.recordAudit(ctx, "system-reaper", domain.ActorTypeSystem, "approval_job_transition_failed", req, + map[string]interface{}{"target_status": string(domain.JobStatusCancelled), "error": err.Error()}) + } + + s.recordAudit(ctx, "system-reaper", domain.ActorTypeSystem, "approval_expired", req, + map[string]interface{}{"outcome": domain.ApprovalOutcomeExpired, "before_cutoff": before.Format(time.RFC3339)}) + if s.metrics != nil { + s.metrics.RecordDecision(domain.ApprovalOutcomeExpired, req.ProfileID) + s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds()) + } + } + + return count, nil +} + +// recordAudit is the shared audit-emission helper. Tolerates a nil +// AuditService (unit tests that don't wire it) and discards errors — +// audit failures must not block the primary state transition. +func (s *ApprovalService) recordAudit(ctx context.Context, actor string, actorType domain.ActorType, + action string, req *domain.ApprovalRequest, extra map[string]interface{}) { + if s.auditService == nil || req == nil { + return + } + details := map[string]interface{}{ + "approval_id": req.ID, + "certificate_id": req.CertificateID, + "job_id": req.JobID, + "profile_id": req.ProfileID, + "requested_by": req.RequestedBy, + "state": string(req.State), + } + for k, v := range req.Metadata { + details["metadata_"+k] = v + } + for k, v := range extra { + details[k] = v + } + _ = s.auditService.RecordEvent(ctx, actor, actorType, action, + "approval_request", req.ID, details) +} diff --git a/internal/service/approval_metrics.go b/internal/service/approval_metrics.go new file mode 100644 index 0000000..996766f --- /dev/null +++ b/internal/service/approval_metrics.go @@ -0,0 +1,212 @@ +package service + +import ( + "math" + "sort" + "sync" + "sync/atomic" +) + +// ApprovalMetrics is a thread-safe counter table for the issuance +// approval-workflow dispatch path. Rank 7 of the 2026-05-03 Infisical +// deep-research deliverable. Mirrors the ExpiryAlertMetrics + +// VaultRenewalMetrics shape: cmd/server/main.go constructs ONE instance, +// passes it to ApprovalService (recording side) AND metricsHandler +// (exposing side) so the snapshotter is the single source of truth. +// +// Dimensions: +// +// outcome — closed enum from internal/domain/approval.go: +// "approved" — Approve transitioned a pending request. +// "rejected" — Reject transitioned a pending request. +// "expired" — scheduler reaper transitioned a stale +// pending request via ExpireStale. +// "bypassed" — CERTCTL_APPROVAL_BYPASS=true short- +// circuited RequestApproval. Production +// deploys MUST have zero rows of this +// outcome. +// profile_id — CertificateProfile.ID that drove the gate. Bounded +// cardinality (operators have <100 profiles in production). +// +// Cardinality bound: 4 outcomes × N profiles. With N=100, that's 400 +// series — well within Prometheus's per-target series budget for a +// well-bounded label. +// +// Pending-age histogram: ObservePendingAge records the seconds-since- +// creation of a pending approval at the moment of decision. Operators +// alert when the p99 hits hours/days (compliance has a deadline). +// Bucket boundaries: 60, 300, 1800, 3600, 21600, 86400, +Inf — 1 +// minute, 5 minutes, 30 minutes, 1 hour, 6 hours, 24 hours, beyond. +type ApprovalMetrics struct { + mu sync.RWMutex + counters map[approvalKey]*atomic.Uint64 + + pendingAgeHist *approvalDurationHistogram +} + +type approvalKey struct { + Outcome string + ProfileID string +} + +// NewApprovalMetrics returns a zero-value ApprovalMetrics ready for +// concurrent use. The caller MUST register the same instance on both +// the ApprovalService (recording) and the MetricsHandler (exposing) +// sides. +func NewApprovalMetrics() *ApprovalMetrics { + return &ApprovalMetrics{ + counters: make(map[approvalKey]*atomic.Uint64), + pendingAgeHist: newApprovalDurationHistogram(), + } +} + +// RecordDecision bumps the (outcome, profile_id) counter by one. Called +// from ApprovalService.Approve / Reject / ExpireStale and from the +// bypass-mode short-circuit inside RequestApproval. +func (m *ApprovalMetrics) RecordDecision(outcome, profileID string) { + if m == nil { + return + } + key := approvalKey{Outcome: outcome, ProfileID: profileID} + + m.mu.RLock() + c, ok := m.counters[key] + m.mu.RUnlock() + + if !ok { + m.mu.Lock() + c, ok = m.counters[key] + if !ok { + c = &atomic.Uint64{} + m.counters[key] = c + } + m.mu.Unlock() + } + c.Add(1) +} + +// ObservePendingAge records the seconds-since-creation of a pending +// approval at the moment of decision (Approve / Reject / Expire). +func (m *ApprovalMetrics) ObservePendingAge(seconds float64) { + if m == nil { + return + } + m.pendingAgeHist.observe(seconds) +} + +// SnapshotApprovalDecisions returns the current decision counter table +// as a sorted slice for deterministic Prometheus exposition. Sort key +// is (outcome, profile_id). +type ApprovalDecisionEntry struct { + Outcome string + ProfileID string + Count uint64 +} + +func (m *ApprovalMetrics) SnapshotApprovalDecisions() []ApprovalDecisionEntry { + if m == nil { + return nil + } + m.mu.RLock() + out := make([]ApprovalDecisionEntry, 0, len(m.counters)) + for k, c := range m.counters { + out = append(out, ApprovalDecisionEntry{ + Outcome: k.Outcome, + ProfileID: k.ProfileID, + Count: c.Load(), + }) + } + m.mu.RUnlock() + + sort.Slice(out, func(i, j int) bool { + if out[i].Outcome != out[j].Outcome { + return out[i].Outcome < out[j].Outcome + } + return out[i].ProfileID < out[j].ProfileID + }) + return out +} + +// SnapshotApprovalPendingAgeHistogram returns the current bucket counts +// + sum + total count for the pending-age histogram. Format suits the +// Prometheus histogram exposition (le buckets + _sum + _count). +type ApprovalPendingAgeSnapshot struct { + BucketBounds []float64 // [60, 300, 1800, 3600, 21600, 86400] — exclusive of +Inf + BucketCounts []uint64 // cumulative counts per bucket; len = len(BucketBounds) + 1 (last is +Inf) + Sum float64 + Count uint64 +} + +func (m *ApprovalMetrics) SnapshotApprovalPendingAgeHistogram() ApprovalPendingAgeSnapshot { + if m == nil { + return ApprovalPendingAgeSnapshot{} + } + return m.pendingAgeHist.snapshot() +} + +// approvalDurationHistogram is a tiny lock-free histogram with fixed +// bucket boundaries for approval-pending-age. Atomic counters per +// bucket + sum stored as uint64-bits-of-float64 atomic. +type approvalDurationHistogram struct { + bounds []float64 + buckets []*atomic.Uint64 // len = len(bounds) + 1; last is +Inf + sumBits *atomic.Uint64 // float64 bits stored atomically + count *atomic.Uint64 +} + +func newApprovalDurationHistogram() *approvalDurationHistogram { + bounds := []float64{60, 300, 1800, 3600, 21600, 86400} + buckets := make([]*atomic.Uint64, len(bounds)+1) + for i := range buckets { + buckets[i] = &atomic.Uint64{} + } + return &approvalDurationHistogram{ + bounds: bounds, + buckets: buckets, + sumBits: &atomic.Uint64{}, + count: &atomic.Uint64{}, + } +} + +func (h *approvalDurationHistogram) observe(seconds float64) { + if h == nil { + return + } + // Find the first bucket whose bound is >= seconds. + idx := len(h.bounds) // default to +Inf bucket + for i, b := range h.bounds { + if seconds <= b { + idx = i + break + } + } + h.buckets[idx].Add(1) + h.count.Add(1) + // Atomic float64 add via CAS loop. + for { + oldBits := h.sumBits.Load() + old := math.Float64frombits(oldBits) + newBits := math.Float64bits(old + seconds) + if h.sumBits.CompareAndSwap(oldBits, newBits) { + return + } + } +} + +func (h *approvalDurationHistogram) snapshot() ApprovalPendingAgeSnapshot { + if h == nil { + return ApprovalPendingAgeSnapshot{} + } + counts := make([]uint64, len(h.buckets)) + cumulative := uint64(0) + for i, b := range h.buckets { + cumulative += b.Load() + counts[i] = cumulative + } + return ApprovalPendingAgeSnapshot{ + BucketBounds: append([]float64(nil), h.bounds...), + BucketCounts: counts, + Sum: math.Float64frombits(h.sumBits.Load()), + Count: h.count.Load(), + } +} diff --git a/internal/service/approval_test.go b/internal/service/approval_test.go new file mode 100644 index 0000000..2f18159 --- /dev/null +++ b/internal/service/approval_test.go @@ -0,0 +1,386 @@ +package service + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "github.com/certctl-io/certctl/internal/domain" + "github.com/certctl-io/certctl/internal/repository" +) + +// fakeApprovalRepo is a minimal in-memory ApprovalRepository for unit +// testing the service-layer logic in isolation. Stores rows in a map +// keyed by ID; List returns rows matching a single state filter. +type fakeApprovalRepo struct { + mu sync.Mutex + rows map[string]*domain.ApprovalRequest +} + +func newFakeApprovalRepo() *fakeApprovalRepo { + return &fakeApprovalRepo{rows: make(map[string]*domain.ApprovalRequest)} +} + +func (f *fakeApprovalRepo) Create(ctx context.Context, req *domain.ApprovalRequest) error { + f.mu.Lock() + defer f.mu.Unlock() + if req.ID == "" { + req.ID = "ar-fake-" + time.Now().Format("150405.000000000") + } + // Enforce the partial-unique pending-per-job at the mock layer too. + for _, existing := range f.rows { + if existing.JobID == req.JobID && existing.State == domain.ApprovalStatePending { + return repository.ErrAlreadyExists + } + } + cp := *req + f.rows[req.ID] = &cp + return nil +} + +func (f *fakeApprovalRepo) Get(ctx context.Context, id string) (*domain.ApprovalRequest, error) { + f.mu.Lock() + defer f.mu.Unlock() + if r, ok := f.rows[id]; ok { + cp := *r + return &cp, nil + } + return nil, repository.ErrNotFound +} + +func (f *fakeApprovalRepo) GetByJobID(ctx context.Context, jobID string) (*domain.ApprovalRequest, error) { + f.mu.Lock() + defer f.mu.Unlock() + for _, r := range f.rows { + if r.JobID == jobID { + cp := *r + return &cp, nil + } + } + return nil, repository.ErrNotFound +} + +func (f *fakeApprovalRepo) List(ctx context.Context, filter *repository.ApprovalFilter) ([]*domain.ApprovalRequest, error) { + f.mu.Lock() + defer f.mu.Unlock() + var out []*domain.ApprovalRequest + for _, r := range f.rows { + if filter != nil && filter.State != "" && string(r.State) != filter.State { + continue + } + if filter != nil && filter.CertificateID != "" && r.CertificateID != filter.CertificateID { + continue + } + if filter != nil && filter.RequestedBy != "" && r.RequestedBy != filter.RequestedBy { + continue + } + cp := *r + out = append(out, &cp) + } + return out, nil +} + +func (f *fakeApprovalRepo) UpdateState(ctx context.Context, id string, state domain.ApprovalState, + decidedBy string, decidedAt time.Time, note string) error { + f.mu.Lock() + defer f.mu.Unlock() + r, ok := f.rows[id] + if !ok { + return repository.ErrNotFound + } + if r.State != domain.ApprovalStatePending { + return repository.ErrAlreadyExists // signals "already terminal" + } + r.State = state + r.DecidedBy = &decidedBy + r.DecidedAt = &decidedAt + if note != "" { + n := note + r.DecisionNote = &n + } + r.UpdatedAt = decidedAt + return nil +} + +func (f *fakeApprovalRepo) ExpireStale(ctx context.Context, before time.Time) (int, error) { + f.mu.Lock() + defer f.mu.Unlock() + now := time.Now().UTC() + count := 0 + for _, r := range f.rows { + if r.State == domain.ApprovalStatePending && (r.CreatedAt.Before(before) || r.CreatedAt.Equal(before)) { + r.State = domain.ApprovalStateExpired + s := "system-reaper" + r.DecidedBy = &s + r.DecidedAt = &now + r.UpdatedAt = now + count++ + } + } + return count, nil +} + +// fakeJobStateRepo implements service.JobStatusUpdater and tracks per-job +// status mutations so the tests can introspect them. It does NOT implement +// the full repository.JobRepository — ApprovalService only needs UpdateStatus. +type fakeJobStateRepo struct { + mu sync.Mutex + statuses map[string]domain.JobStatus +} + +func newFakeJobStateRepo() *fakeJobStateRepo { + return &fakeJobStateRepo{statuses: make(map[string]domain.JobStatus)} +} + +func (f *fakeJobStateRepo) seed(id string, status domain.JobStatus) { + f.mu.Lock() + defer f.mu.Unlock() + f.statuses[id] = status +} + +func (f *fakeJobStateRepo) status(id string) domain.JobStatus { + f.mu.Lock() + defer f.mu.Unlock() + return f.statuses[id] +} + +func (f *fakeJobStateRepo) UpdateStatus(ctx context.Context, id string, status domain.JobStatus, errMsg string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.statuses[id] = status + return nil +} + +// helper builders -------------------------------------------------------- + +func newApprovalSvcForTest(bypass bool) (*ApprovalService, *fakeApprovalRepo, *fakeJobStateRepo) { + ar := newFakeApprovalRepo() + jr := newFakeJobStateRepo() + metrics := NewApprovalMetrics() + svc := NewApprovalService(ar, jr, nil, metrics, bypass) + return svc, ar, jr +} + +func sampleCert() *domain.ManagedCertificate { + return &domain.ManagedCertificate{ID: "mc-test-cert"} +} + +// tests ------------------------------------------------------------------ + +func TestApproval_RequestCreatesPendingRow_BypassDisabled(t *testing.T) { + svc, ar, jr := newApprovalSvcForTest(false) + jr.seed("job-1", domain.JobStatusAwaitingApproval) + + id, err := svc.RequestApproval(context.Background(), sampleCert(), + "job-1", "profile-prod-cdn", "user-alice", map[string]string{"common_name": "api.example.com"}) + if err != nil { + t.Fatalf("RequestApproval err: %v", err) + } + got, err := ar.Get(context.Background(), id) + if err != nil { + t.Fatalf("Get err: %v", err) + } + if got.State != domain.ApprovalStatePending { + t.Fatalf("expected state=pending, got %s", got.State) + } + if got.RequestedBy != "user-alice" { + t.Fatalf("requested_by mismatch: %s", got.RequestedBy) + } + if jr.status("job-1") != domain.JobStatusAwaitingApproval { + t.Fatalf("job should remain AwaitingApproval; got %s", jr.status("job-1")) + } +} + +func TestApproval_BypassMode_AutoApprovesWithSystemBypassActor(t *testing.T) { + svc, ar, jr := newApprovalSvcForTest(true) + jr.seed("job-2", domain.JobStatusAwaitingApproval) + + id, err := svc.RequestApproval(context.Background(), sampleCert(), + "job-2", "profile-iot", "user-bob", nil) + if err != nil { + t.Fatalf("bypass RequestApproval err: %v", err) + } + got, err := ar.Get(context.Background(), id) + if err != nil { + t.Fatalf("Get err: %v", err) + } + if got.State != domain.ApprovalStateApproved { + t.Fatalf("bypass should auto-approve; got state=%s", got.State) + } + if got.DecidedBy == nil || *got.DecidedBy != domain.ApprovalActorSystemBypass { + t.Fatalf("bypass should stamp decided_by=%s; got %v", + domain.ApprovalActorSystemBypass, got.DecidedBy) + } + if jr.status("job-2") != domain.JobStatusPending { + t.Fatalf("bypass should transition job to Pending; got %s", jr.status("job-2")) + } +} + +func TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending(t *testing.T) { + svc, ar, jr := newApprovalSvcForTest(false) + jr.seed("job-3", domain.JobStatusAwaitingApproval) + id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-3", "p1", "user-alice", nil) + + if err := svc.Approve(context.Background(), id, "user-bob", "approved per ticket SECOPS-123"); err != nil { + t.Fatalf("Approve err: %v", err) + } + got, _ := ar.Get(context.Background(), id) + if got.State != domain.ApprovalStateApproved { + t.Fatalf("expected state=approved; got %s", got.State) + } + if jr.status("job-3") != domain.JobStatusPending { + t.Fatalf("expected job=Pending; got %s", jr.status("job-3")) + } +} + +func TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled(t *testing.T) { + svc, ar, jr := newApprovalSvcForTest(false) + jr.seed("job-4", domain.JobStatusAwaitingApproval) + id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-4", "p1", "user-alice", nil) + + if err := svc.Reject(context.Background(), id, "user-bob", "not on the approved-domains list"); err != nil { + t.Fatalf("Reject err: %v", err) + } + got, _ := ar.Get(context.Background(), id) + if got.State != domain.ApprovalStateRejected { + t.Fatalf("expected state=rejected; got %s", got.State) + } + if jr.status("job-4") != domain.JobStatusCancelled { + t.Fatalf("expected job=Cancelled; got %s", jr.status("job-4")) + } +} + +func TestApproval_Approve_RejectsSameActor(t *testing.T) { + // LOAD-BEARING TWO-PERSON INTEGRITY TEST. PCI-DSS 6.4.5 / NIST 800-53 + // SA-15 / SOC 2 CC6.1 compliance auditors pattern-match against this. + svc, _, jr := newApprovalSvcForTest(false) + jr.seed("job-5", domain.JobStatusAwaitingApproval) + id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-5", "p1", "user-alice", nil) + + err := svc.Approve(context.Background(), id, "user-alice", "trying to self-approve") + if !errors.Is(err, ErrApproveBySameActor) { + t.Fatalf("expected ErrApproveBySameActor; got %v", err) + } + if jr.status("job-5") != domain.JobStatusAwaitingApproval { + t.Fatalf("job should remain AwaitingApproval; got %s", jr.status("job-5")) + } + + // Approval as a different actor succeeds. + if err := svc.Approve(context.Background(), id, "user-bob", "approved by separate actor"); err != nil { + t.Fatalf("Approve as different actor err: %v", err) + } + if jr.status("job-5") != domain.JobStatusPending { + t.Fatalf("expected job=Pending after bob approve; got %s", jr.status("job-5")) + } + + // Same-actor rejection also fails. + jr.seed("job-5b", domain.JobStatusAwaitingApproval) + id2, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-5b", "p1", "user-charlie", nil) + err2 := svc.Reject(context.Background(), id2, "user-charlie", "self-reject") + if !errors.Is(err2, ErrApproveBySameActor) { + t.Fatalf("expected ErrApproveBySameActor on Reject; got %v", err2) + } +} + +func TestApproval_Approve_RejectsAlreadyDecided(t *testing.T) { + svc, _, jr := newApprovalSvcForTest(false) + jr.seed("job-6", domain.JobStatusAwaitingApproval) + id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-6", "p1", "user-alice", nil) + if err := svc.Approve(context.Background(), id, "user-bob", ""); err != nil { + t.Fatalf("first Approve err: %v", err) + } + + err := svc.Approve(context.Background(), id, "user-charlie", "second approve") + if !errors.Is(err, ErrApprovalAlreadyDecided) { + t.Fatalf("expected ErrApprovalAlreadyDecided; got %v", err) + } + err2 := svc.Reject(context.Background(), id, "user-charlie", "late reject") + if !errors.Is(err2, ErrApprovalAlreadyDecided) { + t.Fatalf("expected ErrApprovalAlreadyDecided on Reject; got %v", err2) + } +} + +func TestApproval_ExpireStale_TransitionsPendingToExpired_AndCancelsJob(t *testing.T) { + svc, ar, jr := newApprovalSvcForTest(false) + jr.seed("job-7", domain.JobStatusAwaitingApproval) + jr.seed("job-8", domain.JobStatusAwaitingApproval) + id7, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-7", "p1", "user-alice", nil) + id8, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-8", "p1", "user-alice", nil) + + // Backdate one of the requests to before the cutoff. + old := time.Now().Add(-200 * time.Hour).UTC() + ar.mu.Lock() + ar.rows[id7].CreatedAt = old + ar.mu.Unlock() + + cutoff := time.Now().Add(-168 * time.Hour).UTC() + count, err := svc.ExpireStale(context.Background(), cutoff) + if err != nil { + t.Fatalf("ExpireStale err: %v", err) + } + if count != 1 { + t.Fatalf("expected 1 row expired; got %d", count) + } + got7, _ := ar.Get(context.Background(), id7) + if got7.State != domain.ApprovalStateExpired { + t.Fatalf("expected job-7 expired; got %s", got7.State) + } + got8, _ := ar.Get(context.Background(), id8) + if got8.State != domain.ApprovalStatePending { + t.Fatalf("job-8 should still be pending; got %s", got8.State) + } + if jr.status("job-7") != domain.JobStatusCancelled { + t.Fatalf("expected job-7 cancelled; got %s", jr.status("job-7")) + } + if jr.status("job-8") != domain.JobStatusAwaitingApproval { + t.Fatalf("job-8 should remain AwaitingApproval; got %s", jr.status("job-8")) + } +} + +func TestApproval_MetricCounterIncrements(t *testing.T) { + svc, _, jr := newApprovalSvcForTest(false) + metrics := svc.metrics + + jr.seed("job-9", domain.JobStatusAwaitingApproval) + id9, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-9", "p-cdn", "user-alice", nil) + _ = svc.Approve(context.Background(), id9, "user-bob", "approved") + + jr.seed("job-10", domain.JobStatusAwaitingApproval) + id10, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-10", "p-cdn", "user-alice", nil) + _ = svc.Reject(context.Background(), id10, "user-bob", "rejected") + + jr.seed("job-11", domain.JobStatusAwaitingApproval) + id11, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-11", "p-cdn", "user-alice", nil) + // Backdate + expire. + old := time.Now().Add(-200 * time.Hour).UTC() + repo := svc.approvalRepo.(*fakeApprovalRepo) + repo.mu.Lock() + repo.rows[id11].CreatedAt = old + repo.mu.Unlock() + if _, err := svc.ExpireStale(context.Background(), time.Now().Add(-168*time.Hour)); err != nil { + t.Fatalf("ExpireStale err: %v", err) + } + + snap := metrics.SnapshotApprovalDecisions() + got := map[string]uint64{} + for _, e := range snap { + got[e.Outcome] = e.Count + } + if got[domain.ApprovalOutcomeApproved] != 1 { + t.Fatalf("expected 1 approved counter; got %d", got[domain.ApprovalOutcomeApproved]) + } + if got[domain.ApprovalOutcomeRejected] != 1 { + t.Fatalf("expected 1 rejected counter; got %d", got[domain.ApprovalOutcomeRejected]) + } + if got[domain.ApprovalOutcomeExpired] != 1 { + t.Fatalf("expected 1 expired counter; got %d", got[domain.ApprovalOutcomeExpired]) + } + + // Histogram observed at least 3 samples. + hist := metrics.SnapshotApprovalPendingAgeHistogram() + if hist.Count < 3 { + t.Fatalf("expected at least 3 histogram samples; got %d", hist.Count) + } +}