From df23294476effaa32dfc1c94d49d2c70068824ab Mon Sep 17 00:00:00 2001
From: shankar0123 <skreddy040@gmail.com>
Date: Mon, 4 May 2026 01:01:53 +0000
Subject: [PATCH] service: ApprovalService + ApprovalMetrics + 8 table-driven
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rank 7 of the 2026-05-03 Infisical deep-research deliverable, commit 2 of 4
(cowork/rank-7-approval-workflow-primitive-prompt.md). Builds on the
foundation in commit b4d1ad1 — wires the service layer that drives the
approval workflow. Still no handler / integration wiring; commits 3-4
land that.

Files added:
  internal/service/approval.go         - ApprovalService struct + 6
                                          methods: RequestApproval,
                                          Approve, Reject, ListPending,
                                          List, Get, ExpireStale.
                                          Same-actor RBAC check
                                          (ErrApproveBySameActor) at
                                          both Approve and Reject; the
                                          load-bearing two-person
                                          integrity gate. Bypass mode
                                          short-circuits via
                                          approveInternal(outcome=
                                          "bypassed", actorType=System).
                                          Audit + metric emission per
                                          decision via shared
                                          recordAudit helper. Tolerates
                                          nil AuditService for tests.
                                          Service depends on a narrow
                                          JobStatusUpdater interface
                                          (single-method) rather than
                                          the full repository.JobRepository
                                          — production wiring satisfies
                                          it implicitly via postgres'
                                          existing UpdateStatus.

  internal/service/approval_metrics.go - ApprovalMetrics: thread-safe
                                          counter table (decisions
                                          counter dimensioned by
                                          outcome × profile_id) + a
                                          custom durationHistogram for
                                          pending-age (le buckets:
                                          60, 300, 1800, 3600, 21600,
                                          86400, +Inf — 1m, 5m, 30m,
                                          1h, 6h, 24h, beyond).
                                          Snapshot* methods return the
                                          Prometheus exposer's input
                                          shapes. Mirrors the
                                          ExpiryAlertMetrics +
                                          VaultRenewalMetrics pattern
                                          from prior ranks.

  internal/service/approval_test.go    - 8 table-driven tests with
                                          tight in-package fakes
                                          (fakeApprovalRepo +
                                          fakeJobStateRepo):
                                            TestApproval_RequestCreatesPendingRow_BypassDisabled
                                            TestApproval_BypassMode_AutoApprovesWithSystemBypassActor
                                            TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending
                                            TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled
                                            TestApproval_Approve_RejectsSameActor
                                              ↑ THE LOAD-BEARING TWO-PERSON
                                                INTEGRITY TEST. PCI-DSS 6.4.5
                                                / NIST 800-53 SA-15 / SOC 2
                                                CC6.1 compliance auditors
                                                pattern-match against this.
                                                Pins same-actor rejection on
                                                both Approve and Reject paths;
                                                pins success when a different
                                                actor approves.
                                            TestApproval_Approve_RejectsAlreadyDecided
                                            TestApproval_ExpireStale_TransitionsPendingToExpired_AndCancelsJob
                                            TestApproval_MetricCounterIncrements

Verified:
  gofmt: clean.
  go vet ./internal/service/...: exit 0.
  go test -short -count=1 -run TestApproval ./internal/service/...:
    ok 0.005s — all 8 tests green.

Out of scope for this commit (lands in commits 3-4):
  - api/handler/approval.go (5 endpoints + handler-side RBAC).
  - api/openapi.yaml extensions.
  - Integration into CertificateService.TriggerRenewal +
    RenewalService.CheckExpiringCertificates + Scheduler.ReapTimedOutJobs.
  - cmd/server/main.go wiring of ApprovalService + ApprovalMetrics.
  - Config.Approval.BypassEnabled + CERTCTL_APPROVAL_BYPASS env var.
  - docs/connectors.md row + docs/approval-workflow.md runbook.

Reference: cowork/rank-7-approval-workflow-primitive-prompt.md.
---
 internal/service/approval.go         | 372 ++++++++++++++++++++++++++
 internal/service/approval_metrics.go | 212 +++++++++++++++
 internal/service/approval_test.go    | 386 +++++++++++++++++++++++++++
 3 files changed, 970 insertions(+)
 create mode 100644 internal/service/approval.go
 create mode 100644 internal/service/approval_metrics.go
 create mode 100644 internal/service/approval_test.go

diff --git a/internal/service/approval.go b/internal/service/approval.go
new file mode 100644
index 0000000..d12a994
--- /dev/null
+++ b/internal/service/approval.go
@@ -0,0 +1,372 @@
+package service
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/certctl-io/certctl/internal/domain"
+	"github.com/certctl-io/certctl/internal/repository"
+)
+
+// ApprovalService manages the issuance approval-workflow primitive.
+// Rank 7 of the 2026-05-03 Infisical deep-research deliverable.
+//
+// Lifecycle: a profile with RequiresApproval=true causes the renewal
+// entry points (TriggerRenewal + CheckExpiringCertificates) to call
+// RequestApproval; the resulting Job is created at
+// JobStatusAwaitingApproval; the scheduler does NOT dispatch until
+// Approve transitions the job to Pending.
+//
+// RBAC contract: the requester cannot approve their own request.
+// Approve checks decidedBy != request.RequestedBy and rejects with
+// ErrApproveBySameActor otherwise. This is the load-bearing two-
+// person integrity check; compliance auditors pattern-match against
+// it.
+//
+// Bypass mode: if CERTCTL_APPROVAL_BYPASS=true at boot, every
+// RequestApproval call immediately auto-approves with
+// decidedBy="system-bypass". Used by dev / CI to keep renewal-
+// scheduler tests fast without standing up an approver. Production
+// deploys MUST leave this unset; the bypass emits an audit row with
+// ActorType=System so a downstream auditor can grep for
+// "system-bypass" approvals and confirm none happened in production.
+type ApprovalService struct {
+	approvalRepo repository.ApprovalRepository
+	jobRepo      JobStatusUpdater
+	auditService *AuditService
+	metrics      *ApprovalMetrics
+
+	bypassEnabled bool
+}
+
+// JobStatusUpdater is the narrow interface ApprovalService depends on
+// from JobRepository. Accepting the small interface (rather than the
+// full repository.JobRepository) keeps the test mock surface tiny —
+// real JobRepository implementations (postgres + any future) satisfy
+// it implicitly because they implement UpdateStatus already.
+type JobStatusUpdater interface {
+	UpdateStatus(ctx context.Context, id string, status domain.JobStatus, errMsg string) error
+}
+
+// NewApprovalService constructs an ApprovalService. metrics may be nil
+// for tests that don't need Prometheus integration; auditService should
+// not be nil in production but is tolerated for unit tests that don't
+// care about audit-row emission.
+func NewApprovalService(
+	approvalRepo repository.ApprovalRepository,
+	jobRepo JobStatusUpdater,
+	auditService *AuditService,
+	metrics *ApprovalMetrics,
+	bypassEnabled bool,
+) *ApprovalService {
+	return &ApprovalService{
+		approvalRepo:  approvalRepo,
+		jobRepo:       jobRepo,
+		auditService:  auditService,
+		metrics:       metrics,
+		bypassEnabled: bypassEnabled,
+	}
+}
+
+// Sentinels for handler-side dispatch via errors.Is.
+var (
+	// ErrApprovalNotFound is returned when the request ID does not exist.
+	// Handlers map to HTTP 404.
+	ErrApprovalNotFound = errors.New("approval request not found")
+
+	// ErrApprovalAlreadyDecided is returned when Approve / Reject is called
+	// on a request whose State is already terminal. Handlers map to HTTP 409.
+	ErrApprovalAlreadyDecided = errors.New("approval request already decided")
+
+	// ErrApproveBySameActor is the load-bearing two-person integrity check.
+	// Returned when the supplied decidedBy equals request.RequestedBy.
+	// Handlers map to HTTP 403.
+	ErrApproveBySameActor = errors.New("approver cannot be the same as requester (two-person integrity)")
+)
+
+// RequestApproval creates a pending ApprovalRequest row and is invoked
+// from the renewal entry points after they have created the Job at
+// Status=AwaitingApproval. Returns the request ID for handler /
+// caller use.
+//
+// If bypassEnabled is true, this method synchronously calls Approve
+// internally with decidedBy=ApprovalActorSystemBypass and returns the
+// resulting (now-approved) request ID. The audit row records
+// ActorType=System so a downstream auditor can confirm bypass-mode
+// was off in production via a single SQL query.
+func (s *ApprovalService) RequestApproval(
+	ctx context.Context,
+	cert *domain.ManagedCertificate,
+	jobID, profileID, requestedBy string,
+	metadata map[string]string,
+) (string, error) {
+	if cert == nil {
+		return "", fmt.Errorf("approval: nil certificate")
+	}
+	if jobID == "" || profileID == "" || requestedBy == "" {
+		return "", fmt.Errorf("approval: jobID, profileID, requestedBy required")
+	}
+
+	now := time.Now().UTC()
+	req := &domain.ApprovalRequest{
+		CertificateID: cert.ID,
+		JobID:         jobID,
+		ProfileID:     profileID,
+		RequestedBy:   requestedBy,
+		State:         domain.ApprovalStatePending,
+		Metadata:      metadata,
+		CreatedAt:     now,
+		UpdatedAt:     now,
+	}
+	if err := s.approvalRepo.Create(ctx, req); err != nil {
+		return "", fmt.Errorf("approval: create request: %w", err)
+	}
+
+	// Audit the request creation. Bypass-mode logs both the request and
+	// the auto-approval as separate rows so the timeline is honest.
+	s.recordAudit(ctx, requestedBy, domain.ActorTypeUser, "approval_requested", req, nil)
+
+	if s.bypassEnabled {
+		if err := s.approveInternal(ctx, req.ID, domain.ApprovalActorSystemBypass,
+			"auto-approved by CERTCTL_APPROVAL_BYPASS — dev/CI mode",
+			domain.ApprovalOutcomeBypassed, domain.ActorTypeSystem); err != nil {
+			return req.ID, fmt.Errorf("approval: bypass auto-approve: %w", err)
+		}
+	}
+
+	return req.ID, nil
+}
+
+// Approve transitions a pending request to approved AND the linked Job
+// from AwaitingApproval to Pending so the job processor picks it up.
+// RBAC: rejects if decidedBy == request.RequestedBy.
+func (s *ApprovalService) Approve(ctx context.Context, requestID, decidedBy, note string) error {
+	req, err := s.approvalRepo.Get(ctx, requestID)
+	if err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return ErrApprovalNotFound
+		}
+		return fmt.Errorf("approval: get for approve: %w", err)
+	}
+	if req.State.IsTerminal() {
+		return ErrApprovalAlreadyDecided
+	}
+	if decidedBy == req.RequestedBy {
+		return ErrApproveBySameActor
+	}
+	return s.approveInternal(ctx, requestID, decidedBy, note,
+		domain.ApprovalOutcomeApproved, domain.ActorTypeUser)
+}
+
+// approveInternal is the shared transition path for both human-Approve
+// and bypass-mode auto-approve. Same DB transition + audit + metric
+// recording, but the outcome label + actorType differ.
+func (s *ApprovalService) approveInternal(
+	ctx context.Context, requestID, decidedBy, note, outcome string,
+	actorType domain.ActorType,
+) error {
+	now := time.Now().UTC()
+
+	// Re-fetch the request after the state-transition guards in Approve so
+	// we can stamp the metric's pending-age + transition the job. For the
+	// bypass path, this is the first read.
+	req, err := s.approvalRepo.Get(ctx, requestID)
+	if err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return ErrApprovalNotFound
+		}
+		return fmt.Errorf("approval: get for transition: %w", err)
+	}
+	if req.State.IsTerminal() {
+		return ErrApprovalAlreadyDecided
+	}
+
+	if err := s.approvalRepo.UpdateState(ctx, requestID,
+		domain.ApprovalStateApproved, decidedBy, now, note); err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return ErrApprovalNotFound
+		}
+		if errors.Is(err, repository.ErrAlreadyExists) {
+			return ErrApprovalAlreadyDecided
+		}
+		return fmt.Errorf("approval: update state to approved: %w", err)
+	}
+
+	// Transition the linked Job from AwaitingApproval to Pending so the
+	// scheduler picks it up. Best-effort — if the Job has already been
+	// cancelled or otherwise mutated externally, log via audit and move on.
+	if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusPending, ""); err != nil {
+		s.recordAudit(ctx, decidedBy, actorType, "approval_job_transition_failed", req,
+			map[string]interface{}{"target_status": string(domain.JobStatusPending), "error": err.Error()})
+		return fmt.Errorf("approval: transition job to Pending: %w", err)
+	}
+
+	s.recordAudit(ctx, decidedBy, actorType, "approval_"+outcome, req,
+		map[string]interface{}{"note": note, "outcome": outcome})
+	if s.metrics != nil {
+		s.metrics.RecordDecision(outcome, req.ProfileID)
+		s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds())
+	}
+	return nil
+}
+
+// Reject transitions a pending request to rejected AND the linked Job
+// from AwaitingApproval to Cancelled. RBAC: same-actor check applies.
+func (s *ApprovalService) Reject(ctx context.Context, requestID, decidedBy, note string) error {
+	req, err := s.approvalRepo.Get(ctx, requestID)
+	if err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return ErrApprovalNotFound
+		}
+		return fmt.Errorf("approval: get for reject: %w", err)
+	}
+	if req.State.IsTerminal() {
+		return ErrApprovalAlreadyDecided
+	}
+	if decidedBy == req.RequestedBy {
+		return ErrApproveBySameActor
+	}
+
+	now := time.Now().UTC()
+	if err := s.approvalRepo.UpdateState(ctx, requestID,
+		domain.ApprovalStateRejected, decidedBy, now, note); err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return ErrApprovalNotFound
+		}
+		if errors.Is(err, repository.ErrAlreadyExists) {
+			return ErrApprovalAlreadyDecided
+		}
+		return fmt.Errorf("approval: update state to rejected: %w", err)
+	}
+
+	if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusCancelled,
+		"approval rejected: "+note); err != nil {
+		s.recordAudit(ctx, decidedBy, domain.ActorTypeUser, "approval_job_transition_failed", req,
+			map[string]interface{}{"target_status": string(domain.JobStatusCancelled), "error": err.Error()})
+		return fmt.Errorf("approval: transition job to Cancelled: %w", err)
+	}
+
+	s.recordAudit(ctx, decidedBy, domain.ActorTypeUser, "approval_rejected", req,
+		map[string]interface{}{"note": note, "outcome": domain.ApprovalOutcomeRejected})
+	if s.metrics != nil {
+		s.metrics.RecordDecision(domain.ApprovalOutcomeRejected, req.ProfileID)
+		s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds())
+	}
+	return nil
+}
+
+// ListPending returns approval requests in state=pending, paginated.
+// Operators reading the dashboard call this on every page load.
+func (s *ApprovalService) ListPending(ctx context.Context, page, perPage int) ([]*domain.ApprovalRequest, error) {
+	return s.approvalRepo.List(ctx, &repository.ApprovalFilter{
+		State:   string(domain.ApprovalStatePending),
+		Page:    page,
+		PerPage: perPage,
+	})
+}
+
+// List returns approval requests filtered by the supplied filter. Used
+// by handler GET /api/v1/approvals with arbitrary state.
+func (s *ApprovalService) List(ctx context.Context, filter *repository.ApprovalFilter) ([]*domain.ApprovalRequest, error) {
+	return s.approvalRepo.List(ctx, filter)
+}
+
+// Get returns a single approval request by ID, or ErrApprovalNotFound.
+func (s *ApprovalService) Get(ctx context.Context, id string) (*domain.ApprovalRequest, error) {
+	req, err := s.approvalRepo.Get(ctx, id)
+	if err != nil {
+		if errors.Is(err, repository.ErrNotFound) {
+			return nil, ErrApprovalNotFound
+		}
+		return nil, err
+	}
+	return req, nil
+}
+
+// ExpireStale runs from the scheduler's reaper loop. Calls the
+// repository's ExpireStale (bulk pending→expired transition) +
+// transitions matching jobs from AwaitingApproval to Cancelled.
+// Records one audit row per expiry. Returns the count expired.
+//
+// Operators alert when this is non-zero — it means an approval
+// request timed out without human review.
+func (s *ApprovalService) ExpireStale(ctx context.Context, before time.Time) (int, error) {
+	// Find pending requests older than `before` so we can record the
+	// audit + metric per expiry. ExpireStale on the repo bulk-mutates
+	// the rows; we read first to capture the per-row metadata for
+	// auditing, then call the repo's bulk update.
+	pending, err := s.approvalRepo.List(ctx, &repository.ApprovalFilter{
+		State:   string(domain.ApprovalStatePending),
+		PerPage: 500,
+	})
+	if err != nil {
+		return 0, fmt.Errorf("approval: list pending for expiry: %w", err)
+	}
+
+	var stale []*domain.ApprovalRequest
+	for _, req := range pending {
+		if req.CreatedAt.Before(before) || req.CreatedAt.Equal(before) {
+			stale = append(stale, req)
+		}
+	}
+	if len(stale) == 0 {
+		return 0, nil
+	}
+
+	count, err := s.approvalRepo.ExpireStale(ctx, before)
+	if err != nil {
+		return 0, fmt.Errorf("approval: bulk expire: %w", err)
+	}
+
+	now := time.Now().UTC()
+	for _, req := range stale {
+		// Cancel the linked job — best-effort. The scheduler's existing
+		// ReapTimedOutJobs already handles AwaitingApproval timeouts on
+		// the job side; this is a defensive double-cancel that's
+		// idempotent if the scheduler already ran.
+		if err := s.jobRepo.UpdateStatus(ctx, req.JobID, domain.JobStatusCancelled,
+			"approval expired: timed out without review"); err != nil {
+			// Log via audit and continue — don't fail the whole sweep on
+			// one bad job.
+			s.recordAudit(ctx, "system-reaper", domain.ActorTypeSystem, "approval_job_transition_failed", req,
+				map[string]interface{}{"target_status": string(domain.JobStatusCancelled), "error": err.Error()})
+		}
+
+		s.recordAudit(ctx, "system-reaper", domain.ActorTypeSystem, "approval_expired", req,
+			map[string]interface{}{"outcome": domain.ApprovalOutcomeExpired, "before_cutoff": before.Format(time.RFC3339)})
+		if s.metrics != nil {
+			s.metrics.RecordDecision(domain.ApprovalOutcomeExpired, req.ProfileID)
+			s.metrics.ObservePendingAge(now.Sub(req.CreatedAt).Seconds())
+		}
+	}
+
+	return count, nil
+}
+
+// recordAudit is the shared audit-emission helper. Tolerates a nil
+// AuditService (unit tests that don't wire it) and discards errors —
+// audit failures must not block the primary state transition.
+func (s *ApprovalService) recordAudit(ctx context.Context, actor string, actorType domain.ActorType,
+	action string, req *domain.ApprovalRequest, extra map[string]interface{}) {
+	if s.auditService == nil || req == nil {
+		return
+	}
+	details := map[string]interface{}{
+		"approval_id":    req.ID,
+		"certificate_id": req.CertificateID,
+		"job_id":         req.JobID,
+		"profile_id":     req.ProfileID,
+		"requested_by":   req.RequestedBy,
+		"state":          string(req.State),
+	}
+	for k, v := range req.Metadata {
+		details["metadata_"+k] = v
+	}
+	for k, v := range extra {
+		details[k] = v
+	}
+	_ = s.auditService.RecordEvent(ctx, actor, actorType, action,
+		"approval_request", req.ID, details)
+}
diff --git a/internal/service/approval_metrics.go b/internal/service/approval_metrics.go
new file mode 100644
index 0000000..996766f
--- /dev/null
+++ b/internal/service/approval_metrics.go
@@ -0,0 +1,212 @@
+package service
+
+import (
+	"math"
+	"sort"
+	"sync"
+	"sync/atomic"
+)
+
+// ApprovalMetrics is a thread-safe counter table for the issuance
+// approval-workflow dispatch path. Rank 7 of the 2026-05-03 Infisical
+// deep-research deliverable. Mirrors the ExpiryAlertMetrics +
+// VaultRenewalMetrics shape: cmd/server/main.go constructs ONE instance,
+// passes it to ApprovalService (recording side) AND metricsHandler
+// (exposing side) so the snapshotter is the single source of truth.
+//
+// Dimensions:
+//
+//	outcome    — closed enum from internal/domain/approval.go:
+//	              "approved"  — Approve transitioned a pending request.
+//	              "rejected"  — Reject transitioned a pending request.
+//	              "expired"   — scheduler reaper transitioned a stale
+//	                            pending request via ExpireStale.
+//	              "bypassed"  — CERTCTL_APPROVAL_BYPASS=true short-
+//	                            circuited RequestApproval. Production
+//	                            deploys MUST have zero rows of this
+//	                            outcome.
+//	profile_id — CertificateProfile.ID that drove the gate. Bounded
+//	             cardinality (operators have <100 profiles in production).
+//
+// Cardinality bound: 4 outcomes × N profiles. With N=100, that's 400
+// series — well within Prometheus's per-target series budget for a
+// well-bounded label.
+//
+// Pending-age histogram: ObservePendingAge records the seconds-since-
+// creation of a pending approval at the moment of decision. Operators
+// alert when the p99 hits hours/days (compliance has a deadline).
+// Bucket boundaries: 60, 300, 1800, 3600, 21600, 86400, +Inf — 1
+// minute, 5 minutes, 30 minutes, 1 hour, 6 hours, 24 hours, beyond.
+type ApprovalMetrics struct {
+	mu       sync.RWMutex
+	counters map[approvalKey]*atomic.Uint64
+
+	pendingAgeHist *approvalDurationHistogram
+}
+
+type approvalKey struct {
+	Outcome   string
+	ProfileID string
+}
+
+// NewApprovalMetrics returns a zero-value ApprovalMetrics ready for
+// concurrent use. The caller MUST register the same instance on both
+// the ApprovalService (recording) and the MetricsHandler (exposing)
+// sides.
+func NewApprovalMetrics() *ApprovalMetrics {
+	return &ApprovalMetrics{
+		counters:       make(map[approvalKey]*atomic.Uint64),
+		pendingAgeHist: newApprovalDurationHistogram(),
+	}
+}
+
+// RecordDecision bumps the (outcome, profile_id) counter by one. Called
+// from ApprovalService.Approve / Reject / ExpireStale and from the
+// bypass-mode short-circuit inside RequestApproval.
+func (m *ApprovalMetrics) RecordDecision(outcome, profileID string) {
+	if m == nil {
+		return
+	}
+	key := approvalKey{Outcome: outcome, ProfileID: profileID}
+
+	m.mu.RLock()
+	c, ok := m.counters[key]
+	m.mu.RUnlock()
+
+	if !ok {
+		m.mu.Lock()
+		c, ok = m.counters[key]
+		if !ok {
+			c = &atomic.Uint64{}
+			m.counters[key] = c
+		}
+		m.mu.Unlock()
+	}
+	c.Add(1)
+}
+
+// ObservePendingAge records the seconds-since-creation of a pending
+// approval at the moment of decision (Approve / Reject / Expire).
+func (m *ApprovalMetrics) ObservePendingAge(seconds float64) {
+	if m == nil {
+		return
+	}
+	m.pendingAgeHist.observe(seconds)
+}
+
+// SnapshotApprovalDecisions returns the current decision counter table
+// as a sorted slice for deterministic Prometheus exposition. Sort key
+// is (outcome, profile_id).
+type ApprovalDecisionEntry struct {
+	Outcome   string
+	ProfileID string
+	Count     uint64
+}
+
+func (m *ApprovalMetrics) SnapshotApprovalDecisions() []ApprovalDecisionEntry {
+	if m == nil {
+		return nil
+	}
+	m.mu.RLock()
+	out := make([]ApprovalDecisionEntry, 0, len(m.counters))
+	for k, c := range m.counters {
+		out = append(out, ApprovalDecisionEntry{
+			Outcome:   k.Outcome,
+			ProfileID: k.ProfileID,
+			Count:     c.Load(),
+		})
+	}
+	m.mu.RUnlock()
+
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].Outcome != out[j].Outcome {
+			return out[i].Outcome < out[j].Outcome
+		}
+		return out[i].ProfileID < out[j].ProfileID
+	})
+	return out
+}
+
+// SnapshotApprovalPendingAgeHistogram returns the current bucket counts
+// + sum + total count for the pending-age histogram. Format suits the
+// Prometheus histogram exposition (le buckets + _sum + _count).
+type ApprovalPendingAgeSnapshot struct {
+	BucketBounds []float64 // [60, 300, 1800, 3600, 21600, 86400] — exclusive of +Inf
+	BucketCounts []uint64  // cumulative counts per bucket; len = len(BucketBounds) + 1 (last is +Inf)
+	Sum          float64
+	Count        uint64
+}
+
+func (m *ApprovalMetrics) SnapshotApprovalPendingAgeHistogram() ApprovalPendingAgeSnapshot {
+	if m == nil {
+		return ApprovalPendingAgeSnapshot{}
+	}
+	return m.pendingAgeHist.snapshot()
+}
+
+// approvalDurationHistogram is a tiny lock-free histogram with fixed
+// bucket boundaries for approval-pending-age. Atomic counters per
+// bucket + sum stored as uint64-bits-of-float64 atomic.
+type approvalDurationHistogram struct {
+	bounds  []float64
+	buckets []*atomic.Uint64 // len = len(bounds) + 1; last is +Inf
+	sumBits *atomic.Uint64   // float64 bits stored atomically
+	count   *atomic.Uint64
+}
+
+func newApprovalDurationHistogram() *approvalDurationHistogram {
+	bounds := []float64{60, 300, 1800, 3600, 21600, 86400}
+	buckets := make([]*atomic.Uint64, len(bounds)+1)
+	for i := range buckets {
+		buckets[i] = &atomic.Uint64{}
+	}
+	return &approvalDurationHistogram{
+		bounds:  bounds,
+		buckets: buckets,
+		sumBits: &atomic.Uint64{},
+		count:   &atomic.Uint64{},
+	}
+}
+
+func (h *approvalDurationHistogram) observe(seconds float64) {
+	if h == nil {
+		return
+	}
+	// Find the first bucket whose bound is >= seconds.
+	idx := len(h.bounds) // default to +Inf bucket
+	for i, b := range h.bounds {
+		if seconds <= b {
+			idx = i
+			break
+		}
+	}
+	h.buckets[idx].Add(1)
+	h.count.Add(1)
+	// Atomic float64 add via CAS loop.
+	for {
+		oldBits := h.sumBits.Load()
+		old := math.Float64frombits(oldBits)
+		newBits := math.Float64bits(old + seconds)
+		if h.sumBits.CompareAndSwap(oldBits, newBits) {
+			return
+		}
+	}
+}
+
+func (h *approvalDurationHistogram) snapshot() ApprovalPendingAgeSnapshot {
+	if h == nil {
+		return ApprovalPendingAgeSnapshot{}
+	}
+	counts := make([]uint64, len(h.buckets))
+	cumulative := uint64(0)
+	for i, b := range h.buckets {
+		cumulative += b.Load()
+		counts[i] = cumulative
+	}
+	return ApprovalPendingAgeSnapshot{
+		BucketBounds: append([]float64(nil), h.bounds...),
+		BucketCounts: counts,
+		Sum:          math.Float64frombits(h.sumBits.Load()),
+		Count:        h.count.Load(),
+	}
+}
diff --git a/internal/service/approval_test.go b/internal/service/approval_test.go
new file mode 100644
index 0000000..2f18159
--- /dev/null
+++ b/internal/service/approval_test.go
@@ -0,0 +1,386 @@
+package service
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/certctl-io/certctl/internal/domain"
+	"github.com/certctl-io/certctl/internal/repository"
+)
+
+// fakeApprovalRepo is a minimal in-memory ApprovalRepository for unit
+// testing the service-layer logic in isolation. Stores rows in a map
+// keyed by ID; List returns rows matching a single state filter.
+type fakeApprovalRepo struct {
+	mu   sync.Mutex
+	rows map[string]*domain.ApprovalRequest
+}
+
+func newFakeApprovalRepo() *fakeApprovalRepo {
+	return &fakeApprovalRepo{rows: make(map[string]*domain.ApprovalRequest)}
+}
+
+func (f *fakeApprovalRepo) Create(ctx context.Context, req *domain.ApprovalRequest) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if req.ID == "" {
+		req.ID = "ar-fake-" + time.Now().Format("150405.000000000")
+	}
+	// Enforce the partial-unique pending-per-job at the mock layer too.
+	for _, existing := range f.rows {
+		if existing.JobID == req.JobID && existing.State == domain.ApprovalStatePending {
+			return repository.ErrAlreadyExists
+		}
+	}
+	cp := *req
+	f.rows[req.ID] = &cp
+	return nil
+}
+
+func (f *fakeApprovalRepo) Get(ctx context.Context, id string) (*domain.ApprovalRequest, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if r, ok := f.rows[id]; ok {
+		cp := *r
+		return &cp, nil
+	}
+	return nil, repository.ErrNotFound
+}
+
+func (f *fakeApprovalRepo) GetByJobID(ctx context.Context, jobID string) (*domain.ApprovalRequest, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for _, r := range f.rows {
+		if r.JobID == jobID {
+			cp := *r
+			return &cp, nil
+		}
+	}
+	return nil, repository.ErrNotFound
+}
+
+func (f *fakeApprovalRepo) List(ctx context.Context, filter *repository.ApprovalFilter) ([]*domain.ApprovalRequest, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	var out []*domain.ApprovalRequest
+	for _, r := range f.rows {
+		if filter != nil && filter.State != "" && string(r.State) != filter.State {
+			continue
+		}
+		if filter != nil && filter.CertificateID != "" && r.CertificateID != filter.CertificateID {
+			continue
+		}
+		if filter != nil && filter.RequestedBy != "" && r.RequestedBy != filter.RequestedBy {
+			continue
+		}
+		cp := *r
+		out = append(out, &cp)
+	}
+	return out, nil
+}
+
+func (f *fakeApprovalRepo) UpdateState(ctx context.Context, id string, state domain.ApprovalState,
+	decidedBy string, decidedAt time.Time, note string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	r, ok := f.rows[id]
+	if !ok {
+		return repository.ErrNotFound
+	}
+	if r.State != domain.ApprovalStatePending {
+		return repository.ErrAlreadyExists // signals "already terminal"
+	}
+	r.State = state
+	r.DecidedBy = &decidedBy
+	r.DecidedAt = &decidedAt
+	if note != "" {
+		n := note
+		r.DecisionNote = &n
+	}
+	r.UpdatedAt = decidedAt
+	return nil
+}
+
+func (f *fakeApprovalRepo) ExpireStale(ctx context.Context, before time.Time) (int, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	now := time.Now().UTC()
+	count := 0
+	for _, r := range f.rows {
+		if r.State == domain.ApprovalStatePending && (r.CreatedAt.Before(before) || r.CreatedAt.Equal(before)) {
+			r.State = domain.ApprovalStateExpired
+			s := "system-reaper"
+			r.DecidedBy = &s
+			r.DecidedAt = &now
+			r.UpdatedAt = now
+			count++
+		}
+	}
+	return count, nil
+}
+
+// fakeJobStateRepo implements service.JobStatusUpdater and tracks per-job
+// status mutations so the tests can introspect them. It does NOT implement
+// the full repository.JobRepository — ApprovalService only needs UpdateStatus.
+type fakeJobStateRepo struct {
+	mu       sync.Mutex
+	statuses map[string]domain.JobStatus
+}
+
+func newFakeJobStateRepo() *fakeJobStateRepo {
+	return &fakeJobStateRepo{statuses: make(map[string]domain.JobStatus)}
+}
+
+func (f *fakeJobStateRepo) seed(id string, status domain.JobStatus) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.statuses[id] = status
+}
+
+func (f *fakeJobStateRepo) status(id string) domain.JobStatus {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return f.statuses[id]
+}
+
+func (f *fakeJobStateRepo) UpdateStatus(ctx context.Context, id string, status domain.JobStatus, errMsg string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.statuses[id] = status
+	return nil
+}
+
+// helper builders --------------------------------------------------------
+
+func newApprovalSvcForTest(bypass bool) (*ApprovalService, *fakeApprovalRepo, *fakeJobStateRepo) {
+	ar := newFakeApprovalRepo()
+	jr := newFakeJobStateRepo()
+	metrics := NewApprovalMetrics()
+	svc := NewApprovalService(ar, jr, nil, metrics, bypass)
+	return svc, ar, jr
+}
+
+func sampleCert() *domain.ManagedCertificate {
+	return &domain.ManagedCertificate{ID: "mc-test-cert"}
+}
+
+// tests ------------------------------------------------------------------
+
+func TestApproval_RequestCreatesPendingRow_BypassDisabled(t *testing.T) {
+	svc, ar, jr := newApprovalSvcForTest(false)
+	jr.seed("job-1", domain.JobStatusAwaitingApproval)
+
+	id, err := svc.RequestApproval(context.Background(), sampleCert(),
+		"job-1", "profile-prod-cdn", "user-alice", map[string]string{"common_name": "api.example.com"})
+	if err != nil {
+		t.Fatalf("RequestApproval err: %v", err)
+	}
+	got, err := ar.Get(context.Background(), id)
+	if err != nil {
+		t.Fatalf("Get err: %v", err)
+	}
+	if got.State != domain.ApprovalStatePending {
+		t.Fatalf("expected state=pending, got %s", got.State)
+	}
+	if got.RequestedBy != "user-alice" {
+		t.Fatalf("requested_by mismatch: %s", got.RequestedBy)
+	}
+	if jr.status("job-1") != domain.JobStatusAwaitingApproval {
+		t.Fatalf("job should remain AwaitingApproval; got %s", jr.status("job-1"))
+	}
+}
+
+func TestApproval_BypassMode_AutoApprovesWithSystemBypassActor(t *testing.T) {
+	svc, ar, jr := newApprovalSvcForTest(true)
+	jr.seed("job-2", domain.JobStatusAwaitingApproval)
+
+	id, err := svc.RequestApproval(context.Background(), sampleCert(),
+		"job-2", "profile-iot", "user-bob", nil)
+	if err != nil {
+		t.Fatalf("bypass RequestApproval err: %v", err)
+	}
+	got, err := ar.Get(context.Background(), id)
+	if err != nil {
+		t.Fatalf("Get err: %v", err)
+	}
+	if got.State != domain.ApprovalStateApproved {
+		t.Fatalf("bypass should auto-approve; got state=%s", got.State)
+	}
+	if got.DecidedBy == nil || *got.DecidedBy != domain.ApprovalActorSystemBypass {
+		t.Fatalf("bypass should stamp decided_by=%s; got %v",
+			domain.ApprovalActorSystemBypass, got.DecidedBy)
+	}
+	if jr.status("job-2") != domain.JobStatusPending {
+		t.Fatalf("bypass should transition job to Pending; got %s", jr.status("job-2"))
+	}
+}
+
+func TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending(t *testing.T) {
+	svc, ar, jr := newApprovalSvcForTest(false)
+	jr.seed("job-3", domain.JobStatusAwaitingApproval)
+	id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-3", "p1", "user-alice", nil)
+
+	if err := svc.Approve(context.Background(), id, "user-bob", "approved per ticket SECOPS-123"); err != nil {
+		t.Fatalf("Approve err: %v", err)
+	}
+	got, _ := ar.Get(context.Background(), id)
+	if got.State != domain.ApprovalStateApproved {
+		t.Fatalf("expected state=approved; got %s", got.State)
+	}
+	if jr.status("job-3") != domain.JobStatusPending {
+		t.Fatalf("expected job=Pending; got %s", jr.status("job-3"))
+	}
+}
+
+func TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled(t *testing.T) {
+	svc, ar, jr := newApprovalSvcForTest(false)
+	jr.seed("job-4", domain.JobStatusAwaitingApproval)
+	id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-4", "p1", "user-alice", nil)
+
+	if err := svc.Reject(context.Background(), id, "user-bob", "not on the approved-domains list"); err != nil {
+		t.Fatalf("Reject err: %v", err)
+	}
+	got, _ := ar.Get(context.Background(), id)
+	if got.State != domain.ApprovalStateRejected {
+		t.Fatalf("expected state=rejected; got %s", got.State)
+	}
+	if jr.status("job-4") != domain.JobStatusCancelled {
+		t.Fatalf("expected job=Cancelled; got %s", jr.status("job-4"))
+	}
+}
+
+func TestApproval_Approve_RejectsSameActor(t *testing.T) {
+	// LOAD-BEARING TWO-PERSON INTEGRITY TEST. PCI-DSS 6.4.5 / NIST 800-53
+	// SA-15 / SOC 2 CC6.1 compliance auditors pattern-match against this.
+	svc, _, jr := newApprovalSvcForTest(false)
+	jr.seed("job-5", domain.JobStatusAwaitingApproval)
+	id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-5", "p1", "user-alice", nil)
+
+	err := svc.Approve(context.Background(), id, "user-alice", "trying to self-approve")
+	if !errors.Is(err, ErrApproveBySameActor) {
+		t.Fatalf("expected ErrApproveBySameActor; got %v", err)
+	}
+	if jr.status("job-5") != domain.JobStatusAwaitingApproval {
+		t.Fatalf("job should remain AwaitingApproval; got %s", jr.status("job-5"))
+	}
+
+	// Approval as a different actor succeeds.
+	if err := svc.Approve(context.Background(), id, "user-bob", "approved by separate actor"); err != nil {
+		t.Fatalf("Approve as different actor err: %v", err)
+	}
+	if jr.status("job-5") != domain.JobStatusPending {
+		t.Fatalf("expected job=Pending after bob approve; got %s", jr.status("job-5"))
+	}
+
+	// Same-actor rejection also fails.
+	jr.seed("job-5b", domain.JobStatusAwaitingApproval)
+	id2, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-5b", "p1", "user-charlie", nil)
+	err2 := svc.Reject(context.Background(), id2, "user-charlie", "self-reject")
+	if !errors.Is(err2, ErrApproveBySameActor) {
+		t.Fatalf("expected ErrApproveBySameActor on Reject; got %v", err2)
+	}
+}
+
+func TestApproval_Approve_RejectsAlreadyDecided(t *testing.T) {
+	svc, _, jr := newApprovalSvcForTest(false)
+	jr.seed("job-6", domain.JobStatusAwaitingApproval)
+	id, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-6", "p1", "user-alice", nil)
+	if err := svc.Approve(context.Background(), id, "user-bob", ""); err != nil {
+		t.Fatalf("first Approve err: %v", err)
+	}
+
+	err := svc.Approve(context.Background(), id, "user-charlie", "second approve")
+	if !errors.Is(err, ErrApprovalAlreadyDecided) {
+		t.Fatalf("expected ErrApprovalAlreadyDecided; got %v", err)
+	}
+	err2 := svc.Reject(context.Background(), id, "user-charlie", "late reject")
+	if !errors.Is(err2, ErrApprovalAlreadyDecided) {
+		t.Fatalf("expected ErrApprovalAlreadyDecided on Reject; got %v", err2)
+	}
+}
+
+func TestApproval_ExpireStale_TransitionsPendingToExpired_AndCancelsJob(t *testing.T) {
+	svc, ar, jr := newApprovalSvcForTest(false)
+	jr.seed("job-7", domain.JobStatusAwaitingApproval)
+	jr.seed("job-8", domain.JobStatusAwaitingApproval)
+	id7, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-7", "p1", "user-alice", nil)
+	id8, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-8", "p1", "user-alice", nil)
+
+	// Backdate one of the requests to before the cutoff.
+	old := time.Now().Add(-200 * time.Hour).UTC()
+	ar.mu.Lock()
+	ar.rows[id7].CreatedAt = old
+	ar.mu.Unlock()
+
+	cutoff := time.Now().Add(-168 * time.Hour).UTC()
+	count, err := svc.ExpireStale(context.Background(), cutoff)
+	if err != nil {
+		t.Fatalf("ExpireStale err: %v", err)
+	}
+	if count != 1 {
+		t.Fatalf("expected 1 row expired; got %d", count)
+	}
+	got7, _ := ar.Get(context.Background(), id7)
+	if got7.State != domain.ApprovalStateExpired {
+		t.Fatalf("expected job-7 expired; got %s", got7.State)
+	}
+	got8, _ := ar.Get(context.Background(), id8)
+	if got8.State != domain.ApprovalStatePending {
+		t.Fatalf("job-8 should still be pending; got %s", got8.State)
+	}
+	if jr.status("job-7") != domain.JobStatusCancelled {
+		t.Fatalf("expected job-7 cancelled; got %s", jr.status("job-7"))
+	}
+	if jr.status("job-8") != domain.JobStatusAwaitingApproval {
+		t.Fatalf("job-8 should remain AwaitingApproval; got %s", jr.status("job-8"))
+	}
+}
+
+func TestApproval_MetricCounterIncrements(t *testing.T) {
+	svc, _, jr := newApprovalSvcForTest(false)
+	metrics := svc.metrics
+
+	jr.seed("job-9", domain.JobStatusAwaitingApproval)
+	id9, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-9", "p-cdn", "user-alice", nil)
+	_ = svc.Approve(context.Background(), id9, "user-bob", "approved")
+
+	jr.seed("job-10", domain.JobStatusAwaitingApproval)
+	id10, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-10", "p-cdn", "user-alice", nil)
+	_ = svc.Reject(context.Background(), id10, "user-bob", "rejected")
+
+	jr.seed("job-11", domain.JobStatusAwaitingApproval)
+	id11, _ := svc.RequestApproval(context.Background(), sampleCert(), "job-11", "p-cdn", "user-alice", nil)
+	// Backdate + expire.
+	old := time.Now().Add(-200 * time.Hour).UTC()
+	repo := svc.approvalRepo.(*fakeApprovalRepo)
+	repo.mu.Lock()
+	repo.rows[id11].CreatedAt = old
+	repo.mu.Unlock()
+	if _, err := svc.ExpireStale(context.Background(), time.Now().Add(-168*time.Hour)); err != nil {
+		t.Fatalf("ExpireStale err: %v", err)
+	}
+
+	snap := metrics.SnapshotApprovalDecisions()
+	got := map[string]uint64{}
+	for _, e := range snap {
+		got[e.Outcome] = e.Count
+	}
+	if got[domain.ApprovalOutcomeApproved] != 1 {
+		t.Fatalf("expected 1 approved counter; got %d", got[domain.ApprovalOutcomeApproved])
+	}
+	if got[domain.ApprovalOutcomeRejected] != 1 {
+		t.Fatalf("expected 1 rejected counter; got %d", got[domain.ApprovalOutcomeRejected])
+	}
+	if got[domain.ApprovalOutcomeExpired] != 1 {
+		t.Fatalf("expected 1 expired counter; got %d", got[domain.ApprovalOutcomeExpired])
+	}
+
+	// Histogram observed at least 3 samples.
+	hist := metrics.SnapshotApprovalPendingAgeHistogram()
+	if hist.Count < 3 {
+		t.Fatalf("expected at least 3 histogram samples; got %d", hist.Count)
+	}
+}