feat(retention): COMP-002-RETENTION — federated-user PII purge pipeline

Sprint 6 closure of the audit's MED-severity COMP-002-RETENTION finding. Pre-fix posture: the federated-user admin surface (auth_users.go::Deactivate) sets users.deactivated_at on soft-delete, but the PII columns (email, display_name, oidc_subject) stay populated forever. No in-code primitive for GDPR right-to-be-forgotten; no scheduled retention purge. This commit ships the audit's recommended two-phase fix: Phase 1 — operator-callable scrub primitive internal/service/user_retention.go UserRetentionService.DeleteUserPII(ctx, userID): - revoke all active sessions (defense-in-depth) - email := 'purged@redacted.local' - display_name := '[purged]' - oidc_subject := 'sha256:' || hex(sha256(original)) - audit_events row with action=user.purge_pii, category=auth, actor=system Why hash oidc_subject instead of NULL: 1. (oidc_provider_id, oidc_subject) UNIQUE constraint would trip on multiple purged users converging to NULL 2. The hash is one-way; the original IdP-side identifier is unrecoverable. Re-login under the same subject mints a fresh u-id (right-to-be-forgotten semantics) 3. Forensic continuity: an operator can recompute sha256(<known-subject>) and confirm "this user was deactivated then purged" users.id itself is preserved so historical audit_events.actor = u-X rows still resolve. The forensic-attribution chain stays intact even after the PII is gone.
Phase 2 — scheduled batch purge internal/scheduler/scheduler.go UserRetentionPurger interface + userRetentionLoop: - PurgeDeactivatedUsers enumerates every user with deactivated_at < NOW() - retention_window - DeleteUserPII per row - per-tick batch cap (default 200) keeps blast radius predictable; large backlogs spread across multiple ticks - atomic.Bool guard + 5-min per-tick context.WithTimeout Repository contract grew a single new method: internal/repository/user.go::ListDeactivatedBefore(ctx, t) internal/repository/postgres/user.go: SQL-side filter (deactivated_at IS NOT NULL AND deactivated_at < $1) ORDER BY deactivated_at ASC, cross-tenant. Configuration CERTCTL_USER_RETENTION_INTERVAL default 24h CERTCTL_USER_RETENTION_WINDOW default 30 days CERTCTL_USER_RETENTION_BATCH_CAP default 200 Test stub additions for repository.UserRepository.ListDeactivatedBefore: internal/auth/oidc/service_test.go::stubUsers internal/api/handler/auth_users_test.go::stubFullUserRepo internal/api/handler/auth_session_oidc_test.go::stubUserRepo Documentation docs/operator/privacy-and-retention.md - retention pipeline diagram (day-0 deactivate → day-N purge) - operator config table - verification runbook (4 steps with SQL) - what's NOT covered (deferred: DSAR export, api_keys cascade, retroactive audit_events.details redaction) Tests internal/service/user_retention_test.go (NEW, 4 tests): TestDeleteUserPII_ScrubsAndRevokes TestDeleteUserPII_IsIdempotent TestPurgeDeactivatedUsers_RespectsWindow TestPurgeDeactivatedUsers_BatchCap Verified locally: go vet ./... (clean) gofmt -l internal/ cmd/ (clean) go test -short -count=1 \ ./internal/service/... ./internal/scheduler/... ./internal/config/... (all green) Cross-sprint interaction: pairs with COMP-001-HASH (prior commit). The user.purge_pii audit row this service emits flows through the new hash chain, so the scrub event is itself tamper-evident. Closes COMP-002-RETENTION. Sprint 6 is complete (2/2 findings).
2026-06-07 13:51:36 +00:00 · 2026-05-16 06:18:39 +00:00
parent 43836aca7c
commit 663b14bfd8
11 changed files with 874 additions and 0 deletions
@@ -145,6 +145,16 @@ type AuditChainBreakRecorder interface {
 	RecordSuccess(rowCount int)
 }

+// UserRetentionPurger is the scheduler-side contract for the Sprint 6
+// COMP-002-RETENTION purge. Concrete impl is *service.UserRetentionService,
+// which scrubs the PII columns (email / display_name / oidc_subject hash)
+// of users deactivated for longer than the retention window. The loop
+// calls it on every CERTCTL_USER_RETENTION_INTERVAL tick and logs the
+// purged / failed counts. nil = loop is not wired (retention disabled).
+type UserRetentionPurger interface {
+	PurgeDeactivatedUsers(ctx context.Context) (purged, failed int, err error)
+}
+
 // JobReaperService defines the interface for job timeout reaping used by the scheduler.
 type JobReaperService interface {
 	ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error
@@ -175,6 +185,7 @@ type Scheduler struct {
 	rateLimitGC           RateLimitGarbageCollector
 	auditChainVerifier    AuditChainVerifier
 	auditChainRecorder    AuditChainBreakRecorder
+	userRetention         UserRetentionPurger
 	jobReaper             JobReaperService
 	logger                *slog.Logger

@@ -196,6 +207,7 @@ type Scheduler struct {
 	sessionGCInterval             time.Duration
 	rateLimitGCInterval           time.Duration
 	auditChainVerifyInterval      time.Duration
+	userRetentionInterval         time.Duration
 	// agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose
 	// owning agent has been silent. Bundle C / Audit M-016. Defaults below.
 	agentOfflineJobTTL      time.Duration
@@ -220,6 +232,7 @@ type Scheduler struct {
 	sessionGCRunning             atomic.Bool
 	rateLimitGCRunning           atomic.Bool
 	auditChainVerifyRunning      atomic.Bool
+	userRetentionRunning         atomic.Bool

 	// Graceful shutdown: wait for in-flight work to complete
 	wg sync.WaitGroup
@@ -265,6 +278,11 @@ func NewScheduler(
 		// not dominate a quiet fleet's DB load. Operators with huge
 		// audit tables can lengthen via CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL.
 		auditChainVerifyInterval: 6 * time.Hour,
+		// Sprint 6 COMP-002-RETENTION: user PII purge cadence. Default
+		// 24h — rows past the retention window (default 30d) wait at
+		// most one tick, or several when the backlog exceeds the batch
+		// cap; fine for GDPR-style "delete within reasonable time".
+		userRetentionInterval: 24 * time.Hour,
 		// 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent
 		// must miss multiple heartbeats before its in-flight jobs are reaped.
 		agentOfflineJobTTL: 5 * time.Minute,
@@ -469,6 +487,25 @@ func (s *Scheduler) SetAuditChainVerifyInterval(d time.Duration) {
 	s.auditChainVerifyInterval = d
 }

+// SetUserRetentionPurger wires the Sprint 6 COMP-002-RETENTION
+// user-PII-purge sweeper (concrete impl: *service.UserRetentionService).
+// Optional — leaving it nil keeps userRetentionLoop from starting, e.g.
+// for deployments with no federated humans or manual-only purges. Call
+// it before Start, which reads this field when launching loops.
+func (s *Scheduler) SetUserRetentionPurger(p UserRetentionPurger) {
+	s.userRetention = p
+}
+
+// SetUserRetentionInterval overrides how often userRetentionLoop ticks.
+// The default is 24h; operators set it via CERTCTL_USER_RETENTION_INTERVAL.
+// Only strictly positive durations are applied — zero or negative input
+// leaves the current interval untouched.
+func (s *Scheduler) SetUserRetentionInterval(d time.Duration) {
+	if d > 0 {
+		s.userRetentionInterval = d
+	}
+}
+
 // SetAgentOfflineJobTTL sets the threshold past which a Running job whose
 // owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016.
 // Zero or negative values are ignored (the default of 5 minutes is kept).
@@ -536,6 +573,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
 		if s.auditChainVerifier != nil {
 			loopCount++
 		}
+		if s.userRetention != nil {
+			loopCount++
+		}
 		s.wg.Add(loopCount)

 		go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }()
@@ -573,6 +613,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
 		if s.auditChainVerifier != nil {
 			go func() { defer s.wg.Done(); s.auditChainVerifyLoop(ctx) }()
 		}
+		if s.userRetention != nil {
+			go func() { defer s.wg.Done(); s.userRetentionLoop(ctx) }()
+		}

 		// Signal that all loops are launched
 		close(startedChan)
@@ -1454,6 +1497,50 @@ func (s *Scheduler) auditChainVerifyLoop(ctx context.Context) {
 	}
 }

+// userRetentionLoop drives the Sprint 6 COMP-002-RETENTION purge. On
+// each jittered CERTCTL_USER_RETENTION_INTERVAL tick it starts one
+// PurgeDeactivatedUsers pass in its own goroutine; the purger owns the
+// row-level scrub and audit emission, this loop owns only cadence and
+// concurrency control. The loop exits when ctx is cancelled.
+//
+// The userRetentionRunning flag lets at most one pass run at a time: a
+// tick that lands while a pass is in flight is skipped with a warning.
+func (s *Scheduler) userRetentionLoop(ctx context.Context) {
+	tick := NewJitteredTicker(s.userRetentionInterval, DefaultSchedulerJitter)
+	defer tick.Stop()
+
+	// runPurge is one pass: bounded to 5 minutes, tracked by s.wg so
+	// shutdown can wait for it, and it clears the running flag on exit.
+	runPurge := func() {
+		defer s.wg.Done()
+		defer s.userRetentionRunning.Store(false)
+		purgeCtx, release := context.WithTimeout(ctx, 5*time.Minute)
+		defer release()
+		purged, failed, err := s.userRetention.PurgeDeactivatedUsers(purgeCtx)
+		switch {
+		case err != nil:
+			s.logger.Warn("user retention purge failed (next tick will retry)", "error", err)
+		case purged > 0 || failed > 0:
+			s.logger.Info("user retention purge tick",
+				"purged", purged, "failed", failed)
+		}
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-tick.C:
+			if s.userRetentionRunning.CompareAndSwap(false, true) {
+				s.wg.Add(1)
+				go runPurge()
+				continue
+			}
+			s.logger.Warn("user retention purge still running, skipping tick")
+		}
+	}
+}
+
 // runAuditChainVerify executes a single chain-verify pass with the
 // atomic.Bool + WithTimeout + goroutine pattern every other GC loop
 // uses. Extracted so the loop body + the "run once on start" path