mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 13:51:36 +00:00
feat(retention): COMP-002-RETENTION — federated-user PII purge pipeline
Sprint 6 closure of the audit's MED-severity COMP-002-RETENTION
finding.
Pre-fix posture: the federated-user admin surface
(auth_users.go::Deactivate) sets users.deactivated_at on soft-delete,
but the PII columns (email, display_name, oidc_subject) stay
populated forever. No in-code primitive for GDPR right-to-be-
forgotten; no scheduled retention purge.
This commit ships the audit's recommended two-phase fix:
Phase 1 — operator-callable scrub primitive
internal/service/user_retention.go
UserRetentionService.DeleteUserPII(ctx, userID):
- revoke all active sessions (defense-in-depth)
- email := 'purged@redacted.local'
- display_name := '[purged]'
- oidc_subject := 'sha256:' || hex(sha256(original))
- audit_events row with action=user.purge_pii,
category=auth, actor=system
Why hash oidc_subject instead of NULL:
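1. (oidc_provider_id, oidc_subject) UNIQUE constraint would
trip on multiple purged users converging to NULL (this assumes
oidc_subject is NOT NULL or the index is NULLS NOT DISTINCT; by
default PostgreSQL treats NULLs as distinct under UNIQUE)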
2. The hash is one-way; the original IdP-side identifier is
unrecoverable. Re-login under the same subject mints a
fresh u-id (right-to-be-forgotten semantics)
3. Forensic continuity: an operator can recompute
sha256(<known-subject>) and confirm "this user was
deactivated then purged"
users.id itself is preserved so historical
audit_events.actor = u-X rows still resolve. The forensic-
attribution chain stays intact even after the PII is gone.
Phase 2 — scheduled batch purge
internal/scheduler/scheduler.go
UserRetentionPurger interface + userRetentionLoop:
- PurgeDeactivatedUsers enumerates every user with
deactivated_at < NOW() - retention_window
- DeleteUserPII per row
- per-tick batch cap (default 200) keeps blast radius
predictable; large backlogs spread across multiple ticks
- atomic.Bool guard + 5-min per-tick context.WithTimeout
Repository contract grew a single new method:
internal/repository/user.go::ListDeactivatedBefore(ctx, t)
internal/repository/postgres/user.go: SQL-side filter
(deactivated_at IS NOT NULL AND deactivated_at < $1)
ORDER BY deactivated_at ASC, cross-tenant.
Configuration
CERTCTL_USER_RETENTION_INTERVAL default 24h
CERTCTL_USER_RETENTION_WINDOW default 30 days
CERTCTL_USER_RETENTION_BATCH_CAP default 200
Test stub additions for repository.UserRepository.ListDeactivatedBefore:
internal/auth/oidc/service_test.go::stubUsers
internal/api/handler/auth_users_test.go::stubFullUserRepo
internal/api/handler/auth_session_oidc_test.go::stubUserRepo
Documentation
docs/operator/privacy-and-retention.md
- retention pipeline diagram (day-0 deactivate → day-N purge)
- operator config table
- verification runbook (4 steps with SQL)
- what's NOT covered (deferred: DSAR export, api_keys cascade,
retroactive audit_events.details redaction)
Tests
internal/service/user_retention_test.go (NEW, 4 tests):
TestDeleteUserPII_ScrubsAndRevokes
TestDeleteUserPII_IsIdempotent
TestPurgeDeactivatedUsers_RespectsWindow
TestPurgeDeactivatedUsers_BatchCap
Verified locally:
go vet ./... (clean)
gofmt -l internal/ cmd/ (clean)
go test -short -count=1 \
./internal/service/... ./internal/scheduler/... ./internal/config/...
(all green)
Cross-sprint interaction: pairs with COMP-001-HASH (prior commit).
The user.purge_pii audit row this service emits flows through the
new hash chain, so the scrub event is itself tamper-evident.
Closes COMP-002-RETENTION. Sprint 6 is complete (2/2 findings).
This commit is contained in:
@@ -145,6 +145,16 @@ type AuditChainBreakRecorder interface {
|
||||
RecordSuccess(rowCount int)
|
||||
}
|
||||
|
||||
// UserRetentionPurger is the scheduler-side interface for the Sprint 6
// COMP-002-RETENTION user-PII purge. The concrete implementation is
// *service.UserRetentionService, which walks every user whose
// deactivated_at is older than the retention window and scrubs the PII
// columns (email / display_name / oidc_subject hash).
//
// The scheduler calls PurgeDeactivatedUsers on every
// CERTCTL_USER_RETENTION_INTERVAL tick. A nil purger means the loop is not
// wired (deployments that disable retention).
type UserRetentionPurger interface {
	// PurgeDeactivatedUsers runs one purge pass under ctx. purged and failed
	// are counts reported by the implementation (presumably users scrubbed
	// and users that could not be scrubbed — confirm against the service).
	PurgeDeactivatedUsers(ctx context.Context) (purged, failed int, err error)
}
|
||||
|
||||
// JobReaperService defines the interface for job timeout reaping used by the scheduler.
|
||||
type JobReaperService interface {
|
||||
ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error
|
||||
@@ -175,6 +185,7 @@ type Scheduler struct {
|
||||
rateLimitGC RateLimitGarbageCollector
|
||||
auditChainVerifier AuditChainVerifier
|
||||
auditChainRecorder AuditChainBreakRecorder
|
||||
userRetention UserRetentionPurger
|
||||
jobReaper JobReaperService
|
||||
logger *slog.Logger
|
||||
|
||||
@@ -196,6 +207,7 @@ type Scheduler struct {
|
||||
sessionGCInterval time.Duration
|
||||
rateLimitGCInterval time.Duration
|
||||
auditChainVerifyInterval time.Duration
|
||||
userRetentionInterval time.Duration
|
||||
// agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose
|
||||
// owning agent has been silent. Bundle C / Audit M-016. Defaults below.
|
||||
agentOfflineJobTTL time.Duration
|
||||
@@ -220,6 +232,7 @@ type Scheduler struct {
|
||||
sessionGCRunning atomic.Bool
|
||||
rateLimitGCRunning atomic.Bool
|
||||
auditChainVerifyRunning atomic.Bool
|
||||
userRetentionRunning atomic.Bool
|
||||
|
||||
// Graceful shutdown: wait for in-flight work to complete
|
||||
wg sync.WaitGroup
|
||||
@@ -265,6 +278,11 @@ func NewScheduler(
|
||||
// not dominate a quiet fleet's DB load. Operators with huge
|
||||
// audit tables can lengthen via CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL.
|
||||
auditChainVerifyInterval: 6 * time.Hour,
|
||||
// Sprint 6 COMP-002-RETENTION: user PII purge cadence. Default
|
||||
// 24h — deactivated rows persist past the retention window
|
||||
// (default 30d) until a later tick purges them (a backlog above the per-tick batch cap spans several ticks), which is fine for
|
||||
// GDPR-style "delete within reasonable time" expectations.
|
||||
userRetentionInterval: 24 * time.Hour,
|
||||
// 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent
|
||||
// must miss multiple heartbeats before its in-flight jobs are reaped.
|
||||
agentOfflineJobTTL: 5 * time.Minute,
|
||||
@@ -469,6 +487,25 @@ func (s *Scheduler) SetAuditChainVerifyInterval(d time.Duration) {
|
||||
s.auditChainVerifyInterval = d
|
||||
}
|
||||
|
||||
// SetUserRetentionPurger wires the Sprint 6 COMP-002-RETENTION
|
||||
// user-PII-purge sweeper. Optional — nil disables the loop (deployments
|
||||
// that don't have any federated humans yet, or those that want manual
|
||||
// purge via the admin endpoint only). Concrete impl is
|
||||
// *service.UserRetentionService.
|
||||
func (s *Scheduler) SetUserRetentionPurger(p UserRetentionPurger) {
|
||||
s.userRetention = p
|
||||
}
|
||||
|
||||
// SetUserRetentionInterval configures the userRetentionLoop tick
|
||||
// cadence. Default 24h. Wire: CERTCTL_USER_RETENTION_INTERVAL.
|
||||
// Zero or negative values are ignored.
|
||||
func (s *Scheduler) SetUserRetentionInterval(d time.Duration) {
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
s.userRetentionInterval = d
|
||||
}
|
||||
|
||||
// SetAgentOfflineJobTTL sets the threshold past which a Running job whose
|
||||
// owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016.
|
||||
// Zero or negative values are ignored (the default of 5 minutes is kept).
|
||||
@@ -536,6 +573,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
|
||||
if s.auditChainVerifier != nil {
|
||||
loopCount++
|
||||
}
|
||||
if s.userRetention != nil {
|
||||
loopCount++
|
||||
}
|
||||
s.wg.Add(loopCount)
|
||||
|
||||
go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }()
|
||||
@@ -573,6 +613,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
|
||||
if s.auditChainVerifier != nil {
|
||||
go func() { defer s.wg.Done(); s.auditChainVerifyLoop(ctx) }()
|
||||
}
|
||||
if s.userRetention != nil {
|
||||
go func() { defer s.wg.Done(); s.userRetentionLoop(ctx) }()
|
||||
}
|
||||
|
||||
// Signal that all loops are launched
|
||||
close(startedChan)
|
||||
@@ -1454,6 +1497,50 @@ func (s *Scheduler) auditChainVerifyLoop(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// userRetentionLoop is the Sprint 6 COMP-002-RETENTION sweeper. Every
|
||||
// CERTCTL_USER_RETENTION_INTERVAL tick it asks
|
||||
// UserRetentionService.PurgeDeactivatedUsers to walk every user whose
|
||||
// deactivated_at is older than the retention window and scrub the PII
|
||||
// columns. The service is responsible for the row-level work + audit
|
||||
// emission; the loop only orchestrates cadence + concurrency control.
|
||||
//
|
||||
// Mirrors the GC-loop pattern: atomic.Bool guard prevents overlapping
|
||||
// ticks; per-tick context.WithTimeout caps the worst case at 5
|
||||
// minutes. The retention service's purgeBatchCap (default 200) is the
|
||||
// inner-loop budget — large backlogs spread across multiple ticks.
|
||||
func (s *Scheduler) userRetentionLoop(ctx context.Context) {
|
||||
ticker := NewJitteredTicker(s.userRetentionInterval, DefaultSchedulerJitter)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
if !s.userRetentionRunning.CompareAndSwap(false, true) {
|
||||
s.logger.Warn("user retention purge still running, skipping tick")
|
||||
continue
|
||||
}
|
||||
s.wg.Add(1)
|
||||
go func() {
|
||||
defer s.wg.Done()
|
||||
defer s.userRetentionRunning.Store(false)
|
||||
opCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
purged, failed, err := s.userRetention.PurgeDeactivatedUsers(opCtx)
|
||||
if err != nil {
|
||||
s.logger.Warn("user retention purge failed (next tick will retry)", "error", err)
|
||||
return
|
||||
}
|
||||
if purged > 0 || failed > 0 {
|
||||
s.logger.Info("user retention purge tick",
|
||||
"purged", purged, "failed", failed)
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runAuditChainVerify executes a single chain-verify pass with the
|
||||
// atomic.Bool + WithTimeout + goroutine pattern every other GC loop
|
||||
// uses. Extracted so the loop body + the "run once on start" path
|
||||
|
||||
Reference in New Issue
Block a user