From 17b30c1f7f00e83085e6c3e39f32f10b8c2fddf3 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sun, 10 May 2026 05:31:24 +0000 Subject: [PATCH] auth-bundle-2 Phase 4: session service (cookie minting + signature validation, idle/absolute expiry, signing-key rotation, CSRF, GC), 15-case negative-test matrix, fail-fatal initial-key bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 of the bundle ships the post-login session lifecycle that backs every authenticated request once Phase 5 wires the OIDC handlers + the session middleware. The state machine is the load-bearing primitive for the Bundle 2 control plane: forge a session cookie and you bypass every RBAC gate. Service surface (internal/auth/session/service.go, ~880 LOC): - Service.Create(actorID, actorType, ip, ua) -> *CreateResult Mints a session row; signs the cookie value with the active signing key; returns the cookie payload AND the CSRF token plaintext for the handler to set on the response. - Service.Validate(ValidateInput) -> *Session Parses the cookie, looks up the signing key (incl. retired-but-in- retention), recomputes HMAC-SHA256, loads the session row, enforces revocation + absolute + idle expiry + optional IP/UA bind. Maps to one of 9 sentinel errors; the handler uniformly returns 401 to the wire (specific reason in the audit row). - Service.ValidateCSRF(headerValue, *Session) error Constant-time compares SHA-256(header) against the stored hash on the session row. - Service.UpdateLastSeen / Revoke / RevokeAllForActor - Service.RotateCSRFToken — mints fresh token, persists hash, returns plaintext; called on login completion, logout, role-change against actor, explicit operator rotate. - Service.RotateSigningKey — mints new active key, retires previous; retired keys stay valid for cfg.SigningKeyRetention so existing cookies don't immediately fail. - Service.EnsureInitialSigningKey — idempotent; mints first key on fresh deploys; emits auth.session_signing_key_bootstrap audit row with event_category=auth. Wired into cmd/server/main.go AFTER migrations + RBAC backfill, BEFORE the HTTP listener binds; failure is FATAL (logger.Error + os.Exit(1)) per the prompt — server refuses to boot rather than serve session-less. - Service.GarbageCollect — sweeps expired post-login sessions + pre-login rows >10min + retired-past-retention signing keys. Wired into the new internal/scheduler/scheduler.go::sessionGCLoop on a CERTCTL_SESSION_GC_INTERVAL tick. Cookie wire format (load-bearing): v1... The HMAC input is LENGTH-PREFIXED to defeat concatenation collisions: len(session_id) || ":" || session_id || ":" || len(signing_key_id) || ":" || signing_key_id where len(...) is the ASCII decimal byte-length. Without the length prefix, the bare-concatenation form `session_id || signing_key_id` would let a forger swap one byte across the boundary — `` and `` produce identical HMAC inputs. The length prefix moves the boundary into the input itself so the two cases can never collide. The v1. version prefix is reserved. A future incompatible upgrade ships as v2. and the parser rejects unknown prefixes (no fallback). CSRF token model: - Plaintext goes in a JS-readable certctl_csrf cookie (HttpOnly=false intentional; the GUI must read it to echo into X-CSRF-Token header). - SHA-256 hash of the plaintext lives on the session row. - Validation: SHA-256(X-CSRF-Token) constant-time-compared. - Rotated by Service.RotateCSRFToken on login / logout / role-change / explicit admin-trigger. Optional defense-in-depth (default OFF): - CERTCTL_SESSION_BIND_IP — Validate compares client IP to row's recorded IP. Mismatch -> 401, audit row, session NOT auto-revoked (user may have legitimate IP change). Mobile + corporate-NAT environments leave this off. - CERTCTL_SESSION_BIND_USER_AGENT — same shape against UA. Configurable lifetimes (env vars wired in internal/config/config.go): CERTCTL_SESSION_IDLE_TIMEOUT 1h CERTCTL_SESSION_ABSOLUTE_TIMEOUT 8h CERTCTL_SESSION_SIGNING_KEY_RETENTION 24h CERTCTL_SESSION_GC_INTERVAL 1h CERTCTL_SESSION_SAMESITE Lax CERTCTL_SESSION_BIND_IP false CERTCTL_SESSION_BIND_USER_AGENT false Test surface (internal/auth/session/service_test.go, ~860 LOC): All 15 prompt-mandated negative cases: 1. Tampered cookie (HMAC byte flipped near segment start where all 6 bits are real — base64url-no-pad's last char carries only 2 bits so a tail-flip is unreliable). 1b. Tampered SESSION_ID segment (same HMAC-recompute outcome). 2. Cookie missing v1. prefix. 3. Cookie with unknown version prefix (v99). 4. Idle expiry — back-dated last_seen_at + idle_expires_at. 5. Absolute expiry — back-dated absolute_expires_at. 6. Revoked session. 7. Wrong signing key id (no row matches). 8. Cookie signed under retired-but-in-retention key SUCCEEDS. 9. Cookie signed under retired-past-retention key FAILS. 10. Concatenation collision — direct evidence that computeHMAC("abc","de") != computeHMAC("ab","cde") AND that a forged-boundary-slide cookie is rejected. 11. CSRF token missing. 12. CSRF token mismatch (constant-time compare). 13. IP-bind enabled + IP changed -> ErrSessionIPMismatch + audit row. 14. UA-bind enabled + UA changed -> ErrSessionUAMismatch + audit row. 15. EnsureInitialSigningKey RNG failure -> ErrInitialSigningKeyMintFailed wrap (cmd/server/main.go treats as fatal). Plus coverage-lift batch covering: every error wrap on every repo collaborator (Create, Get, UpdateLastSeen, UpdateCSRFTokenHash, Revoke, RevokeAllForActor, GC), every RNG-failure surface in Create / RotateCSRFToken / RotateSigningKey, every alg-pinning helper edge, the cookie parser's full negative matrix (empty, wrong segment count, missing prefixes, bad base64, wrong HMAC length), and a real-encryption round-trip via internal/crypto.EncryptIfKeySet -> DecryptIfKeySet so the v3-blob path is exercised end-to-end at the session-cookie level. Coverage: internal/auth/session 94.5% (floor 90) internal/auth/session/domain 96+% (floor 90, Phase 1) .github/coverage-thresholds.yml extended with 2 new gate entries (internal/auth/session and internal/auth/session/domain). The why: paragraphs explain why each fail-closed branch is load-bearing. Repository extensions: internal/repository/session.go gains UpdateCSRFTokenHash on the SessionRepository interface; internal/repository/postgres/session.go ships the implementation. RotateCSRFToken consumes it. Scheduler extensions: internal/scheduler/scheduler.go gains SessionGarbageCollector interface + sessionGC field + sessionGCInterval + SetSessionGarbageCollector + SetSessionGCInterval + sessionGCLoop. Pattern matches the existing acmeGCLoop: atomic.Bool guard prevents concurrent sweeps, sync.WaitGroup tracks for graceful shutdown, per-tick context.WithTimeout(1m) bounds a stuck Postgres. Server wiring: cmd/server/main.go constructs sessionService AFTER the bootstrap block (post-RBAC backfill) and BEFORE the policy-service block. EnsureInitialSigningKey runs immediately; failure is fatal via os.Exit(1). The scheduler section wires SetSessionGarbageCollector + SetSessionGCInterval alongside the other interval setters and emits an Info log so operators can confirm the loop is enabled. Phase 4 deviation note: Service.GarbageCollect() returns (int, error) rather than the prompt's literal `error`. The int is the count of session rows deleted on this sweep; the scheduler discards it (`_, err := ...`) but tests + future operator-facing audit rows can read it. The wider behavior matches the spec exactly. Verifications: gofmt clean, go vet ./internal/auth/session/... ./internal/scheduler/... ./internal/config/... ./cmd/server/... ./internal/repository/... clean, go test -short -count=1 -race green across all 3 session packages, full repository + auth + scheduler + config test sweeps green, no regressions in Bundle 1 packages. --- .github/coverage-thresholds.yml | 35 + cmd/server/main.go | 54 ++ internal/auth/session/service.go | 820 +++++++++++++++++ internal/auth/session/service_test.go | 1107 +++++++++++++++++++++++ internal/config/config.go | 69 ++ internal/repository/postgres/session.go | 15 + internal/repository/session.go | 6 + internal/scheduler/scheduler.go | 72 ++ 8 files changed, 2178 insertions(+) create mode 100644 internal/auth/session/service.go create mode 100644 internal/auth/session/service_test.go diff --git a/.github/coverage-thresholds.yml b/.github/coverage-thresholds.yml index ed8db5c..157f4c3 100644 --- a/.github/coverage-thresholds.yml +++ b/.github/coverage-thresholds.yml @@ -148,3 +148,38 @@ internal/auth/oidc/domain: cover all canonical IdP shapes (Okta / Azure AD / Google Workspace / Keycloak / Authentik / Auth0). Floor at 90 to catch any future field that ships without a validator. + +internal/auth/session: + floor: 90 + why: | + Bundle 2 Phase 4 — session lifecycle service. Phase 4 spec + pins the floor at 90 because every fail-closed branch carries + a security invariant: HMAC-SHA256 cookie signing with a + LENGTH-PREFIXED canonical input (defeats the + ``-vs-`` concatenation collision attack on the + bare-concat form), v1. version-prefix lock, idle expiry, + absolute expiry, revocation, retired-but-in-retention key + success path, retired-past-retention failure path, CSRF + constant-time compare against the SHA-256-hashed copy on the + session row, optional IP/UA-bind defense-in-depth gates, + fail-fatal initial-key bootstrap. A regression in any one of + these branches is a security incident; the floor catches it + before the commit lands. The 15-case negative-test matrix in + service_test.go is the load-bearing harness; the in-memory + stubs of SessionRepo + SigningKeyRepo + AuditRecorder let the + state machine be exercised without the postgres testcontainer + overhead (which Phase 2's integration tests already cover). + +internal/auth/session/domain: + floor: 90 + why: | + Bundle 2 Phase 1 — Session + SessionSigningKey domain. Both + types ship Validate() with full invariant coverage: ID prefix + enforcement (ses-/sk-), expiry-order CHECK (absolute > idle > + created), CSRFTokenHash format pin (64 lowercase hex chars), + KeyMaterialEncrypted non-empty, retired-before-created + rejection, TenantID defaulting. Cookie naming constants are + pinned by TestCookieNamingConstants because the GUI's + web/src/api/client.ts will read `certctl_csrf` by string. + Floor at 90 to catch any future field that ships without a + validator. diff --git a/cmd/server/main.go b/cmd/server/main.go index 7cde4c4..a3b1733 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -24,6 +24,7 @@ import ( "github.com/certctl-io/certctl/internal/api/router" "github.com/certctl-io/certctl/internal/auth" "github.com/certctl-io/certctl/internal/auth/bootstrap" + "github.com/certctl-io/certctl/internal/auth/session" "github.com/certctl-io/certctl/internal/config" discoveryawssm "github.com/certctl-io/certctl/internal/connector/discovery/awssm" discoveryazurekv "github.com/certctl-io/certctl/internal/connector/discovery/azurekv" @@ -341,6 +342,47 @@ func main() { } } bootstrapHandler := handler.NewBootstrapHandler(bootstrapService) + + // ========================================================================= + // Auth Bundle 2 Phase 4 — session service. + // + // Wired AFTER migrations + RBAC backfill, BEFORE the HTTP listener + // binds (per the prompt's "fail-fatal on bootstrap key mint failure" + // requirement). EnsureInitialSigningKey is idempotent: if a non- + // retired signing key already exists for the tenant the call is a + // no-op; otherwise it mints a fresh 32-byte HMAC key, persists it, + // and emits an auth.session_signing_key_bootstrap audit row with + // event_category=auth. + // + // Failure here is fatal — the server refuses to boot rather than + // serve session-less. + // + // The session service is wired into the scheduler below (sessionGCLoop) + // so the GC sweep runs every CERTCTL_SESSION_GC_INTERVAL tick. The + // HTTP middleware that consumes ValidateInput / ValidateCSRF lands + // in Phase 5; pre-Phase-5 deployments boot the service so the GC + // sweep can keep the sessions + signing-keys tables tidy. + sessionRepo := postgres.NewSessionRepository(db) + sessionKeyRepo := postgres.NewSessionSigningKeyRepository(db) + sessionService := session.NewService( + sessionRepo, + sessionKeyRepo, + auditService, + authdomainAlias.DefaultTenantID, + session.Config{ + IdleTimeout: cfg.Auth.Session.IdleTimeout, + AbsoluteTimeout: cfg.Auth.Session.AbsoluteTimeout, + SigningKeyRetention: cfg.Auth.Session.SigningKeyRetention, + BindIP: cfg.Auth.Session.BindIP, + BindUserAgent: cfg.Auth.Session.BindUserAgent, + }, + cfg.Encryption.ConfigEncryptionKey, + ) + if err := sessionService.EnsureInitialSigningKey(bootCtx); err != nil { + logger.Error("FATAL: session signing key bootstrap failed; refusing to boot", "err", err) + os.Exit(1) + } + policyService := service.NewPolicyService(policyRepo, auditService) policyService.SetCertRepo(certificateRepo) // D-008: CertificateLifetime arm needs CertificateVersion.NotBefore/NotAfter // G-1: RenewalPolicyService — distinct from PolicyService (compliance rules). @@ -937,6 +979,18 @@ func main() { sched.SetJobTimeoutInterval(cfg.Scheduler.JobTimeoutInterval) sched.SetAwaitingCSRTimeout(cfg.Scheduler.AwaitingCSRTimeout) sched.SetAwaitingApprovalTimeout(cfg.Scheduler.AwaitingApprovalTimeout) + + // Auth Bundle 2 Phase 4 — wire the session-GC sweep. The service + // itself was constructed (with the EnsureInitialSigningKey fail- + // fatal call) above the policy/cert-service block; here we just + // register it with the scheduler so the loop fires every + // CERTCTL_SESSION_GC_INTERVAL. + sched.SetSessionGarbageCollector(sessionService) + sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval) + logger.Info("session GC sweep enabled", + "interval", cfg.Auth.Session.GCInterval.String(), + "absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(), + "signing_key_retention", cfg.Auth.Session.SigningKeyRetention.String()) logger.Info("job timeout reaper enabled", "interval", cfg.Scheduler.JobTimeoutInterval.String(), "csr_timeout", cfg.Scheduler.AwaitingCSRTimeout.String(), diff --git a/internal/auth/session/service.go b/internal/auth/session/service.go new file mode 100644 index 0000000..5ed14b8 --- /dev/null +++ b/internal/auth/session/service.go @@ -0,0 +1,820 @@ +// Package session implements the post-login session lifecycle for +// Auth Bundle 2 Phase 4: cookie minting + signature validation + +// idle/absolute expiry + revocation + signing-key rotation + GC. +// +// ============================================================================= +// Cookie wire format (`v1...`): +// +// v1.ses-XXXXXXXX.sk-YYYYYYYY. +// +// HMAC INPUT IS LENGTH-PREFIXED to defeat concatenation collisions: +// +// len(session_id) || ":" || session_id || ":" || len(signing_key_id) || ":" || signing_key_id +// +// where len(...) is the ASCII decimal byte-length. Without the length +// prefix, the bare-concatenation form `session_id || signing_key_id` +// would let a forger swap one byte across the boundary — `` and +// `` produce identical HMAC inputs. The length prefix moves the +// boundary into the input itself so the two cases never collide. +// +// HMAC KEY is the 32-byte plaintext of the SessionSigningKey row's +// KeyMaterialEncrypted blob (decrypted via internal/crypto/encryption.go's +// EncryptIfKeySet/DecryptIfKeySet path — same blob format issuer/target +// credentials use). The plaintext is held in memory only during signature +// computation; never logged, never persisted in plaintext form. +// +// VERSION PREFIX is reserved. v1 is the only accepted prefix today. +// A future incompatible upgrade ships as `v2.` and the validator +// rejects unknown prefixes (no fallback attempt — fail closed). +// +// ============================================================================= +// CSRF token model: +// +// - Plaintext lives in a JS-readable certctl_csrf cookie (HttpOnly=false +// intentional; the GUI must read it to echo into X-CSRF-Token header). +// - SHA-256 hash of the plaintext lives on the session row (csrf_token_hash). +// - Validation: SHA-256(X-CSRF-Token header) constant-time-compared +// against the session row's stored hash. +// - Rotated by Service.RotateCSRFToken on: login completion, logout, +// any actor-role mutation against this actor, explicit operator +// "rotate CSRF" admin endpoint. +// +// ============================================================================= +// Failure semantics: +// +// Validate returns ErrSessionInvalidCookie for any tamper / format / +// missing-key fault. The handler maps to HTTP 401 uniformly (no leak +// of which check failed; specific reason in the audit row). Idle + +// absolute expiry surface as ErrSessionExpiredIdle / ErrSessionExpiredAbsolute +// so the audit row distinguishes; both wire to 401. Revocation is +// ErrSessionRevoked. Signing-key not found / fully purged is +// ErrSigningKeyNotFound. Length-prefix-defeating concatenation collision +// attempts also surface as ErrSessionInvalidCookie because the HMAC +// recomputation fails. +// +// ============================================================================= +// Token-leak hygiene: +// +// Cookie values, CSRF token plaintexts, signing-key plaintexts, and the +// HMAC bytes themselves MUST NEVER be logged at any level. The service +// contains zero log statements that include those values; the +// session_id and signing_key_id (both opaque IDs) are the only identifiers +// that ever land in audit rows. +package session + +import ( + "context" + "crypto/hmac" + cryptorand "crypto/rand" + "crypto/sha256" + "crypto/subtle" + "encoding/base64" + "encoding/hex" + "errors" + "fmt" + "strconv" + "strings" + "time" + + sessiondomain "github.com/certctl-io/certctl/internal/auth/session/domain" + cryptopkg "github.com/certctl-io/certctl/internal/crypto" + "github.com/certctl-io/certctl/internal/domain" + "github.com/certctl-io/certctl/internal/repository" +) + +// ============================================================================= +// Encrypt/decrypt helpers for SessionSigningKey.KeyMaterialEncrypted +// blobs. Production wires the real CERTCTL_CONFIG_ENCRYPTION_KEY value; +// tests pass empty (encrypted == plaintext passthrough so the test +// surface doesn't require an encryption-key env var). +// ============================================================================= + +func encryptKeyMaterial(plaintext []byte, passphrase string) ([]byte, error) { + if passphrase == "" { + // Test path: no encryption configured. Round-trip is identity. + // Production main.go REQUIRES CERTCTL_CONFIG_ENCRYPTION_KEY for + // any deployment that runs the session service; the empty case + // is intentionally only useful in unit tests. + return plaintext, nil + } + blob, _, err := cryptopkg.EncryptIfKeySet(plaintext, passphrase) + return blob, err +} + +func decryptKeyMaterial(blob []byte, passphrase string) ([]byte, error) { + if passphrase == "" { + return blob, nil + } + return cryptopkg.DecryptIfKeySet(blob, passphrase) +} + +// ============================================================================= +// Service-layer sentinel errors. +// ============================================================================= + +var ( + // ErrSessionInvalidCookie is returned by Validate when the cookie + // fails any of: format check, version-prefix check, base64 decode, + // HMAC recomputation. The handler maps to HTTP 401 uniformly. + ErrSessionInvalidCookie = errors.New("session: invalid cookie") + + // ErrSessionExpiredIdle: the session's last_seen_at is older than + // the configured idle timeout. HTTP 401. + ErrSessionExpiredIdle = errors.New("session: idle timeout exceeded") + + // ErrSessionExpiredAbsolute: the session's absolute_expires_at is + // in the past. HTTP 401. + ErrSessionExpiredAbsolute = errors.New("session: absolute timeout exceeded") + + // ErrSessionRevoked: the session row's revoked_at is set. HTTP 401. + ErrSessionRevoked = errors.New("session: revoked") + + // ErrSigningKeyNotFound: the cookie's signing_key_id doesn't match + // any row in session_signing_keys (forged cookie OR fully-purged + // retired key). HTTP 401. + ErrSigningKeyNotFound = errors.New("session: signing key not found") + + // ErrSigningKeyRetired: the cookie's signing_key_id is retired and + // past the retention window. HTTP 401. + ErrSigningKeyRetired = errors.New("session: signing key retired beyond retention window") + + // ErrCSRFMissing: the X-CSRF-Token header is empty on a state- + // changing request. HTTP 403. + ErrCSRFMissing = errors.New("session: CSRF token missing") + + // ErrCSRFMismatch: the X-CSRF-Token header doesn't match the + // session row's hash. HTTP 403. + ErrCSRFMismatch = errors.New("session: CSRF token mismatch") + + // ErrSessionIPMismatch: the configured CERTCTL_SESSION_BIND_IP gate + // rejected the request because the client IP doesn't match the + // session row's recorded IP. HTTP 401, audit row, session NOT + // auto-revoked (user may have legitimate IP change). + ErrSessionIPMismatch = errors.New("session: client IP does not match session-bound IP") + + // ErrSessionUAMismatch: same shape as ErrSessionIPMismatch for the + // optional CERTCTL_SESSION_BIND_USER_AGENT gate. + ErrSessionUAMismatch = errors.New("session: User-Agent does not match session-bound User-Agent") + + // ErrInitialSigningKeyMintFailed: EnsureInitialSigningKey could not + // mint a key (crypto/rand failure, encryption failure, repository + // failure). The server boot path treats this as fatal. + ErrInitialSigningKeyMintFailed = errors.New("session: initial signing key mint failed") +) + +// ============================================================================= +// Service collaborator interfaces — narrow projections of the Phase 2 +// repositories so unit tests can stub without the full DB. +// ============================================================================= + +// SessionRepo is the slice of repository.SessionRepository the service +// consumes. Defining the projection here keeps the service decoupled +// from the wider repo surface. +type SessionRepo interface { + Create(ctx context.Context, s *sessiondomain.Session) error + Get(ctx context.Context, id string) (*sessiondomain.Session, error) + UpdateLastSeen(ctx context.Context, id string) error + UpdateCSRFTokenHash(ctx context.Context, id, csrfTokenHash string) error + Revoke(ctx context.Context, id string) error + RevokeAllForActor(ctx context.Context, actorID, actorType, tenantID string) error + GarbageCollectExpired(ctx context.Context) (int, error) +} + +// SigningKeyRepo is the slice of repository.SessionSigningKeyRepository +// the service consumes. +type SigningKeyRepo interface { + GetActive(ctx context.Context, tenantID string) (*sessiondomain.SessionSigningKey, error) + Get(ctx context.Context, id string) (*sessiondomain.SessionSigningKey, error) + Add(ctx context.Context, k *sessiondomain.SessionSigningKey) error + Retire(ctx context.Context, id string) error + List(ctx context.Context, tenantID string) ([]*sessiondomain.SessionSigningKey, error) + Delete(ctx context.Context, id string) error +} + +// AuditRecorder is the slice of service.AuditService the session +// service uses. Every audit row this service emits carries +// event_category=auth (Phase 8 contract). +type AuditRecorder interface { + RecordEventWithCategory(ctx context.Context, actor string, actorType domain.ActorType, action, eventCategory, resourceType, resourceID string, details map[string]interface{}) error +} + +// ============================================================================= +// Service. +// ============================================================================= + +// Service implements the session lifecycle. Construct via NewService. +type Service struct { + sessions SessionRepo + keys SigningKeyRepo + audit AuditRecorder + tenantID string + cfg Config + encryption string + + // clockNow is injectable for tests; defaults to time.Now. + clockNow func() time.Time + + // readRand is injectable for tests; defaults to crypto/rand.Read. + // Wraps crypto/rand so EnsureInitialSigningKey + Create + RotateCSRFToken + // can be exercised against a deterministic-failure RNG. + readRand func([]byte) (int, error) +} + +// Config bundles the operator-tunable knobs Phase 4 exposes via +// CERTCTL_SESSION_* env vars. internal/config/config.go owns the +// env-binding + defaulting; this package owns the consumption. +type Config struct { + // IdleTimeout: maximum time between requests on a single session + // before re-auth is required. Default 1h. Wire: CERTCTL_SESSION_IDLE_TIMEOUT. + IdleTimeout time.Duration + + // AbsoluteTimeout: maximum lifetime of a session regardless of + // activity. Default 8h. Wire: CERTCTL_SESSION_ABSOLUTE_TIMEOUT. + AbsoluteTimeout time.Duration + + // SigningKeyRetention: time a retired signing key stays valid for + // verification before being purged. Default 24h. Wire: + // CERTCTL_SESSION_SIGNING_KEY_RETENTION. + SigningKeyRetention time.Duration + + // BindIP: when true, Validate compares the request's client IP to + // the session row's recorded IP. Default false. Mobile + corporate- + // NAT environments leave this off. Wire: CERTCTL_SESSION_BIND_IP. + BindIP bool + + // BindUserAgent: when true, Validate compares the request's User- + // Agent to the session row's recorded UA. Default false. Wire: + // CERTCTL_SESSION_BIND_USER_AGENT. + BindUserAgent bool +} + +// DefaultConfig returns the Phase 4 defaults. cmd/server/main.go +// merges CERTCTL_SESSION_* env vars over these. +func DefaultConfig() Config { + return Config{ + IdleTimeout: 1 * time.Hour, + AbsoluteTimeout: 8 * time.Hour, + SigningKeyRetention: 24 * time.Hour, + BindIP: false, + BindUserAgent: false, + } +} + +// NewService constructs a session Service. +// +// encryptionKey is the CERTCTL_CONFIG_ENCRYPTION_KEY value used to +// decrypt SessionSigningKey.KeyMaterialEncrypted blobs. Required in +// production; tests may pass empty (the v3 blob path falls back via +// internal/crypto/encryption.go's plaintext-passthrough behavior when +// the blob is short-circuited via the test-only NewService variant — +// see service_test.go's helpers). +// +// audit may be nil in test setups that don't care about audit rows; +// production wires *service.AuditService from cmd/server/main.go. +func NewService( + sessions SessionRepo, + keys SigningKeyRepo, + audit AuditRecorder, + tenantID string, + cfg Config, + encryptionKey string, +) *Service { + return &Service{ + sessions: sessions, + keys: keys, + audit: audit, + tenantID: tenantID, + cfg: cfg, + encryption: encryptionKey, + clockNow: time.Now, + readRand: cryptorand.Read, + } +} + +// SetClockForTest replaces the clock used for expiry calculations. +// ONLY for tests; production reads time.Now via the default seam. +func (s *Service) SetClockForTest(now func() time.Time) { + s.clockNow = now +} + +// SetRandReaderForTest replaces the entropy source. ONLY for tests; +// production reads crypto/rand via the default seam. +func (s *Service) SetRandReaderForTest(r func([]byte) (int, error)) { + s.readRand = r +} + +// ============================================================================= +// Create + cookie minting. +// ============================================================================= + +// CreateResult is the post-login session payload. The handler sets +// the cookies + redirects. +type CreateResult struct { + Session *sessiondomain.Session + CookieValue string // certctl_session cookie body (`v1.ses-XX.sk-YY.HMAC`) + CSRFToken string // certctl_csrf cookie body (32 random bytes b64url) +} + +// Create mints a new post-login session row, signs the cookie value, +// and returns both the session-cookie payload and the CSRF token +// plaintext. The handler: +// - Sets `certctl_session` HttpOnly Secure SameSite=Lax(or Strict) Path=/ +// to CookieValue with Expires=session.AbsoluteExpiresAt. +// - Sets `certctl_csrf` Secure SameSite=Lax(or Strict) Path=/ HttpOnly=false +// to CSRFToken with Expires=session.AbsoluteExpiresAt. +func (s *Service) Create(ctx context.Context, actorID, actorType, ip, userAgent string) (*CreateResult, error) { + if strings.TrimSpace(actorID) == "" { + return nil, fmt.Errorf("session: actor_id is required") + } + if strings.TrimSpace(actorType) == "" { + return nil, fmt.Errorf("session: actor_type is required") + } + + active, err := s.keys.GetActive(ctx, s.tenantID) + if err != nil { + return nil, fmt.Errorf("session: get active signing key: %w", err) + } + hmacKey, err := decryptKeyMaterial(active.KeyMaterialEncrypted, s.encryption) + if err != nil { + return nil, fmt.Errorf("session: decrypt active key material: %w", err) + } + + sessionID, err := s.newOpaqueID("ses-") + if err != nil { + return nil, fmt.Errorf("session: generate session id: %w", err) + } + + csrfToken, err := s.newCSRFToken() + if err != nil { + return nil, fmt.Errorf("session: generate csrf token: %w", err) + } + + now := s.clockNow().UTC() + row := &sessiondomain.Session{ + ID: sessionID, + ActorID: actorID, + ActorType: actorType, + SigningKeyID: active.ID, + IsPreLogin: false, + CSRFTokenHash: hashCSRFToken(csrfToken), + IdleExpiresAt: now.Add(s.cfg.IdleTimeout), + AbsoluteExpiresAt: now.Add(s.cfg.AbsoluteTimeout), + CreatedAt: now, + LastSeenAt: now, + IPAddress: ip, + UserAgent: userAgent, + TenantID: s.tenantID, + } + if verr := row.Validate(); verr != nil { + return nil, fmt.Errorf("session: validate row: %w", verr) + } + if cerr := s.sessions.Create(ctx, row); cerr != nil { + return nil, fmt.Errorf("session: create row: %w", cerr) + } + + cookieValue := signCookie(row.ID, row.SigningKeyID, hmacKey) + + return &CreateResult{ + Session: row, + CookieValue: cookieValue, + CSRFToken: csrfToken, + }, nil +} + +// ============================================================================= +// Validate. +// ============================================================================= + +// ValidateInput bundles the data Validate needs from the HTTP request. +// The handler builds it from the session cookie, request IP, and +// User-Agent header. +type ValidateInput struct { + CookieValue string + ClientIP string + UserAgent string +} + +// Validate verifies the cookie's signature, looks up the session row, +// and enforces idle + absolute expiry, revocation, optional IP/UA +// binding. Returns the session on success; one of the package-scoped +// sentinels on failure. +// +// Note: Validate does NOT call UpdateLastSeen — the middleware does +// that explicitly so the test surface stays unambiguous about side +// effects under the read path. +func (s *Service) Validate(ctx context.Context, in ValidateInput) (*sessiondomain.Session, error) { + sessionID, signingKeyID, providedHMAC, err := parseCookie(in.CookieValue) + if err != nil { + return nil, ErrSessionInvalidCookie + } + + signingKey, err := s.keys.Get(ctx, signingKeyID) + if err != nil { + return nil, ErrSigningKeyNotFound + } + + now := s.clockNow().UTC() + + // Retired key still in retention window is OK; past retention is not. + if signingKey.RetiredAt != nil { + retentionExpiresAt := signingKey.RetiredAt.Add(s.cfg.SigningKeyRetention) + if now.After(retentionExpiresAt) { + return nil, ErrSigningKeyRetired + } + } + + hmacKey, err := decryptKeyMaterial(signingKey.KeyMaterialEncrypted, s.encryption) + if err != nil { + return nil, ErrSessionInvalidCookie + } + + expectedHMAC := computeHMAC(sessionID, signingKeyID, hmacKey) + if subtle.ConstantTimeCompare(expectedHMAC, providedHMAC) != 1 { + return nil, ErrSessionInvalidCookie + } + + row, err := s.sessions.Get(ctx, sessionID) + if err != nil { + return nil, ErrSessionInvalidCookie + } + + if row.RevokedAt != nil { + return nil, ErrSessionRevoked + } + + // Absolute expiry: hard cap regardless of activity. + if !now.Before(row.AbsoluteExpiresAt) { + return nil, ErrSessionExpiredAbsolute + } + + // Idle expiry: re-evaluated against last_seen_at + idle window. + idleDeadline := row.LastSeenAt.Add(s.cfg.IdleTimeout) + if !now.Before(idleDeadline) { + return nil, ErrSessionExpiredIdle + } + + // Optional defense-in-depth IP / UA binding. + if s.cfg.BindIP && in.ClientIP != "" && row.IPAddress != "" && in.ClientIP != row.IPAddress { + s.recordAudit(ctx, "auth.session_ip_mismatch", row.ActorID, domain.ActorType(row.ActorType), row.ID, + map[string]interface{}{"session_id": row.ID, "expected_ip": row.IPAddress, "request_ip": in.ClientIP}) + return nil, ErrSessionIPMismatch + } + if s.cfg.BindUserAgent && in.UserAgent != "" && row.UserAgent != "" && in.UserAgent != row.UserAgent { + s.recordAudit(ctx, "auth.session_ua_mismatch", row.ActorID, domain.ActorType(row.ActorType), row.ID, + map[string]interface{}{"session_id": row.ID}) + return nil, ErrSessionUAMismatch + } + + return row, nil +} + +// ValidateCSRF compares the SHA-256 of the X-CSRF-Token header against +// the session row's stored hash. Constant-time-compares to defeat +// timing attacks. Empty header → ErrCSRFMissing. +func (s *Service) ValidateCSRF(headerValue string, sess *sessiondomain.Session) error { + if strings.TrimSpace(headerValue) == "" { + return ErrCSRFMissing + } + if sess == nil || sess.CSRFTokenHash == "" { + return ErrCSRFMismatch + } + provided := hashCSRFToken(headerValue) + if subtle.ConstantTimeCompare([]byte(provided), []byte(sess.CSRFTokenHash)) != 1 { + return ErrCSRFMismatch + } + return nil +} + +// UpdateLastSeen advances the session's last_seen_at to now. Called by +// the middleware on every authenticated request to keep the idle-expiry +// sliding window fresh. +func (s *Service) UpdateLastSeen(ctx context.Context, sessionID string) error { + if err := s.sessions.UpdateLastSeen(ctx, sessionID); err != nil { + return fmt.Errorf("session: update_last_seen: %w", err) + } + return nil +} + +// ============================================================================= +// Revoke + RevokeAllForActor + RotateCSRFToken. +// ============================================================================= + +// Revoke sets revoked_at on the session row. Idempotent at the repo +// layer (re-revoking is a no-op). Subsequent Validate returns +// ErrSessionRevoked. +func (s *Service) Revoke(ctx context.Context, sessionID string) error { + if err := s.sessions.Revoke(ctx, sessionID); err != nil { + return fmt.Errorf("session: revoke: %w", err) + } + s.recordAudit(ctx, "auth.session_revoked", "system", domain.ActorTypeSystem, sessionID, + map[string]interface{}{"session_id": sessionID}) + return nil +} + +// RevokeAllForActor sets revoked_at on every active session for the +// (actorID, actorType, tenantID) tuple. Used on role change, fired- +// employee scenarios, and the back-channel logout endpoint (Phase 5). +func (s *Service) RevokeAllForActor(ctx context.Context, actorID, actorType string) error { + if err := s.sessions.RevokeAllForActor(ctx, actorID, actorType, s.tenantID); err != nil { + return fmt.Errorf("session: revoke_all_for_actor: %w", err) + } + s.recordAudit(ctx, "auth.sessions_revoked_for_actor", actorID, domain.ActorType(actorType), actorID, + map[string]interface{}{"actor_id": actorID, "actor_type": actorType}) + return nil +} + +// RotateCSRFToken mints a fresh CSRF token, persists its SHA-256 hash +// on the session row, and returns the plaintext for the handler to +// re-emit in the certctl_csrf cookie. Called on: +// +// - Login completion (Service.Create already mints a token; explicit +// rotation here is for follow-up calls). +// - Logout (defense-in-depth even though the session is revoked). +// - Any actor-role mutation against this actor. +// - Explicit operator-triggered "rotate CSRF" admin endpoint. +func (s *Service) RotateCSRFToken(ctx context.Context, sessionID string) (string, error) { + csrfToken, err := s.newCSRFToken() + if err != nil { + return "", fmt.Errorf("session: generate csrf token: %w", err) + } + hash := hashCSRFToken(csrfToken) + if uerr := s.sessions.UpdateCSRFTokenHash(ctx, sessionID, hash); uerr != nil { + return "", fmt.Errorf("session: update csrf hash: %w", uerr) + } + s.recordAudit(ctx, "auth.session_csrf_rotated", "system", domain.ActorTypeSystem, sessionID, + map[string]interface{}{"session_id": sessionID}) + return csrfToken, nil +} + +// ============================================================================= +// Signing-key lifecycle. +// ============================================================================= + +// RotateSigningKey mints a fresh 32-byte HMAC key, persists it as the +// new active key, and retires the previously-active key. The retired +// key stays valid for verification during cfg.SigningKeyRetention so +// existing cookies don't immediately fail; the GarbageCollect sweep +// purges it after the retention window passes (and after no sessions +// reference it). +func (s *Service) RotateSigningKey(ctx context.Context) error { + currentActive, err := s.keys.GetActive(ctx, s.tenantID) + if err != nil { + // No active key at all: this is a bootstrap-not-yet-run state; + // EnsureInitialSigningKey is the right entrypoint. + return fmt.Errorf("session: get active for rotate: %w", err) + } + + newID, err := s.newOpaqueID("sk-") + if err != nil { + return fmt.Errorf("session: generate signing key id: %w", err) + } + newPlaintext, err := s.newKeyMaterial() + if err != nil { + return fmt.Errorf("session: generate signing key material: %w", err) + } + newCiphertext, err := encryptKeyMaterial(newPlaintext, s.encryption) + if err != nil { + return fmt.Errorf("session: encrypt signing key material: %w", err) + } + + newKey := &sessiondomain.SessionSigningKey{ + ID: newID, + TenantID: s.tenantID, + KeyMaterialEncrypted: newCiphertext, + } + if verr := newKey.Validate(); verr != nil { + return fmt.Errorf("session: validate new key: %w", verr) + } + if aerr := s.keys.Add(ctx, newKey); aerr != nil { + return fmt.Errorf("session: add new signing key: %w", aerr) + } + + if rerr := s.keys.Retire(ctx, currentActive.ID); rerr != nil { + return fmt.Errorf("session: retire previous active key: %w", rerr) + } + + s.recordAudit(ctx, "auth.session_signing_key_rotated", "system", domain.ActorTypeSystem, newID, + map[string]interface{}{"new_key_id": newID, "retired_key_id": currentActive.ID}) + return nil +} + +// EnsureInitialSigningKey is idempotent: if a non-retired signing key +// exists for the tenant, it returns nil. Otherwise it mints a fresh +// 32-byte key, persists it, and emits an +// auth.session_signing_key_bootstrap audit row with event_category=auth. +// +// Production wires this into cmd/server/main.go startup AFTER +// migrations + RBAC backfill, BEFORE the HTTP listener binds. Failure +// is fatal — the server refuses to boot rather than serve session-less. +func (s *Service) EnsureInitialSigningKey(ctx context.Context) error { + _, err := s.keys.GetActive(ctx, s.tenantID) + if err == nil { + return nil // a key already exists; idempotent no-op. + } + + // Any error other than "not found" should bubble; the boot loader + // fails fatal regardless, but distinguishing repo-error from + // no-row-yet is useful in logs. + if !errors.Is(err, repository.ErrSessionSigningKeyNotFound) { + return fmt.Errorf("session: probe active signing key: %w", err) + } + + newID, err := s.newOpaqueID("sk-") + if err != nil { + return fmt.Errorf("%w: %v", ErrInitialSigningKeyMintFailed, err) + } + plaintext, err := s.newKeyMaterial() + if err != nil { + return fmt.Errorf("%w: %v", ErrInitialSigningKeyMintFailed, err) + } + ciphertext, err := encryptKeyMaterial(plaintext, s.encryption) + if err != nil { + return fmt.Errorf("%w: %v", ErrInitialSigningKeyMintFailed, err) + } + + k := &sessiondomain.SessionSigningKey{ + ID: newID, + TenantID: s.tenantID, + KeyMaterialEncrypted: ciphertext, + } + if verr := k.Validate(); verr != nil { + return fmt.Errorf("%w: validate: %v", ErrInitialSigningKeyMintFailed, verr) + } + if aerr := s.keys.Add(ctx, k); aerr != nil { + return fmt.Errorf("%w: persist: %v", ErrInitialSigningKeyMintFailed, aerr) + } + + s.recordAudit(ctx, "auth.session_signing_key_bootstrap", "system", domain.ActorTypeSystem, newID, + map[string]interface{}{"key_id": newID}) + return nil +} + +// ============================================================================= +// GarbageCollect. +// ============================================================================= + +// GarbageCollect runs one sweep: +// - Deletes sessions whose absolute_expires_at is in the past +// (post-login expired) AND pre-login rows older than 10 minutes +// (delegated to the repo's GarbageCollectExpired). +// - Deletes signing keys whose retired_at + retention window has +// passed AND that are not still referenced by sessions (the FK +// ON DELETE RESTRICT in the schema is the safety net; we attempt +// and ignore ErrSessionSigningKeyInUse). +// +// Wired into the scheduler's sessionGCLoop on a CERTCTL_SESSION_GC_INTERVAL +// tick (default 1h). Returns the count of session rows deleted. +func (s *Service) GarbageCollect(ctx context.Context) (int, error) { + deleted, err := s.sessions.GarbageCollectExpired(ctx) + if err != nil { + return 0, fmt.Errorf("session: gc expired sessions: %w", err) + } + + // Sweep retired-and-expired signing keys. Best-effort; in-use keys + // (FK reference) are skipped by the repo's ErrSessionSigningKeyInUse + // return. + keys, listErr := s.keys.List(ctx, s.tenantID) + if listErr != nil { + // Listing failed but we already deleted sessions; return the + // session count + the list error so the operator sees both. + return deleted, fmt.Errorf("session: gc list keys: %w", listErr) + } + now := s.clockNow().UTC() + for _, k := range keys { + if k.RetiredAt == nil { + continue + } + if !now.After(k.RetiredAt.Add(s.cfg.SigningKeyRetention)) { + continue + } + if derr := s.keys.Delete(ctx, k.ID); derr != nil { + // In-use keys (sessions still reference) are kept; any other + // error short-circuits to surface it. + if errors.Is(derr, repository.ErrSessionSigningKeyInUse) { + continue + } + return deleted, fmt.Errorf("session: gc delete signing key %s: %w", k.ID, derr) + } + } + return deleted, nil +} + +// ============================================================================= +// Helpers. +// ============================================================================= + +// signCookie returns the wire-format session cookie value: +// `v1...`. +func signCookie(sessionID, signingKeyID string, hmacKey []byte) string { + mac := computeHMAC(sessionID, signingKeyID, hmacKey) + return fmt.Sprintf("%s.%s.%s.%s", + sessiondomain.CookieFormatVersion, + sessionID, + signingKeyID, + base64.RawURLEncoding.EncodeToString(mac), + ) +} + +// computeHMAC returns the HMAC-SHA256 over the LENGTH-PREFIXED +// canonical input +// +// len(sessionID) || ":" || sessionID || ":" || len(signingKeyID) || ":" || signingKeyID +// +// where len(...) is the ASCII decimal byte-length. The length prefix +// is load-bearing: without it, `` and `` produce +// identical input and a forger could swap one byte across the boundary. +func computeHMAC(sessionID, signingKeyID string, hmacKey []byte) []byte { + mac := hmac.New(sha256.New, hmacKey) + mac.Write([]byte(strconv.Itoa(len(sessionID)))) + mac.Write([]byte(":")) + mac.Write([]byte(sessionID)) + mac.Write([]byte(":")) + mac.Write([]byte(strconv.Itoa(len(signingKeyID)))) + mac.Write([]byte(":")) + mac.Write([]byte(signingKeyID)) + return mac.Sum(nil) +} + +// parseCookie splits the wire format and returns the three identifying +// parts plus the decoded HMAC. Any format/version/decode failure +// returns an error; the caller maps to ErrSessionInvalidCookie without +// surfacing which check failed (no information leak). +func parseCookie(cookieValue string) (sessionID, signingKeyID string, hmacBytes []byte, err error) { + if cookieValue == "" { + return "", "", nil, errors.New("empty cookie") + } + parts := strings.Split(cookieValue, ".") + if len(parts) != 4 { + return "", "", nil, errors.New("expected 4 segments") + } + if parts[0] != sessiondomain.CookieFormatVersion { + return "", "", nil, errors.New("unsupported version prefix") + } + if !strings.HasPrefix(parts[1], "ses-") { + return "", "", nil, errors.New("session id missing prefix") + } + if !strings.HasPrefix(parts[2], "sk-") { + return "", "", nil, errors.New("signing key id missing prefix") + } + mac, derr := base64.RawURLEncoding.DecodeString(parts[3]) + if derr != nil { + return "", "", nil, fmt.Errorf("hmac base64: %w", derr) + } + if len(mac) != sha256.Size { + return "", "", nil, errors.New("hmac length") + } + return parts[1], parts[2], mac, nil +} + +// hashCSRFToken returns the lowercase-hex SHA-256 of the plaintext +// CSRF token. The session row stores this hash; the cookie holds the +// plaintext. +func hashCSRFToken(plaintext string) string { + h := sha256.Sum256([]byte(plaintext)) + return hex.EncodeToString(h[:]) +} + +// newOpaqueID returns prefix + base64url-no-pad of 16 random bytes. +// 128 bits of entropy is sufficient against guessing for both session +// ids and signing-key ids in any realistic deployment. +func (s *Service) newOpaqueID(prefix string) (string, error) { + b := make([]byte, 16) + if _, err := s.readRand(b); err != nil { + return "", err + } + return prefix + base64.RawURLEncoding.EncodeToString(b), nil +} + +// newCSRFToken returns base64url-no-pad of 32 random bytes (~256 bits +// of entropy). Plaintext goes in the certctl_csrf cookie; SHA-256 +// hash goes on the session row. +func (s *Service) newCSRFToken() (string, error) { + b := make([]byte, 32) + if _, err := s.readRand(b); err != nil { + return "", err + } + return base64.RawURLEncoding.EncodeToString(b), nil +} + +// newKeyMaterial returns 32 raw random bytes for use as an HMAC-SHA256 +// key. crypto/rand is the source. +func (s *Service) newKeyMaterial() ([]byte, error) { + b := make([]byte, 32) + if _, err := s.readRand(b); err != nil { + return nil, err + } + return b, nil +} + +// recordAudit is a thin wrapper around s.audit.RecordEventWithCategory +// that swallows audit-layer errors (the audit row is best-effort; a +// failed audit must not block a successful session operation). The +// Phase 8 contract is event_category=auth for everything in this +// service. +func (s *Service) recordAudit(ctx context.Context, action, actor string, actorType domain.ActorType, resourceID string, details map[string]interface{}) { + if s.audit == nil { + return + } + _ = s.audit.RecordEventWithCategory(ctx, actor, actorType, action, + "auth", "session", resourceID, details) +} diff --git a/internal/auth/session/service_test.go b/internal/auth/session/service_test.go new file mode 100644 index 0000000..ac1b6a6 --- /dev/null +++ b/internal/auth/session/service_test.go @@ -0,0 +1,1107 @@ +package session + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "errors" + "fmt" + "strings" + "sync" + "testing" + "time" + + sessiondomain "github.com/certctl-io/certctl/internal/auth/session/domain" + "github.com/certctl-io/certctl/internal/domain" + "github.com/certctl-io/certctl/internal/repository" +) + +// ============================================================================= +// In-memory stubs for SessionRepo + SigningKeyRepo + AuditRecorder. +// +// These are deliberately tiny and test-only. The Phase 2 integration tests +// (under internal/repository/postgres/) cover the SQL layer; here we only +// care about the service-layer state machine. +// ============================================================================= + +type stubSessionRepo struct { + mu sync.Mutex + rows map[string]*sessiondomain.Session + createErr error + getErr error + updateLastErr error + updateCSRFErr error + revokeErr error + revokeAllErr error + gcErr error + gcCount int + gcCalls int +} + +func newStubSessionRepo() *stubSessionRepo { + return &stubSessionRepo{rows: make(map[string]*sessiondomain.Session)} +} + +func (r *stubSessionRepo) Create(_ context.Context, s *sessiondomain.Session) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.createErr != nil { + return r.createErr + } + clone := *s + r.rows[s.ID] = &clone + return nil +} + +func (r *stubSessionRepo) Get(_ context.Context, id string) (*sessiondomain.Session, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.getErr != nil { + return nil, r.getErr + } + row, ok := r.rows[id] + if !ok { + return nil, repository.ErrSessionNotFound + } + clone := *row + return &clone, nil +} + +func (r *stubSessionRepo) UpdateLastSeen(_ context.Context, id string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.updateLastErr != nil { + return r.updateLastErr + } + row, ok := r.rows[id] + if !ok { + return repository.ErrSessionNotFound + } + row.LastSeenAt = time.Now().UTC() + return nil +} + +func (r *stubSessionRepo) UpdateCSRFTokenHash(_ context.Context, id, csrfTokenHash string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.updateCSRFErr != nil { + return r.updateCSRFErr + } + row, ok := r.rows[id] + if !ok { + return repository.ErrSessionNotFound + } + row.CSRFTokenHash = csrfTokenHash + return nil +} + +func (r *stubSessionRepo) Revoke(_ context.Context, id string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.revokeErr != nil { + return r.revokeErr + } + row, ok := r.rows[id] + if !ok { + return repository.ErrSessionNotFound + } + now := time.Now().UTC() + row.RevokedAt = &now + return nil +} + +func (r *stubSessionRepo) RevokeAllForActor(_ context.Context, actorID, actorType, _ string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.revokeAllErr != nil { + return r.revokeAllErr + } + now := time.Now().UTC() + for _, row := range r.rows { + if row.ActorID == actorID && row.ActorType == actorType && row.RevokedAt == nil { + row.RevokedAt = &now + } + } + return nil +} + +func (r *stubSessionRepo) GarbageCollectExpired(_ context.Context) (int, error) { + r.mu.Lock() + defer r.mu.Unlock() + r.gcCalls++ + if r.gcErr != nil { + return 0, r.gcErr + } + return r.gcCount, nil +} + +type stubKeyRepo struct { + mu sync.Mutex + keys map[string]*sessiondomain.SessionSigningKey + addErr error + retireErr error + listErr error + deleteErr error + getErr error + getActErr error +} + +func newStubKeyRepo() *stubKeyRepo { + return &stubKeyRepo{keys: make(map[string]*sessiondomain.SessionSigningKey)} +} + +func (r *stubKeyRepo) GetActive(_ context.Context, tenantID string) (*sessiondomain.SessionSigningKey, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.getActErr != nil { + return nil, r.getActErr + } + var newest *sessiondomain.SessionSigningKey + for _, k := range r.keys { + if k.TenantID != tenantID || k.RetiredAt != nil { + continue + } + if newest == nil || k.CreatedAt.After(newest.CreatedAt) { + newest = k + } + } + if newest == nil { + return nil, repository.ErrSessionSigningKeyNotFound + } + clone := *newest + return &clone, nil +} + +func (r *stubKeyRepo) Get(_ context.Context, id string) (*sessiondomain.SessionSigningKey, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.getErr != nil { + return nil, r.getErr + } + k, ok := r.keys[id] + if !ok { + return nil, repository.ErrSessionSigningKeyNotFound + } + clone := *k + return &clone, nil +} + +func (r *stubKeyRepo) Add(_ context.Context, k *sessiondomain.SessionSigningKey) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.addErr != nil { + return r.addErr + } + if k.CreatedAt.IsZero() { + k.CreatedAt = time.Now().UTC() + } + clone := *k + r.keys[k.ID] = &clone + return nil +} + +func (r *stubKeyRepo) Retire(_ context.Context, id string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.retireErr != nil { + return r.retireErr + } + k, ok := r.keys[id] + if !ok { + return repository.ErrSessionSigningKeyNotFound + } + if k.RetiredAt == nil { + now := time.Now().UTC() + k.RetiredAt = &now + } + return nil +} + +func (r *stubKeyRepo) List(_ context.Context, tenantID string) ([]*sessiondomain.SessionSigningKey, error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.listErr != nil { + return nil, r.listErr + } + var out []*sessiondomain.SessionSigningKey + for _, k := range r.keys { + if k.TenantID == tenantID { + clone := *k + out = append(out, &clone) + } + } + return out, nil +} + +func (r *stubKeyRepo) Delete(_ context.Context, id string) error { + r.mu.Lock() + defer r.mu.Unlock() + if r.deleteErr != nil { + return r.deleteErr + } + if _, ok := r.keys[id]; !ok { + return repository.ErrSessionSigningKeyNotFound + } + delete(r.keys, id) + return nil +} + +type stubAudit struct { + mu sync.Mutex + events []recordedAuditEvent +} + +type recordedAuditEvent struct { + Actor string + Type domain.ActorType + Action string + Category string + Resource string + Details map[string]interface{} +} + +func (a *stubAudit) RecordEventWithCategory(_ context.Context, actor string, actorType domain.ActorType, action, category, _, resourceID string, details map[string]interface{}) error { + a.mu.Lock() + defer a.mu.Unlock() + a.events = append(a.events, recordedAuditEvent{ + Actor: actor, Type: actorType, Action: action, Category: category, + Resource: resourceID, Details: details, + }) + return nil +} + +func (a *stubAudit) actions() []string { + a.mu.Lock() + defer a.mu.Unlock() + out := make([]string, len(a.events)) + for i, e := range a.events { + out[i] = e.Action + } + return out +} + +// ============================================================================= +// Test helpers. +// ============================================================================= + +const testTenant = "t-default" + +// newTestService returns a fully wired service (in-memory stubs) with a +// pre-seeded active signing key. encryptionKey is empty so the key blob +// is plaintext — sufficient for service-layer tests; the +// real-encryption round-trip lives in TestService_EncryptionRoundTrip. +func newTestService(t *testing.T, cfg Config) (*Service, *stubSessionRepo, *stubKeyRepo, *stubAudit, string) { + t.Helper() + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + audit := &stubAudit{} + svc := NewService(sessions, keys, audit, testTenant, cfg, "") + if err := svc.EnsureInitialSigningKey(context.Background()); err != nil { + t.Fatalf("EnsureInitialSigningKey: %v", err) + } + // Find the just-minted key id for tests that need it. + var keyID string + for id := range keys.keys { + keyID = id + } + return svc, sessions, keys, audit, keyID +} + +func defaultCfg() Config { + return Config{ + IdleTimeout: 1 * time.Hour, + AbsoluteTimeout: 8 * time.Hour, + SigningKeyRetention: 24 * time.Hour, + } +} + +// ============================================================================= +// Happy paths. +// ============================================================================= + +func TestService_Create_HappyPath(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, err := svc.Create(context.Background(), "u-alice", "User", "10.0.0.1", "Mozilla") + if err != nil { + t.Fatalf("Create: %v", err) + } + if res.Session.ID == "" || !strings.HasPrefix(res.Session.ID, "ses-") { + t.Errorf("session id missing or wrong prefix: %q", res.Session.ID) + } + if !strings.HasPrefix(res.CookieValue, "v1.") { + t.Errorf("cookie missing v1. prefix: %q", res.CookieValue) + } + if res.CSRFToken == "" { + t.Errorf("csrf token empty") + } + // Session row stored with hashed CSRF (not plaintext). + stored, _ := sessions.Get(context.Background(), res.Session.ID) + if stored.CSRFTokenHash == res.CSRFToken { + t.Errorf("CSRFTokenHash equals plaintext (must be SHA-256 hash)") + } + if hashCSRFToken(res.CSRFToken) != stored.CSRFTokenHash { + t.Errorf("CSRFTokenHash != SHA-256(plaintext)") + } +} + +func TestService_Validate_HappyPath_RoundTrip(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, err := svc.Create(context.Background(), "u-bob", "User", "10.0.0.2", "Firefox") + if err != nil { + t.Fatalf("Create: %v", err) + } + got, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue, ClientIP: "10.0.0.2", UserAgent: "Firefox"}) + if err != nil { + t.Fatalf("Validate: %v", err) + } + if got.ID != res.Session.ID { + t.Errorf("validated session id mismatch: got %s, want %s", got.ID, res.Session.ID) + } +} + +func TestService_ValidateCSRF_HappyPath(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-eve", "User", "", "") + if err := svc.ValidateCSRF(res.CSRFToken, res.Session); err != nil { + t.Errorf("ValidateCSRF (correct token): %v", err) + } +} + +func TestService_UpdateLastSeen_HappyPath(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-mike", "User", "", "") + original := sessions.rows[res.Session.ID].LastSeenAt + time.Sleep(2 * time.Millisecond) + if err := svc.UpdateLastSeen(context.Background(), res.Session.ID); err != nil { + t.Fatalf("UpdateLastSeen: %v", err) + } + if !sessions.rows[res.Session.ID].LastSeenAt.After(original) { + t.Errorf("LastSeenAt did not advance") + } +} + +// ============================================================================= +// Phase 4 spec — 15 negative cases. +// ============================================================================= + +// #1: Tampered cookie segment fails signature check. +// +// Note: we flip a byte NEAR THE START of the HMAC segment, not at the +// end. base64url-no-pad's trailing character carries only 2 bits of +// "real" data (43 chars * 6 bits = 258 bits but the SHA-256 output is +// 256 bits, so the bottom 2 bits of the last char are discarded by the +// decoder). Flipping the last char can decode to the same byte string +// even though the cookie text differs — which would make the test +// flaky against the production HMAC compare. Flipping near the start +// guarantees the decoded HMAC differs. +func TestService_Validate_TamperedCookieRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-tamper", "User", "", "") + parts := strings.Split(res.CookieValue, ".") + if len(parts[3]) < 4 { + t.Fatalf("hmac segment too short to tamper: %q", parts[3]) + } + // Flip char at index 1 of the HMAC segment to a value whose top 6 + // bits guaranteed-differ. 'A'<->'_' is a max-distance pair in + // base64url's alphabet. + pivot := byte('A') + if parts[3][1] == 'A' { + pivot = byte('_') + } + tamperedHMAC := []byte(parts[3]) + tamperedHMAC[1] = pivot + parts[3] = string(tamperedHMAC) + tampered := strings.Join(parts, ".") + if tampered == res.CookieValue { + t.Fatalf("tamper produced byte-identical cookie; test setup broken") + } + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: tampered}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +// #1b: Tampered SESSION_ID segment also fails. +func TestService_Validate_TamperedSessionIDRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-tamper2", "User", "", "") + parts := strings.Split(res.CookieValue, ".") + // Replace session id segment with a different (but well-formed) id; + // signature verification fails because HMAC was computed over the + // original session id. + parts[1] = "ses-DIFFERENT0000000000000000000" + tampered := strings.Join(parts, ".") + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: tampered}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +// #2: Cookie missing the v1. version prefix is rejected. +func TestService_Validate_MissingVersionPrefixRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-noprefix", "User", "", "") + parts := strings.SplitN(res.CookieValue, ".", 2) + bad := parts[1] // strip the "v1." prefix + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: bad}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +// #3: Unknown version prefix rejected — no fallback attempt. +func TestService_Validate_UnknownVersionPrefixRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-vbad", "User", "", "") + bad := "v99" + res.CookieValue[2:] // replace v1 with v99 + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: bad}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +// #4: Idle expiry returns ErrSessionExpiredIdle. +func TestService_Validate_ExpiredIdleRejected(t *testing.T) { + cfg := defaultCfg() + cfg.IdleTimeout = 1 * time.Millisecond + svc, sessions, _, _, _ := newTestService(t, cfg) + res, _ := svc.Create(context.Background(), "u-idle", "User", "", "") + // Reach into the row and back-date last_seen_at to defeat the idle window. + row := sessions.rows[res.Session.ID] + row.LastSeenAt = time.Now().UTC().Add(-1 * time.Hour) + row.IdleExpiresAt = time.Now().UTC().Add(-1 * time.Minute) + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionExpiredIdle) { + t.Errorf("err = %v; want ErrSessionExpiredIdle", err) + } +} + +// #5: Absolute expiry returns ErrSessionExpiredAbsolute. +func TestService_Validate_ExpiredAbsoluteRejected(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-abs", "User", "", "") + row := sessions.rows[res.Session.ID] + row.AbsoluteExpiresAt = time.Now().UTC().Add(-1 * time.Hour) + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionExpiredAbsolute) { + t.Errorf("err = %v; want ErrSessionExpiredAbsolute", err) + } +} + +// #6: Revoked session returns ErrSessionRevoked. +func TestService_Validate_RevokedRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-rev", "User", "", "") + if err := svc.Revoke(context.Background(), res.Session.ID); err != nil { + t.Fatalf("Revoke: %v", err) + } + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionRevoked) { + t.Errorf("err = %v; want ErrSessionRevoked", err) + } +} + +// #7: Cookie with a signing-key id that doesn't match any row -> ErrSigningKeyNotFound. +func TestService_Validate_WrongSigningKeyRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-wkey", "User", "", "") + parts := strings.Split(res.CookieValue, ".") + parts[2] = "sk-NONEXISTENT00000000000000000" + bad := strings.Join(parts, ".") + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: bad}) + if !errors.Is(err, ErrSigningKeyNotFound) { + t.Errorf("err = %v; want ErrSigningKeyNotFound", err) + } +} + +// #8: Cookie signed under a retired-but-in-retention key SUCCEEDS. +func TestService_Validate_RetiredButInRetentionAccepted(t *testing.T) { + svc, _, keys, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-ret", "User", "", "") + + // Mint a NEW active key; the previously-active key gets retired. + if err := svc.RotateSigningKey(context.Background()); err != nil { + t.Fatalf("RotateSigningKey: %v", err) + } + + // Confirm retired_at was set on the original key. + parts := strings.Split(res.CookieValue, ".") + old := keys.keys[parts[2]] + if old.RetiredAt == nil { + t.Fatalf("expected old key to be retired; RetiredAt is nil") + } + + // Cookie signed under the now-retired key still validates because it's + // inside the retention window. + got, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if err != nil { + t.Fatalf("Validate (retired-in-retention): %v", err) + } + if got.ID != res.Session.ID { + t.Errorf("session id mismatch") + } +} + +// #9: Cookie signed under a fully-purged-past-retention key FAILS. +func TestService_Validate_RetiredPastRetentionRejected(t *testing.T) { + cfg := defaultCfg() + cfg.SigningKeyRetention = 100 * time.Millisecond + svc, _, keys, _, _ := newTestService(t, cfg) + res, _ := svc.Create(context.Background(), "u-purg", "User", "", "") + + if err := svc.RotateSigningKey(context.Background()); err != nil { + t.Fatalf("RotateSigningKey: %v", err) + } + // Back-date retired_at to push the key past the retention window. + parts := strings.Split(res.CookieValue, ".") + old := keys.keys[parts[2]] + pastT := time.Now().UTC().Add(-1 * time.Hour) + old.RetiredAt = &pastT + + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSigningKeyRetired) { + t.Errorf("err = %v; want ErrSigningKeyRetired", err) + } +} + +// #10: Concatenation-collision attempt — the length-prefixed HMAC input +// MUST defeat `` claiming authority for ``. This test forges +// a cookie whose `` SUMS to the same byte sequence +// as the legitimate cookie's pair but slides the boundary by one character. +// Without the length prefix in computeHMAC the two would HMAC-collide; with +// the prefix they don't. +func TestService_Validate_ConcatenationCollisionDefeatedByLengthPrefix(t *testing.T) { + // Build the legitimate cookie under (sid="ses-ABC", kid="sk-XYZ"). + hmacKey := bytes32("test-key") + legit := signCookie("ses-ABC", "sk-XYZ", hmacKey) + + // Build the forged variant that slides the boundary one char to the + // right: (sid="ses-ABCs", kid="k-XYZ"). Same byte sequence pre-prefix; + // different lengths. + forgedRaw := signCookie("ses-ABCs", "k-XYZ", hmacKey) + forgedParts := strings.Split(forgedRaw, ".") + legitParts := strings.Split(legit, ".") + + // Direct evidence: the two HMACs MUST differ. + if forgedParts[3] == legitParts[3] { + t.Errorf("HMACs collided across boundary slide — length prefix is broken") + } + + // And: a cookie that uses the legit sid + kid + the FORGED hmac is + // rejected by parseCookie/HMAC-recompute path (the two segments + // of interest hash to different values). + forgedSwap := legitParts[0] + "." + legitParts[1] + "." + legitParts[2] + "." + forgedParts[3] + if forgedSwap == legit { + t.Fatalf("forged cookie is byte-identical to legit; concat-collision test setup broken") + } +} + +// #11: CSRF token missing on POST -> 403. +func TestService_ValidateCSRF_MissingHeaderRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-csrf1", "User", "", "") + if err := svc.ValidateCSRF("", res.Session); !errors.Is(err, ErrCSRFMissing) { + t.Errorf("err = %v; want ErrCSRFMissing", err) + } +} + +// #12: CSRF token mismatch -> 403; constant-time compare. +func TestService_ValidateCSRF_MismatchRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-csrf2", "User", "", "") + if err := svc.ValidateCSRF("a-totally-different-token", res.Session); !errors.Is(err, ErrCSRFMismatch) { + t.Errorf("err = %v; want ErrCSRFMismatch", err) + } +} + +// #13: IP-bind enabled + IP changed -> ErrSessionIPMismatch. +func TestService_Validate_IPBindMismatchRejected(t *testing.T) { + cfg := defaultCfg() + cfg.BindIP = true + svc, _, _, audit, _ := newTestService(t, cfg) + res, _ := svc.Create(context.Background(), "u-ipbind", "User", "10.0.0.1", "Firefox") + _, err := svc.Validate(context.Background(), ValidateInput{ + CookieValue: res.CookieValue, ClientIP: "10.0.0.99", UserAgent: "Firefox", + }) + if !errors.Is(err, ErrSessionIPMismatch) { + t.Errorf("err = %v; want ErrSessionIPMismatch", err) + } + if !contains(audit.actions(), "auth.session_ip_mismatch") { + t.Errorf("expected audit row auth.session_ip_mismatch; got %v", audit.actions()) + } +} + +// #14: UA-bind enabled + UA changed -> ErrSessionUAMismatch. +func TestService_Validate_UABindMismatchRejected(t *testing.T) { + cfg := defaultCfg() + cfg.BindUserAgent = true + svc, _, _, audit, _ := newTestService(t, cfg) + res, _ := svc.Create(context.Background(), "u-uabind", "User", "10.0.0.1", "Firefox") + _, err := svc.Validate(context.Background(), ValidateInput{ + CookieValue: res.CookieValue, ClientIP: "10.0.0.1", UserAgent: "Chrome", + }) + if !errors.Is(err, ErrSessionUAMismatch) { + t.Errorf("err = %v; want ErrSessionUAMismatch", err) + } + if !contains(audit.actions(), "auth.session_ua_mismatch") { + t.Errorf("expected audit row auth.session_ua_mismatch; got %v", audit.actions()) + } +} + +// #15: Initial-key bootstrap failure (RNG returns error) -> EnsureInitialSigningKey +// returns ErrInitialSigningKeyMintFailed; cmd/server/main.go wraps this as +// log.Fatal at boot. +func TestService_EnsureInitialSigningKey_RNGFailureSurfacesAsFatalSentinel(t *testing.T) { + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), "") + svc.SetRandReaderForTest(func(_ []byte) (int, error) { + return 0, fmt.Errorf("simulated entropy starvation") + }) + err := svc.EnsureInitialSigningKey(context.Background()) + if !errors.Is(err, ErrInitialSigningKeyMintFailed) { + t.Errorf("err = %v; want wrap of ErrInitialSigningKeyMintFailed", err) + } +} + +// ============================================================================= +// Coverage-lift batch — branches not exercised by the 15-case matrix. +// ============================================================================= + +func TestService_Create_RejectsEmptyActorID(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + if _, err := svc.Create(context.Background(), "", "User", "", ""); err == nil { + t.Errorf("expected error on empty actor_id") + } + if _, err := svc.Create(context.Background(), "u-x", "", "", ""); err == nil { + t.Errorf("expected error on empty actor_type") + } +} + +func TestService_Create_GetActiveError(t *testing.T) { + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + keys.getActErr = fmt.Errorf("simulated db error") + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), "") + if _, err := svc.Create(context.Background(), "u-x", "User", "", ""); err == nil { + t.Errorf("expected error on get-active failure") + } +} + +func TestService_Create_SessionRepoCreateError(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + sessions.createErr = fmt.Errorf("simulated db error") + if _, err := svc.Create(context.Background(), "u-x", "User", "", ""); err == nil { + t.Errorf("expected error on session-repo create failure") + } +} + +func TestService_Create_RNGFailureBubbles(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + svc.SetRandReaderForTest(func(_ []byte) (int, error) { + return 0, fmt.Errorf("simulated rng exhaustion") + }) + if _, err := svc.Create(context.Background(), "u-x", "User", "", ""); err == nil { + t.Errorf("expected RNG failure to surface") + } +} + +func TestService_RotateCSRFToken_HappyPath(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-rot", "User", "", "") + originalHash := sessions.rows[res.Session.ID].CSRFTokenHash + + newToken, err := svc.RotateCSRFToken(context.Background(), res.Session.ID) + if err != nil { + t.Fatalf("RotateCSRFToken: %v", err) + } + if newToken == res.CSRFToken { + t.Errorf("rotated token equals original (RNG broken)") + } + if sessions.rows[res.Session.ID].CSRFTokenHash == originalHash { + t.Errorf("session row hash didn't update after rotation") + } +} + +func TestService_RotateCSRFToken_UpdateError(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-rot2", "User", "", "") + sessions.updateCSRFErr = fmt.Errorf("simulated db error") + if _, err := svc.RotateCSRFToken(context.Background(), res.Session.ID); err == nil { + t.Errorf("expected error on UpdateCSRFTokenHash failure") + } +} + +func TestService_RevokeAllForActor_HappyPath(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res1, _ := svc.Create(context.Background(), "u-multi", "User", "", "") + res2, _ := svc.Create(context.Background(), "u-multi", "User", "", "") + if err := svc.RevokeAllForActor(context.Background(), "u-multi", "User"); err != nil { + t.Fatalf("RevokeAllForActor: %v", err) + } + if sessions.rows[res1.Session.ID].RevokedAt == nil { + t.Errorf("session 1 not revoked") + } + if sessions.rows[res2.Session.ID].RevokedAt == nil { + t.Errorf("session 2 not revoked") + } +} + +func TestService_RotateSigningKey_RetiresOldAndAddsNew(t *testing.T) { + svc, _, keys, _, oldID := newTestService(t, defaultCfg()) + if err := svc.RotateSigningKey(context.Background()); err != nil { + t.Fatalf("RotateSigningKey: %v", err) + } + old, _ := keys.Get(context.Background(), oldID) + if old.RetiredAt == nil { + t.Errorf("old key not retired") + } + active, _ := keys.GetActive(context.Background(), testTenant) + if active.ID == oldID { + t.Errorf("active key did not change") + } +} + +func TestService_EnsureInitialSigningKey_IdempotentOnExisting(t *testing.T) { + svc, _, keys, _, oldID := newTestService(t, defaultCfg()) + // Second call must be a no-op. + if err := svc.EnsureInitialSigningKey(context.Background()); err != nil { + t.Fatalf("EnsureInitialSigningKey (second call): %v", err) + } + all, _ := keys.List(context.Background(), testTenant) + if len(all) != 1 { + t.Errorf("expected idempotent (1 key); got %d", len(all)) + } + if all[0].ID != oldID { + t.Errorf("key id changed across idempotent calls") + } +} + +func TestService_EnsureInitialSigningKey_GetActiveErrorOtherThanNotFoundBubbles(t *testing.T) { + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + keys.getActErr = fmt.Errorf("simulated db error other than not-found") + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), "") + if err := svc.EnsureInitialSigningKey(context.Background()); err == nil { + t.Errorf("expected non-nil error from non-NotFound get-active") + } +} + +func TestService_EnsureInitialSigningKey_AddErrorWraps(t *testing.T) { + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + keys.addErr = fmt.Errorf("simulated insert failure") + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), "") + err := svc.EnsureInitialSigningKey(context.Background()) + if !errors.Is(err, ErrInitialSigningKeyMintFailed) { + t.Errorf("err = %v; want wrap of ErrInitialSigningKeyMintFailed", err) + } +} + +func TestService_GarbageCollect_HappyPath(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + sessions.gcCount = 7 + deleted, err := svc.GarbageCollect(context.Background()) + if err != nil { + t.Fatalf("GarbageCollect: %v", err) + } + if deleted != 7 { + t.Errorf("deleted = %d; want 7", deleted) + } +} + +func TestService_GarbageCollect_PurgesRetiredPastRetention(t *testing.T) { + cfg := defaultCfg() + cfg.SigningKeyRetention = 1 * time.Millisecond + svc, _, keys, _, oldID := newTestService(t, cfg) + if err := svc.RotateSigningKey(context.Background()); err != nil { + t.Fatalf("RotateSigningKey: %v", err) + } + // Back-date the retired_at so the GC sweep purges it. + pastT := time.Now().UTC().Add(-1 * time.Hour) + keys.keys[oldID].RetiredAt = &pastT + if _, err := svc.GarbageCollect(context.Background()); err != nil { + t.Fatalf("GarbageCollect: %v", err) + } + if _, err := keys.Get(context.Background(), oldID); !errors.Is(err, repository.ErrSessionSigningKeyNotFound) { + t.Errorf("old key still present after GC") + } +} + +func TestService_GarbageCollect_KeysListErrorPropagated(t *testing.T) { + svc, _, keys, _, _ := newTestService(t, defaultCfg()) + keys.listErr = fmt.Errorf("simulated list error") + if _, err := svc.GarbageCollect(context.Background()); err == nil { + t.Errorf("expected error on keys.List failure") + } +} + +func TestService_GarbageCollect_KeyInUseSkipped(t *testing.T) { + cfg := defaultCfg() + cfg.SigningKeyRetention = 1 * time.Millisecond + svc, _, keys, _, oldID := newTestService(t, cfg) + _ = svc.RotateSigningKey(context.Background()) + pastT := time.Now().UTC().Add(-1 * time.Hour) + keys.keys[oldID].RetiredAt = &pastT + keys.deleteErr = repository.ErrSessionSigningKeyInUse + if _, err := svc.GarbageCollect(context.Background()); err != nil { + t.Fatalf("GarbageCollect (in-use should be silently skipped): %v", err) + } +} + +func TestService_GarbageCollect_KeyDeleteOtherErrorBubbles(t *testing.T) { + cfg := defaultCfg() + cfg.SigningKeyRetention = 1 * time.Millisecond + svc, _, keys, _, oldID := newTestService(t, cfg) + _ = svc.RotateSigningKey(context.Background()) + pastT := time.Now().UTC().Add(-1 * time.Hour) + keys.keys[oldID].RetiredAt = &pastT + keys.deleteErr = fmt.Errorf("some other db error") + if _, err := svc.GarbageCollect(context.Background()); err == nil { + t.Errorf("expected error to bubble from non-InUse delete failure") + } +} + +func TestService_GarbageCollect_SessionRepoErrorBubbles(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + sessions.gcErr = fmt.Errorf("simulated session-gc failure") + if _, err := svc.GarbageCollect(context.Background()); err == nil { + t.Errorf("expected error to bubble from session-repo gc failure") + } +} + +func TestService_RotateSigningKey_GetActiveError(t *testing.T) { + svc, _, keys, _, _ := newTestService(t, defaultCfg()) + keys.getActErr = fmt.Errorf("simulated error") + if err := svc.RotateSigningKey(context.Background()); err == nil { + t.Errorf("expected error when getActive fails") + } +} + +func TestService_RotateSigningKey_AddError(t *testing.T) { + svc, _, keys, _, _ := newTestService(t, defaultCfg()) + keys.addErr = fmt.Errorf("simulated insert failure") + if err := svc.RotateSigningKey(context.Background()); err == nil { + t.Errorf("expected error when add fails") + } +} + +func TestService_RotateSigningKey_RetireError(t *testing.T) { + svc, _, keys, _, _ := newTestService(t, defaultCfg()) + keys.retireErr = fmt.Errorf("simulated retire failure") + if err := svc.RotateSigningKey(context.Background()); err == nil { + t.Errorf("expected error when retire fails") + } +} + +func TestService_Validate_SessionGetErrorMappedToInvalidCookie(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-y", "User", "", "") + sessions.getErr = fmt.Errorf("simulated session.Get failure") + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +func TestService_UpdateLastSeen_RepoErrorWraps(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-uls", "User", "", "") + sessions.updateLastErr = fmt.Errorf("simulated db error") + if err := svc.UpdateLastSeen(context.Background(), res.Session.ID); err == nil { + t.Errorf("expected error on UpdateLastSeen failure") + } +} + +func TestService_Revoke_RepoErrorWraps(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-rev2", "User", "", "") + sessions.revokeErr = fmt.Errorf("simulated db error") + if err := svc.Revoke(context.Background(), res.Session.ID); err == nil { + t.Errorf("expected error on Revoke failure") + } +} + +func TestService_RevokeAllForActor_RepoErrorWraps(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + sessions.revokeAllErr = fmt.Errorf("simulated db error") + if err := svc.RevokeAllForActor(context.Background(), "u-x", "User"); err == nil { + t.Errorf("expected error on RevokeAllForActor failure") + } +} + +func TestService_ValidateCSRF_NilSessionRejected(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + if err := svc.ValidateCSRF("anything", nil); !errors.Is(err, ErrCSRFMismatch) { + t.Errorf("err = %v; want ErrCSRFMismatch", err) + } +} + +func TestService_SetClockForTest_OverridesNow(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + frozen := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + svc.SetClockForTest(func() time.Time { return frozen }) + if got := svc.clockNow(); !got.Equal(frozen) { + t.Errorf("clock = %v; want %v", got, frozen) + } +} + +func TestService_DefaultConfig_HasPromptDefaults(t *testing.T) { + cfg := DefaultConfig() + if cfg.IdleTimeout != 1*time.Hour { + t.Errorf("IdleTimeout = %v; want 1h", cfg.IdleTimeout) + } + if cfg.AbsoluteTimeout != 8*time.Hour { + t.Errorf("AbsoluteTimeout = %v; want 8h", cfg.AbsoluteTimeout) + } + if cfg.SigningKeyRetention != 24*time.Hour { + t.Errorf("SigningKeyRetention = %v; want 24h", cfg.SigningKeyRetention) + } + if cfg.BindIP || cfg.BindUserAgent { + t.Errorf("Bind* defaults should be false; got IP=%v UA=%v", cfg.BindIP, cfg.BindUserAgent) + } +} + +func TestService_RotateCSRFToken_RNGFailureBubbles(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-rotrng", "User", "", "") + svc.SetRandReaderForTest(func(_ []byte) (int, error) { + return 0, fmt.Errorf("rng dead") + }) + if _, err := svc.RotateCSRFToken(context.Background(), res.Session.ID); err == nil { + t.Errorf("expected RNG-failure to surface from RotateCSRFToken") + } +} + +func TestService_RotateSigningKey_RNGFailureBubbles(t *testing.T) { + svc, _, _, _, _ := newTestService(t, defaultCfg()) + svc.SetRandReaderForTest(func(_ []byte) (int, error) { + return 0, fmt.Errorf("rng dead") + }) + if err := svc.RotateSigningKey(context.Background()); err == nil { + t.Errorf("expected RNG-failure to surface from RotateSigningKey") + } +} + +func TestService_Validate_DecryptKeyMaterialFailure(t *testing.T) { + // With a real encryption passphrase, an external mutation of the + // key blob causes Decrypt to fail; Validate maps to ErrSessionInvalidCookie. + const passphrase = "test-passphrase-decrypt-fail" + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), passphrase) + if err := svc.EnsureInitialSigningKey(context.Background()); err != nil { + t.Fatalf("EnsureInitialSigningKey: %v", err) + } + res, _ := svc.Create(context.Background(), "u-decfail", "User", "", "") + // Corrupt the stored ciphertext. + for _, k := range keys.keys { + k.KeyMaterialEncrypted = append([]byte("corrupt-prefix"), k.KeyMaterialEncrypted...) + } + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err = %v; want ErrSessionInvalidCookie", err) + } +} + +// ============================================================================= +// HMAC-input length-prefix correctness — direct unit test of computeHMAC. +// +// Without the length prefix, computeHMAC for ("abc","de") would equal +// computeHMAC for ("ab","cde"). With the prefix, it must not. +// ============================================================================= + +func TestComputeHMAC_LengthPrefixDefeatsConcatCollision(t *testing.T) { + key := bytes32("the-key") + a := computeHMAC("abc", "de", key) + b := computeHMAC("ab", "cde", key) + if base64.RawURLEncoding.EncodeToString(a) == base64.RawURLEncoding.EncodeToString(b) { + t.Errorf("computeHMAC(\"abc\",\"de\") == computeHMAC(\"ab\",\"cde\") — length prefix is broken") + } +} + +// ============================================================================= +// Encryption round-trip: sign + validate against a real CERTCTL_CONFIG_ENCRYPTION_KEY. +// ============================================================================= + +func TestService_EncryptionRoundTrip(t *testing.T) { + const passphrase = "test-encryption-passphrase-12345" + sessions := newStubSessionRepo() + keys := newStubKeyRepo() + svc := NewService(sessions, keys, nil, testTenant, defaultCfg(), passphrase) + if err := svc.EnsureInitialSigningKey(context.Background()); err != nil { + t.Fatalf("EnsureInitialSigningKey: %v", err) + } + res, err := svc.Create(context.Background(), "u-enc", "User", "", "") + if err != nil { + t.Fatalf("Create: %v", err) + } + got, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if err != nil { + t.Fatalf("Validate (real-encryption round trip): %v", err) + } + if got.ID != res.Session.ID { + t.Errorf("session id mismatch") + } +} + +// ============================================================================= +// Cookie parser unit tests. +// ============================================================================= + +func TestParseCookie_RejectsEmpty(t *testing.T) { + if _, _, _, err := parseCookie(""); err == nil { + t.Errorf("expected error for empty cookie") + } +} + +func TestParseCookie_RejectsWrongSegmentCount(t *testing.T) { + for _, bad := range []string{"v1", "v1.ses-x", "v1.ses-x.sk-y", "v1.ses-x.sk-y.h.extra"} { + if _, _, _, err := parseCookie(bad); err == nil { + t.Errorf("expected error for bad segment count: %q", bad) + } + } +} + +func TestParseCookie_RejectsMissingPrefixes(t *testing.T) { + mac := base64.RawURLEncoding.EncodeToString(make([]byte, sha256.Size)) + if _, _, _, err := parseCookie("v1.bad-id.sk-y." + mac); err == nil { + t.Errorf("expected error for session id missing prefix") + } + if _, _, _, err := parseCookie("v1.ses-x.bad-key." + mac); err == nil { + t.Errorf("expected error for signing key id missing prefix") + } +} + +func TestParseCookie_RejectsBadBase64(t *testing.T) { + if _, _, _, err := parseCookie("v1.ses-x.sk-y.!!!notbase64"); err == nil { + t.Errorf("expected error for bad base64 hmac segment") + } +} + +func TestParseCookie_RejectsWrongHMACLength(t *testing.T) { + short := base64.RawURLEncoding.EncodeToString([]byte("not-32-bytes")) + if _, _, _, err := parseCookie("v1.ses-x.sk-y." + short); err == nil { + t.Errorf("expected error for wrong-length hmac") + } +} + +// ============================================================================= +// Test helpers. +// ============================================================================= + +// bytes32 returns 32 bytes deterministically derived from seed (for HMAC-key +// material in unit tests). Production keys come from crypto/rand. +func bytes32(seed string) []byte { + h := sha256.Sum256([]byte(seed)) + return h[:] +} + +func contains(s []string, v string) bool { + for _, x := range s { + if x == v { + return true + } + } + return false +} diff --git a/internal/config/config.go b/internal/config/config.go index 4cccb16..9ee3c70 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1589,6 +1589,13 @@ type AuthConfig struct { // Setting: CERTCTL_AGENT_BOOTSTRAP_TOKEN environment variable. AgentBootstrapToken string + // Session holds the Auth Bundle 2 Phase 4 session-service tunables. + // Defaults are documented on the SessionConfig fields. The session + // service is wired into cmd/server/main.go alongside the OIDC + // service in Phase 5; pre-Phase-5 deployments that run with the + // legacy `api-key` auth type ignore this struct entirely. + Session SessionConfig + // BootstrapToken is the one-shot pre-shared secret that gates the // Bundle 1 Phase 6 bootstrap endpoint (POST /v1/auth/bootstrap). When // set at server startup AND no admin-roled actors exist, the @@ -1609,6 +1616,56 @@ type AuthConfig struct { BootstrapToken string } +// SessionConfig contains the Auth Bundle 2 Phase 4 session-service +// tunables. Every field is operator-overridable via the documented +// CERTCTL_SESSION_* env var; defaults are the conservative values from +// the Phase 4 spec. +// +// Bundle 2 Phase 4 / OWASP ASVS V3 (Session Management). The defaults +// (1h idle / 8h absolute / 24h key retention / 1h GC / Lax cookies / +// no IP-or-UA bind) are the conservative starting point that matches +// the prompt; tightening to Strict + IP/UA bind suits high-security +// environments at the cost of breaking inbound deep-links from external +// apps and login-from-mobile-on-cellular flows. +type SessionConfig struct { + // IdleTimeout: maximum time between authenticated requests on a + // session before re-auth is required. Default 1h. Wire: + // CERTCTL_SESSION_IDLE_TIMEOUT. + IdleTimeout time.Duration + + // AbsoluteTimeout: maximum lifetime of a session regardless of + // activity. Default 8h. Wire: CERTCTL_SESSION_ABSOLUTE_TIMEOUT. + AbsoluteTimeout time.Duration + + // SigningKeyRetention: time a retired signing key stays valid for + // verification before being purged from the keys table. Default + // 24h. Wire: CERTCTL_SESSION_SIGNING_KEY_RETENTION. + SigningKeyRetention time.Duration + + // GCInterval: scheduler tick interval for the session-GC sweep. + // Default 1h. Wire: CERTCTL_SESSION_GC_INTERVAL. + GCInterval time.Duration + + // SameSite: SameSite cookie attribute. Valid values: "Lax" + // (default) or "Strict". Strict is recommended for high-security + // environments at the cost of breaking inbound deep-links from + // external apps. Wire: CERTCTL_SESSION_SAMESITE. + SameSite string + + // BindIP: when true, the session middleware compares the request's + // client IP to the session row's recorded IP on every Validate. + // Mismatch -> 401, audit row, session NOT auto-revoked (user may + // have legitimate IP change). Default false. Wire: + // CERTCTL_SESSION_BIND_IP. + BindIP bool + + // BindUserAgent: when true, the session middleware compares the + // request's User-Agent to the session row's recorded UA on every + // Validate. Default false; useful only in tightly-controlled + // environments. Wire: CERTCTL_SESSION_BIND_USER_AGENT. + BindUserAgent bool +} + // RateLimitConfig contains rate limiting configuration. // // Bundle B / Audit M-025 (OWASP ASVS L2 §11.2.1): pre-bundle the rate @@ -1732,6 +1789,18 @@ func Load() (*Config, error) { // /v1/auth/bootstrap endpoint that mints the first admin // key. Empty = bootstrap endpoint disabled (default). BootstrapToken: getEnv("CERTCTL_BOOTSTRAP_TOKEN", ""), + // Bundle 2 Phase 4: session-service tunables. Defaults match + // the prompt; high-security deployments tighten via the env + // vars documented on SessionConfig fields. + Session: SessionConfig{ + IdleTimeout: getEnvDuration("CERTCTL_SESSION_IDLE_TIMEOUT", 1*time.Hour), + AbsoluteTimeout: getEnvDuration("CERTCTL_SESSION_ABSOLUTE_TIMEOUT", 8*time.Hour), + SigningKeyRetention: getEnvDuration("CERTCTL_SESSION_SIGNING_KEY_RETENTION", 24*time.Hour), + GCInterval: getEnvDuration("CERTCTL_SESSION_GC_INTERVAL", 1*time.Hour), + SameSite: getEnv("CERTCTL_SESSION_SAMESITE", "Lax"), + BindIP: getEnvBool("CERTCTL_SESSION_BIND_IP", false), + BindUserAgent: getEnvBool("CERTCTL_SESSION_BIND_USER_AGENT", false), + }, }, RateLimit: RateLimitConfig{ Enabled: getEnvBool("CERTCTL_RATE_LIMIT_ENABLED", true), diff --git a/internal/repository/postgres/session.go b/internal/repository/postgres/session.go index c6dd503..03b99fb 100644 --- a/internal/repository/postgres/session.go +++ b/internal/repository/postgres/session.go @@ -129,6 +129,21 @@ func (r *SessionRepository) UpdateLastSeen(ctx context.Context, id string) error return nil } +// UpdateCSRFTokenHash replaces csrf_token_hash on the named session. +// Phase 4's RotateCSRFToken consumes this on login completion, logout, +// and any actor-role mutation against this actor. +func (r *SessionRepository) UpdateCSRFTokenHash(ctx context.Context, id, csrfTokenHash string) error { + res, err := r.db.ExecContext(ctx, `UPDATE sessions SET csrf_token_hash = $2 WHERE id = $1`, id, csrfTokenHash) + if err != nil { + return fmt.Errorf("sessions update_csrf_token_hash: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return repository.ErrSessionNotFound + } + return nil +} + // Revoke sets revoked_at = NOW() for the named session. Idempotent: // re-revoking an already-revoked session is a no-op (returns nil). func (r *SessionRepository) Revoke(ctx context.Context, id string) error { diff --git a/internal/repository/session.go b/internal/repository/session.go index c15533c..75d4156 100644 --- a/internal/repository/session.go +++ b/internal/repository/session.go @@ -61,6 +61,12 @@ type SessionRepository interface { // idle-expiry sliding window fresh. UpdateLastSeen(ctx context.Context, id string) error + // UpdateCSRFTokenHash replaces the csrf_token_hash on the session + // row. Phase 4's RotateCSRFToken consumes this on login completion, + // logout, and any actor-role mutation against this actor. The hash + // is the SHA-256 hex of the operator-facing CSRF token plaintext. + UpdateCSRFTokenHash(ctx context.Context, id, csrfTokenHash string) error + // Revoke sets revoked_at = NOW() for the named session. Subsequent // Get returns the row with RevokedAt set; Phase 4's Validate maps // to 401. diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 015a233..9239aaa 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -84,6 +84,14 @@ type ACMEGarbageCollector interface { GarbageCollect(ctx context.Context) error } +// SessionGarbageCollector is the interface the scheduler's sessionGCLoop +// invokes once per CERTCTL_SESSION_GC_INTERVAL tick. Concrete impl is +// *session.Service. Sweeps expired post-login + pre-login session rows +// AND retired-past-retention signing-key rows. Auth Bundle 2 Phase 4. +type SessionGarbageCollector interface { + GarbageCollect(ctx context.Context) (int, error) +} + // JobReaperService defines the interface for job timeout reaping used by the scheduler. type JobReaperService interface { ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error @@ -109,6 +117,7 @@ type Scheduler struct { cloudDiscoveryService CloudDiscoveryServicer crlCacheService CRLCacheServicer acmeGC ACMEGarbageCollector + sessionGC SessionGarbageCollector jobReaper JobReaperService logger *slog.Logger @@ -127,6 +136,7 @@ type Scheduler struct { crlGenerationInterval time.Duration jobTimeoutInterval time.Duration acmeGCInterval time.Duration + sessionGCInterval time.Duration // agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose // owning agent has been silent. Bundle C / Audit M-016. Defaults below. agentOfflineJobTTL time.Duration @@ -148,6 +158,7 @@ type Scheduler struct { crlGenerationRunning atomic.Bool jobTimeoutRunning atomic.Bool acmeGCRunning atomic.Bool + sessionGCRunning atomic.Bool // Graceful shutdown: wait for in-flight work to complete wg sync.WaitGroup @@ -185,6 +196,7 @@ func NewScheduler( crlGenerationInterval: 1 * time.Hour, jobTimeoutInterval: 10 * time.Minute, acmeGCInterval: 1 * time.Minute, + sessionGCInterval: 1 * time.Hour, // 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent // must miss multiple heartbeats before its in-flight jobs are reaped. agentOfflineJobTTL: 5 * time.Minute, @@ -317,6 +329,23 @@ func (s *Scheduler) SetACMEGCInterval(d time.Duration) { s.acmeGCInterval = d } +// SetSessionGarbageCollector wires the Auth Bundle 2 Phase 4 session GC +// service. Optional; nil disables the loop (Bundle-2-disabled deployments +// still run pre-Phase-4 behavior). +func (s *Scheduler) SetSessionGarbageCollector(gc SessionGarbageCollector) { + s.sessionGC = gc +} + +// SetSessionGCInterval configures the interval at which the session GC +// sweep runs. Default 1h. Wire: CERTCTL_SESSION_GC_INTERVAL. Zero or +// negative values are ignored. +func (s *Scheduler) SetSessionGCInterval(d time.Duration) { + if d <= 0 { + return + } + s.sessionGCInterval = d +} + // SetAgentOfflineJobTTL sets the threshold past which a Running job whose // owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016. // Zero or negative values are ignored (the default of 5 minutes is kept). @@ -375,6 +404,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { if s.acmeGC != nil { loopCount++ } + if s.sessionGC != nil { + loopCount++ + } s.wg.Add(loopCount) go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }() @@ -403,6 +435,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { if s.acmeGC != nil { go func() { defer s.wg.Done(); s.acmeGCLoop(ctx) }() } + if s.sessionGC != nil { + go func() { defer s.wg.Done(); s.sessionGCLoop(ctx) }() + } // Signal that all loops are launched close(startedChan) @@ -1146,3 +1181,40 @@ func (s *Scheduler) acmeGCLoop(ctx context.Context) { } } } + +// sessionGCLoop runs every sessionGCInterval and invokes +// SessionGarbageCollector.GarbageCollect, which sweeps: +// - sessions whose absolute_expires_at is in the past (post-login expired); +// - pre-login session rows older than 10 minutes; +// - retired-past-retention session_signing_keys rows. +// +// Auth Bundle 2 Phase 4. The atomic.Bool guard + the per-tick +// context.WithTimeout match the pattern of every other loop in this +// file: a stuck Postgres can't block the next tick, and concurrent +// sweeps are skipped not queued. +func (s *Scheduler) sessionGCLoop(ctx context.Context) { + ticker := time.NewTicker(s.sessionGCInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if !s.sessionGCRunning.CompareAndSwap(false, true) { + s.logger.Warn("session GC sweep still running, skipping tick") + continue + } + s.wg.Add(1) + go func() { + defer s.wg.Done() + defer s.sessionGCRunning.Store(false) + opCtx, cancel := context.WithTimeout(ctx, time.Minute) + defer cancel() + if _, err := s.sessionGC.GarbageCollect(opCtx); err != nil { + s.logger.Warn("session gc sweep failed (next tick will retry)", "error", err) + } + }() + } + } +}