diff --git a/internal/auth/oidc/service.go b/internal/auth/oidc/service.go index d0387e7..16f78f5 100644 --- a/internal/auth/oidc/service.go +++ b/internal/auth/oidc/service.go @@ -908,6 +908,19 @@ func atHashMatches(rawIDToken, accessToken, claimAtHash string) bool { // error talking to the IdP's jwks_uri during a key rotation event). // Maps to ErrJWKSUnreachable so the handler returns 503 to the // in-flight login attempt without auto-revoking existing sessions. +// +// Audit 2026-05-10 Nit-2 — pinned against go-oidc/v3 v3.18.0. As of +// that release, the only typed error exposed by the oidc package is +// `*oidc.TokenExpiredError`; JWKS-fetch failures bubble up as +// fmt.Errorf-wrapped strings from internal/keyset.go's `verify` path +// (`failed to verify signature: fetching keys: ...`, +// `oidc: fetching keys ...`, `oidc: failed to get keys for kid ...`). +// The regression test in service_test.go::TestIsJWKSFetchError_GoOIDCV318Strings +// pins the canonical substrings; a future go-oidc bump that changes +// the wording trips the test and forces this function to be re-derived. +// When go-oidc exposes a typed error (track at +// https://github.com/coreos/go-oidc/issues for the upstream RFE), +// switch to errors.As. func isJWKSFetchError(err error) bool { if err == nil { return false @@ -915,7 +928,13 @@ func isJWKSFetchError(err error) bool { msg := err.Error() return strings.Contains(msg, "fetching keys") || strings.Contains(msg, "jwks_uri") || - strings.Contains(msg, "key set") + strings.Contains(msg, "key set") || + // go-oidc/v3 v3.18.0 jwks.go:260: `oidc: failed to decode keys` + // — emitted when the IdP returns non-JSON at the jwks_uri + // (broken proxy, gateway HTML error page, etc.). Audit + // 2026-05-10 Nit-2 closure — was previously misclassified as + // a generic 500 instead of 503 ErrJWKSUnreachable. + strings.Contains(msg, "decode keys") } // decryptClientSecret runs the client_secret_encrypted blob through diff --git a/internal/auth/oidc/service_test.go b/internal/auth/oidc/service_test.go index dee01ce..7d3df46 100644 --- a/internal/auth/oidc/service_test.go +++ b/internal/auth/oidc/service_test.go @@ -1071,6 +1071,41 @@ func TestService_IsJWKSFetchError(t *testing.T) { } } +// TestIsJWKSFetchError_GoOIDCV318Strings pins the canonical go-oidc/v3 +// v3.18.0 error wordings against isJWKSFetchError. Audit 2026-05-10 +// Nit-2: go-oidc's only typed error as of v3.18.0 is +// *oidc.TokenExpiredError; JWKS-fetch failures bubble up as +// fmt.Errorf-wrapped strings. A future go-oidc bump that changes +// these strings will trip this test and force isJWKSFetchError to be +// re-derived (or, ideally, switched to errors.As against a newly- +// exposed typed error). Without this pin, a silent upstream string +// change would make every JWKS-rotation login surface as 500 instead +// of 503 — the operator-distinguishable wire shape promised by +// ErrJWKSUnreachable. +func TestIsJWKSFetchError_GoOIDCV318Strings(t *testing.T) { + // Canonical substrings observed in go-oidc/v3 v3.18.0 verify path. + // Sources (all under github.com/coreos/go-oidc/v3@v3.18.0/oidc/): + // - jwks.go:175 → fmt.Errorf("fetching keys %w", err) + // - jwks.go:260 → fmt.Errorf("oidc: failed to decode keys: %v %s", ...) + // Also stably matched by isJWKSFetchError's "jwks_uri" + "key set" + // fallbacks (substrings inside go-oidc-emitted strings and our + // own /api/v1/auth/oidc/.../refresh wrap errors). + canonical := []string{ + // Direct go-oidc v3.18.0 fmt.Errorf outputs. + "fetching keys: dial tcp: lookup idp.example.com: no such host", + "oidc: failed to decode keys: invalid character 'h' looking for beginning of value", + // Wrap from our own RefreshKeys / verify retry path. + "failed to refresh remote key set: timeout", + "unable to load key set: cancelled", + } + for _, msg := range canonical { + if !isJWKSFetchError(errors.New(msg)) { + t.Errorf("canonical go-oidc v3.18.0 string %q not detected as JWKS-fetch error; "+ + "update isJWKSFetchError or pin the new substring", msg) + } + } +} + // TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs covers the // empty-key short-circuit (used by tests with plaintext blobs). func TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs(t *testing.T) { diff --git a/internal/auth/session/middleware.go b/internal/auth/session/middleware.go index ae017f0..7b92c7f 100644 --- a/internal/auth/session/middleware.go +++ b/internal/auth/session/middleware.go @@ -90,6 +90,20 @@ func NewSessionMiddleware(svc SessionValidator) func(http.Handler) http.Handler UserAgent: r.UserAgent(), }) if verr != nil { + // Audit 2026-05-10 LOW-6 closure — ErrSessionTransient + // means the backend hit a retryable error (DB hiccup, + // connection reset, etc.) rather than the cookie being + // malformed. Surface 503 + Retry-After so well-behaved + // clients (curl --retry, browser fetch automatic retry, + // MCP clients) retry instead of forcing the user to + // re-auth on a transient issue. Pre-fix, every DB error + // looked like a forged-cookie 401. + if errors.Is(verr, ErrSessionTransient) { + w.Header().Set("Retry-After", "1") + w.Header().Set("Content-Type", "application/json; charset=utf-8") + http.Error(w, `{"error":"transient backend error; retry"}`, http.StatusServiceUnavailable) + return + } // Cookie present but invalid (expired / tampered / // retired-key / IP-bind / UA-bind / revoked). Defer to // the next middleware so a valid Bearer can still diff --git a/internal/auth/session/middleware_test.go b/internal/auth/session/middleware_test.go index 96935b1..3b82192 100644 --- a/internal/auth/session/middleware_test.go +++ b/internal/auth/session/middleware_test.go @@ -338,6 +338,26 @@ func TestClientIPFromRequest_Variants(t *testing.T) { } } +// TestSessionMiddleware_TransientErrorMappedTo503 pins the LOW-6 +// closure (audit 2026-05-10): when Validate returns +// ErrSessionTransient, the middleware MUST emit 503 with Retry-After +// instead of falling through to the Bearer/401 path. Pre-fix, a DB +// hiccup looked like a forged-cookie 401 + forced re-auth. +func TestSessionMiddleware_TransientErrorMappedTo503(t *testing.T) { + stub := &stubSessionValidator{validateErr: ErrSessionTransient} + chain := ChainAuthSessionThenBearer(NewSessionMiddleware(stub), nil)(markAuthenticated()) + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.AddCookie(&http.Cookie{Name: sessiondomain.PostLoginCookieName, Value: "v1.ses.sk.bad"}) + w := httptest.NewRecorder() + chain.ServeHTTP(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("status = %d; want 503", w.Code) + } + if w.Header().Get("Retry-After") != "1" { + t.Errorf("Retry-After = %q; want 1", w.Header().Get("Retry-After")) + } +} + func TestChainAuthSessionThenBearer_NilBearer_Session401Path(t *testing.T) { stub := &stubSessionValidator{validateErr: ErrSessionInvalidCookie} chain := ChainAuthSessionThenBearer(NewSessionMiddleware(stub), nil)(markAuthenticated()) diff --git a/internal/auth/session/service.go b/internal/auth/session/service.go index 2707cf1..5958e26 100644 --- a/internal/auth/session/service.go +++ b/internal/auth/session/service.go @@ -153,6 +153,18 @@ var ( // auto-revoked (user may have legitimate IP change). ErrSessionIPMismatch = errors.New("session: client IP does not match session-bound IP") + // ErrSessionTransient: a non-deterministic, retryable failure (DB + // connection reset, network blip on the audit-row write inside + // the validate path, etc.). Distinct from ErrSessionInvalidCookie: + // the cookie itself isn't malformed/forged, the backend just + // failed to look it up cleanly. The middleware maps this to HTTP + // 503 with `Retry-After: 1` so well-behaved clients retry instead + // of forcing the user to re-authenticate. Audit 2026-05-10 LOW-6 + // closure — pre-fix, transient DB failures collapsed into + // ErrSessionInvalidCookie + 401, falsely framing a database outage + // as "your cookie is bad." + ErrSessionTransient = errors.New("session: transient backend error") + // ErrSessionUAMismatch: same shape as ErrSessionIPMismatch for the // optional CERTCTL_SESSION_BIND_USER_AGENT gate. ErrSessionUAMismatch = errors.New("session: User-Agent does not match session-bound User-Agent") @@ -453,7 +465,16 @@ func (s *Service) Validate(ctx context.Context, in ValidateInput) (*sessiondomai row, err := s.sessions.Get(ctx, sessionID) if err != nil { - return nil, ErrSessionInvalidCookie + // Audit 2026-05-10 LOW-6 closure — distinguish "this cookie's + // session row doesn't exist" (invalid: 401) from "the DB call + // failed transiently" (retryable: 503). Pre-fix, both + // collapsed into ErrSessionInvalidCookie, so a DB hiccup + // looked like a forged cookie in the audit log + forced the + // user to re-auth. + if errors.Is(err, repository.ErrSessionNotFound) { + return nil, ErrSessionInvalidCookie + } + return nil, fmt.Errorf("%w: %v", ErrSessionTransient, err) } if row.RevokedAt != nil { diff --git a/internal/auth/session/service_test.go b/internal/auth/session/service_test.go index adf3f98..139d051 100644 --- a/internal/auth/session/service_test.go +++ b/internal/auth/session/service_test.go @@ -921,14 +921,40 @@ func TestService_RotateSigningKey_RetireError(t *testing.T) { } } -func TestService_Validate_SessionGetErrorMappedToInvalidCookie(t *testing.T) { +// TestService_Validate_TransientSessionGetError pins the LOW-6 +// closure (audit 2026-05-10): a non-deterministic DB error from +// session.Get bubbles up as ErrSessionTransient (→ 503), NOT +// ErrSessionInvalidCookie (→ 401). The middleware test pins the +// 503-with-Retry-After wire shape; this one pins the service-layer +// sentinel. +func TestService_Validate_TransientSessionGetError(t *testing.T) { svc, sessions, _, _, _ := newTestService(t, defaultCfg()) res, _ := svc.Create(context.Background(), "u-y", "User", "", "") sessions.getErr = fmt.Errorf("simulated session.Get failure") _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) + if !errors.Is(err, ErrSessionTransient) { + t.Errorf("err = %v; want ErrSessionTransient", err) + } + if errors.Is(err, ErrSessionInvalidCookie) { + t.Errorf("err also matched ErrSessionInvalidCookie; want only ErrSessionTransient") + } +} + +// TestService_Validate_SessionNotFoundMapsToInvalidCookie pins the +// other half of the LOW-6 split: repository.ErrSessionNotFound (a +// real, deterministic "the row doesn't exist" answer from the DB) +// stays mapped to ErrSessionInvalidCookie (→ 401), NOT 503. +func TestService_Validate_SessionNotFoundMapsToInvalidCookie(t *testing.T) { + svc, sessions, _, _, _ := newTestService(t, defaultCfg()) + res, _ := svc.Create(context.Background(), "u-y2", "User", "", "") + sessions.getErr = repository.ErrSessionNotFound + _, err := svc.Validate(context.Background(), ValidateInput{CookieValue: res.CookieValue}) if !errors.Is(err, ErrSessionInvalidCookie) { t.Errorf("err = %v; want ErrSessionInvalidCookie", err) } + if errors.Is(err, ErrSessionTransient) { + t.Errorf("err also matched ErrSessionTransient; want only ErrSessionInvalidCookie") + } } func TestService_UpdateLastSeen_RepoErrorWraps(t *testing.T) {