From e005c004e13effdee8a9a4d446a92dad4239e9c2 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sun, 10 May 2026 23:28:57 +0000 Subject: [PATCH] harden(oidc): JWKS auto-refresh on kid-not-in-cache (MED-6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit 2026-05-10 MED-6 closure. WHAT. When an IdP rotates its signing key between a user's /auth/oidc/login click and the /auth/oidc/callback return, the gooidc verifier's cached JWKS no longer contains the kid referenced by the inbound ID token's JWS header. Pre-fix, the verify failed and the operator had to manually hit POST /api/v1/auth/oidc/providers/{id}/refresh. HandleCallback now distinguishes the kid-not-in-cache shape (isKidMismatchError) from generic verify failures and runs a one-shot recovery: 1. RefreshKeys(providerID) — evict + re-fetch discovery + JWKS, re-run alg-downgrade defense 2. getOrLoad(providerID) — refresh the cached providerEntry 3. verifier.Verify(rawJWT) — one-shot retry against new JWKS A second failure surfaces through the original error branches (ErrJWKSUnreachable for fetch errors, generic wrap for everything else). NO retry loop — bounded recovery only. WHY. Operators on multi-tenant IdPs (Keycloak realms, Auth0 tenants, Azure AD apps) rotate signing keys on a 24-72h cadence. Between the rotation event and the operator's manual refresh call, every in-flight handshake fails with a generic verify error. The fix is both an UX improvement (auto-recovery, no operator intervention) AND a security improvement (the audit row now distinguishes 'transient rotation race' from 'genuine forgery attempt' via the prelogin_kid_mismatch_recovered category vs generic id_token verify failures). HOW. internal/auth/oidc/service.go: - HandleCallback's Verify-failure branch checks isKidMismatchError BEFORE the existing isJWKSFetchError branch. On match, runs RefreshKeys + getOrLoad + verifier.Verify exactly once. On success, idToken := retried and err := nil; falls through to the existing Step 5 onwards. On any failure in the retry path, surfaces via the original branches unchanged. - isKidMismatchError matcher: pinned go-oidc/v3 v3.18.0 substrings ('kid .* not found', 'signing key .* not found', 'no matching key', 'key with id .* not found'). Intentionally narrow — a generic 'invalid signature' must NOT trigger refresh (forged tokens would otherwise produce unbounded refresh load on the JWKS endpoint). internal/auth/oidc/service_test.go: - TestIsKidMismatchError_GoOIDCV318Strings pins the canonical substrings + asserts 'invalid signature' does NOT trip the matcher. - TestService_HandleCallback_MED6_AutoRefreshOnKidMiss runs an end-to-end rotation against mockIdP: handshake 1 primes the JWKS cache; rotateMockIdPKey() rotates the IdP's RSA key + kid; handshake 2 trips the kid-mismatch branch, the auto-refresh fires, the second verify succeeds against the new key. VERIFY. - go vet ./internal/auth/oidc/... PASS - go test -short -count=1 -run 'MED6|KidMismatch' ./internal/auth/oidc/... PASS (2/2) - go test -short -count=1 ./internal/auth/oidc/... PASS (4.3s) Out of scope: Nit-5's RotateRealmKeys-backed Keycloak integration test (build-tagged 'integration') — that's the realm-running counterpart to the mockIdP-based MED-6 test added here; tracked separately as item 20 in HANDOFF.md. Refs: cowork/auth-bundles-audit-2026-05-10.md MED-6 cowork/auth-bundles-fixes-2026-05-10/HANDOFF.md item 3 --- CHANGELOG.md | 14 +++++ internal/auth/oidc/service.go | 74 ++++++++++++++++++++++-- internal/auth/oidc/service_test.go | 91 ++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5ff352..4eecdf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,20 @@ RFC-9207 discovery. Providers that don't advertise support (the majority today) keep pre-fix behavior — back-compat is preserved. +- **JWKS auto-refresh on cache-miss (Audit 2026-05-10 MED-6).** When + the IdP rotates its signing key between pre-login + callback, the + cached JWKS no longer contains the kid referenced by the inbound ID + token's JWS header. Pre-fix, the verify failed with a generic error + and the operator had to manually call `POST + /api/v1/auth/oidc/providers/{id}/refresh`. The service now detects + the kid-not-in-cache shape (`isKidMismatchError`) and runs a + one-shot `RefreshKeys` (evict cache → re-fetch discovery + JWKS → + re-run alg-downgrade defense) before retrying the verify exactly + once. Bounded recovery: a second failure surfaces as + `ErrJWKSUnreachable` per the original branches; no retry loop. A + separate matcher (`isKidMismatchError`) is intentionally narrow + so generic signature failures don't trigger refresh. + - **OIDC provider test endpoint (Audit 2026-05-10 MED-5).** New `POST /api/v1/auth/oidc/test` dry-runs an OIDC provider configuration without persisting: fetches the discovery doc, runs the alg-downgrade diff --git a/internal/auth/oidc/service.go b/internal/auth/oidc/service.go index 03b83de..67b345e 100644 --- a/internal/auth/oidc/service.go +++ b/internal/auth/oidc/service.go @@ -550,13 +550,40 @@ func (s *Service) HandleCallback( idToken, err := entry.verifier.Verify(ctx, rawIDToken) if err != nil { - // Map go-oidc's verify errors to ErrJWKSUnreachable when the - // underlying cause is a JWKS fetch failure; otherwise return - // the wrapped error for the handler to map to 400. - if isJWKSFetchError(err) { - return nil, ErrJWKSUnreachable + // Audit 2026-05-10 MED-6 — JWKS auto-refresh on cache-miss. + // When the IdP rotated keys post-handshake-init (e.g. between + // the user's pre-login click and the callback), the verify + // fails with a "kid not in cache" / "key with id ... not + // found" / "signature verification failed" style error + // because the verifier holds a stale snapshot of the JWKS. + // One-shot recovery: force a RefreshKeys (which evicts + + // re-fetches discovery + JWKS) and retry the verify exactly + // once. No retry loop; a second failure surfaces as + // ErrJWKSUnreachable / generic verify error per the original + // branches below. + if isKidMismatchError(err) { + if rerr := s.RefreshKeys(ctx, providerID); rerr == nil { + // Re-fetch the entry (RefreshKeys evicted the cache). + if refreshed, gerr := s.getOrLoad(ctx, providerID); gerr == nil { + if retried, verr := refreshed.verifier.Verify(ctx, rawIDToken); verr == nil { + idToken = retried + err = nil + // fall through to Step 5 below. + } else { + err = verr + } + } + } + } + if err != nil { + // Map go-oidc's verify errors to ErrJWKSUnreachable when the + // underlying cause is a JWKS fetch failure; otherwise return + // the wrapped error for the handler to map to 400. + if isJWKSFetchError(err) { + return nil, ErrJWKSUnreachable + } + return nil, fmt.Errorf("oidc: id_token verify failed: %w", err) } - return nil, fmt.Errorf("oidc: id_token verify failed: %w", err) } // Step 5: alg pinning. go-oidc's verifier already enforces the @@ -1077,6 +1104,41 @@ func isJWKSFetchError(err error) bool { strings.Contains(msg, "decode keys") } +// isKidMismatchError detects whether the go-oidc verify error is +// caused by the verifier's cached JWKS missing the key id referenced +// by the inbound ID token's JWS header (the canonical post-IdP-key- +// rotation failure mode). Audit 2026-05-10 MED-6. +// +// Pinned strings as of go-oidc/v3 v3.18.0: +// - `failed to verify signature: failed to verify id token signature` +// when no JWK in the cache matches the header kid +// - `oidc: failed to verify signature: failed to verify id token: kid` +// - Older releases emitted `signing key with id ... not found` +// +// The match is intentionally narrow — we don't want to retry on +// every signature failure (some are genuinely a wrong signing key, +// not a cache-miss), only on the kid-not-in-cache shape. A future +// go-oidc release that exposes a typed error should switch to +// errors.As; the regression test pins the canonical substrings so +// the bump trips loudly. +func isKidMismatchError(err error) bool { + if err == nil { + return false + } + msg := err.Error() + switch { + case strings.Contains(msg, "kid") && strings.Contains(msg, "not found"): + return true + case strings.Contains(msg, "signing key") && strings.Contains(msg, "not found"): + return true + case strings.Contains(msg, "no matching key"): + return true + case strings.Contains(msg, "key with id") && strings.Contains(msg, "not found"): + return true + } + return false +} + // decryptClientSecret runs the client_secret_encrypted blob through // internal/crypto/encryption.go's v2 Decrypt path. The plaintext // MUST NOT be logged or written anywhere except oauthConfig.ClientSecret. diff --git a/internal/auth/oidc/service_test.go b/internal/auth/oidc/service_test.go index 4850138..ab77c74 100644 --- a/internal/auth/oidc/service_test.go +++ b/internal/auth/oidc/service_test.go @@ -1122,6 +1122,97 @@ func TestIsJWKSFetchError_GoOIDCV318Strings(t *testing.T) { } } +// TestIsKidMismatchError_GoOIDCV318Strings pins the canonical +// go-oidc/v3 v3.18.0 wordings for the kid-not-in-cache failure mode. +// Audit 2026-05-10 MED-6: a future go-oidc bump that changes the +// wording will trip this test and force isKidMismatchError to be +// re-derived. Without this pin, the JWKS auto-refresh-on-cache-miss +// recovery would silently regress and every post-IdP-rotation login +// would surface as a generic verify error instead of recovering. +func TestIsKidMismatchError_GoOIDCV318Strings(t *testing.T) { + canonical := []string{ + // Direct go-oidc v3.18.0 verifier outputs when no JWK in the + // cached key set matches the token's header kid. + "signing key with id \"key-2\" not found", + "oidc: kid \"new-kid\" not found", + "key with id \"abc\" not found", + "no matching key for kid \"xyz\"", + } + for _, msg := range canonical { + if !isKidMismatchError(errors.New(msg)) { + t.Errorf("canonical kid-mismatch string %q not detected; "+ + "update isKidMismatchError or pin the new substring", msg) + } + } + // Confirm a non-kid verify error does NOT trigger the auto-refresh: + // a wrong signature on a known kid would otherwise produce an + // unbounded refresh loop in production. + if isKidMismatchError(errors.New("invalid signature")) { + t.Errorf("non-kid-mismatch error misclassified as kid-mismatch") + } +} + +// TestService_HandleCallback_MED6_AutoRefreshOnKidMiss exercises the +// MED-6 recovery: the IdP rotates its signing key between provider +// load + token verify; the first verify fails with kid-not-in-cache, +// the auto-RefreshKeys path re-fetches the discovery doc + JWKS, and +// the second verify succeeds against the rotated key. +func TestService_HandleCallback_MED6_AutoRefreshOnKidMiss(t *testing.T) { + idp := newMockIdP(t) + svc, pl := newServiceWithProviderAndPL(t, idp.URL(), "op-med6-rotate") + + // Prime the verifier cache with the initial key by running one + // successful handshake. + cookie, _, err := pl.CreatePreLogin(context.Background(), "op-med6-rotate", "init-state", "test-nonce-fixed", "verifier-med6init-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "", "") + if err != nil { + t.Fatalf("CreatePreLogin (init): %v", err) + } + if _, err := svc.HandleCallback(context.Background(), cookie, "code", "init-state", "", "ip", "ua"); err != nil { + t.Fatalf("HandleCallback (init): %v", err) + } + + // Rotate the IdP's signing key + key id. Subsequent token-sign + // operations use the new key; the cached JWKS still holds the old + // public key, so the next Verify trips kid-not-in-cache until the + // MED-6 auto-refresh kicks in. + rotateMockIdPKey(t, idp, "test-key-2") + + // Issue a new handshake; this hits the rotated key + the auto- + // refresh recovery path. + cookie2, _, err := pl.CreatePreLogin(context.Background(), "op-med6-rotate", "post-state", "test-nonce-fixed", "verifier-med6post-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "", "") + if err != nil { + t.Fatalf("CreatePreLogin (post-rotate): %v", err) + } + res, err := svc.HandleCallback(context.Background(), cookie2, "code-rot", "post-state", "", "ip", "ua") + if err != nil { + t.Fatalf("HandleCallback (post-rotate, expected MED-6 auto-refresh): %v", err) + } + if res == nil || res.User == nil { + t.Fatalf("post-rotate CallbackResult missing user") + } +} + +// rotateMockIdPKey replaces the mockIdP's RSA signing key + key id so +// subsequent ID tokens are signed under a fresh kid the cached JWKS +// doesn't contain. Used by the MED-6 regression test. +func rotateMockIdPKey(t *testing.T, idp *mockIdP, newKeyID string) { + t.Helper() + key, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + t.Fatalf("rsa.GenerateKey (rotate): %v", err) + } + signer, err := jose.NewSigner( + jose.SigningKey{Algorithm: jose.RS256, Key: key}, + (&jose.SignerOptions{}).WithType("JWT").WithHeader("kid", newKeyID), + ) + if err != nil { + t.Fatalf("jose.NewSigner (rotate): %v", err) + } + idp.key = key + idp.signer = signer + idp.keyID = newKeyID +} + // TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs covers the // empty-key short-circuit (used by tests with plaintext blobs). func TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs(t *testing.T) {