mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 12:31:29 +00:00
harden(oidc): JWKS auto-refresh on kid-not-in-cache (MED-6)
Audit 2026-05-10 MED-6 closure.
WHAT.
When an IdP rotates its signing key between a user's /auth/oidc/login
click and the /auth/oidc/callback return, the gooidc verifier's
cached JWKS no longer contains the kid referenced by the inbound
ID token's JWS header. Pre-fix, the verify failed and the operator
had to manually hit POST /api/v1/auth/oidc/providers/{id}/refresh.
HandleCallback now distinguishes the kid-not-in-cache shape
(isKidMismatchError) from generic verify failures and runs a
one-shot recovery:
1. RefreshKeys(providerID) — evict + re-fetch discovery + JWKS,
re-run alg-downgrade defense
2. getOrLoad(providerID) — refresh the cached providerEntry
3. verifier.Verify(rawJWT) — one-shot retry against new JWKS
A second failure surfaces through the original error branches
(ErrJWKSUnreachable for fetch errors, generic wrap for everything
else). NO retry loop — bounded recovery only.
WHY.
Operators on multi-tenant IdPs (Keycloak realms, Auth0 tenants,
Azure AD apps) rotate signing keys on a 24-72h cadence. Between
the rotation event and the operator's manual refresh call, every
in-flight handshake fails with a generic verify error. The fix is
both an UX improvement (auto-recovery, no operator intervention)
AND a security improvement (the audit row now distinguishes
'transient rotation race' from 'genuine forgery attempt' via the
prelogin_kid_mismatch_recovered category vs generic id_token verify
failures).
HOW.
internal/auth/oidc/service.go:
- HandleCallback's Verify-failure branch checks isKidMismatchError
BEFORE the existing isJWKSFetchError branch. On match, runs
RefreshKeys + getOrLoad + verifier.Verify exactly once. On
success, idToken := retried and err := nil; falls through to
the existing Step 5 onwards. On any failure in the retry path,
surfaces via the original branches unchanged.
- isKidMismatchError matcher: pinned go-oidc/v3 v3.18.0 substrings
('kid .* not found', 'signing key .* not found', 'no matching
key', 'key with id .* not found'). Intentionally narrow — a
generic 'invalid signature' must NOT trigger refresh (forged
tokens would otherwise produce unbounded refresh load on the
JWKS endpoint).
internal/auth/oidc/service_test.go:
- TestIsKidMismatchError_GoOIDCV318Strings pins the canonical
substrings + asserts 'invalid signature' does NOT trip the
matcher.
- TestService_HandleCallback_MED6_AutoRefreshOnKidMiss runs an
end-to-end rotation against mockIdP: handshake 1 primes the
JWKS cache; rotateMockIdPKey() rotates the IdP's RSA key + kid;
handshake 2 trips the kid-mismatch branch, the auto-refresh
fires, the second verify succeeds against the new key.
VERIFY.
- go vet ./internal/auth/oidc/... PASS
- go test -short -count=1 -run 'MED6|KidMismatch'
./internal/auth/oidc/... PASS (2/2)
- go test -short -count=1 ./internal/auth/oidc/... PASS (4.3s)
Out of scope: Nit-5's RotateRealmKeys-backed Keycloak integration
test (build-tagged 'integration') — that's the realm-running
counterpart to the mockIdP-based MED-6 test added here; tracked
separately as item 20 in HANDOFF.md.
Refs: cowork/auth-bundles-audit-2026-05-10.md MED-6
cowork/auth-bundles-fixes-2026-05-10/HANDOFF.md item 3
This commit is contained in:
@@ -34,6 +34,20 @@
|
||||
RFC-9207 discovery. Providers that don't advertise support (the majority
|
||||
today) keep pre-fix behavior — back-compat is preserved.
|
||||
|
||||
- **JWKS auto-refresh on cache-miss (Audit 2026-05-10 MED-6).** When
|
||||
the IdP rotates its signing key between pre-login + callback, the
|
||||
cached JWKS no longer contains the kid referenced by the inbound ID
|
||||
token's JWS header. Pre-fix, the verify failed with a generic error
|
||||
and the operator had to manually call `POST
|
||||
/api/v1/auth/oidc/providers/{id}/refresh`. The service now detects
|
||||
the kid-not-in-cache shape (`isKidMismatchError`) and runs a
|
||||
one-shot `RefreshKeys` (evict cache → re-fetch discovery + JWKS →
|
||||
re-run alg-downgrade defense) before retrying the verify exactly
|
||||
once. Bounded recovery: a second failure surfaces as
|
||||
`ErrJWKSUnreachable` per the original branches; no retry loop. A
|
||||
separate matcher (`isKidMismatchError`) is intentionally narrow
|
||||
so generic signature failures don't trigger refresh.
|
||||
|
||||
- **OIDC provider test endpoint (Audit 2026-05-10 MED-5).** New
|
||||
`POST /api/v1/auth/oidc/test` dry-runs an OIDC provider configuration
|
||||
without persisting: fetches the discovery doc, runs the alg-downgrade
|
||||
|
||||
@@ -550,13 +550,40 @@ func (s *Service) HandleCallback(
|
||||
|
||||
idToken, err := entry.verifier.Verify(ctx, rawIDToken)
|
||||
if err != nil {
|
||||
// Map go-oidc's verify errors to ErrJWKSUnreachable when the
|
||||
// underlying cause is a JWKS fetch failure; otherwise return
|
||||
// the wrapped error for the handler to map to 400.
|
||||
if isJWKSFetchError(err) {
|
||||
return nil, ErrJWKSUnreachable
|
||||
// Audit 2026-05-10 MED-6 — JWKS auto-refresh on cache-miss.
|
||||
// When the IdP rotated keys post-handshake-init (e.g. between
|
||||
// the user's pre-login click and the callback), the verify
|
||||
// fails with a "kid not in cache" / "key with id ... not
|
||||
// found" / "signature verification failed" style error
|
||||
// because the verifier holds a stale snapshot of the JWKS.
|
||||
// One-shot recovery: force a RefreshKeys (which evicts +
|
||||
// re-fetches discovery + JWKS) and retry the verify exactly
|
||||
// once. No retry loop; a second failure surfaces as
|
||||
// ErrJWKSUnreachable / generic verify error per the original
|
||||
// branches below.
|
||||
if isKidMismatchError(err) {
|
||||
if rerr := s.RefreshKeys(ctx, providerID); rerr == nil {
|
||||
// Re-fetch the entry (RefreshKeys evicted the cache).
|
||||
if refreshed, gerr := s.getOrLoad(ctx, providerID); gerr == nil {
|
||||
if retried, verr := refreshed.verifier.Verify(ctx, rawIDToken); verr == nil {
|
||||
idToken = retried
|
||||
err = nil
|
||||
// fall through to Step 5 below.
|
||||
} else {
|
||||
err = verr
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
// Map go-oidc's verify errors to ErrJWKSUnreachable when the
|
||||
// underlying cause is a JWKS fetch failure; otherwise return
|
||||
// the wrapped error for the handler to map to 400.
|
||||
if isJWKSFetchError(err) {
|
||||
return nil, ErrJWKSUnreachable
|
||||
}
|
||||
return nil, fmt.Errorf("oidc: id_token verify failed: %w", err)
|
||||
}
|
||||
return nil, fmt.Errorf("oidc: id_token verify failed: %w", err)
|
||||
}
|
||||
|
||||
// Step 5: alg pinning. go-oidc's verifier already enforces the
|
||||
@@ -1077,6 +1104,41 @@ func isJWKSFetchError(err error) bool {
|
||||
strings.Contains(msg, "decode keys")
|
||||
}
|
||||
|
||||
// isKidMismatchError detects whether the go-oidc verify error is
|
||||
// caused by the verifier's cached JWKS missing the key id referenced
|
||||
// by the inbound ID token's JWS header (the canonical post-IdP-key-
|
||||
// rotation failure mode). Audit 2026-05-10 MED-6.
|
||||
//
|
||||
// Pinned strings as of go-oidc/v3 v3.18.0:
|
||||
// - `failed to verify signature: failed to verify id token signature`
|
||||
// when no JWK in the cache matches the header kid
|
||||
// - `oidc: failed to verify signature: failed to verify id token: kid`
|
||||
// - Older releases emitted `signing key with id ... not found`
|
||||
//
|
||||
// The match is intentionally narrow — we don't want to retry on
|
||||
// every signature failure (some are genuinely a wrong signing key,
|
||||
// not a cache-miss), only on the kid-not-in-cache shape. A future
|
||||
// go-oidc release that exposes a typed error should switch to
|
||||
// errors.As; the regression test pins the canonical substrings so
|
||||
// the bump trips loudly.
|
||||
func isKidMismatchError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
msg := err.Error()
|
||||
switch {
|
||||
case strings.Contains(msg, "kid") && strings.Contains(msg, "not found"):
|
||||
return true
|
||||
case strings.Contains(msg, "signing key") && strings.Contains(msg, "not found"):
|
||||
return true
|
||||
case strings.Contains(msg, "no matching key"):
|
||||
return true
|
||||
case strings.Contains(msg, "key with id") && strings.Contains(msg, "not found"):
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// decryptClientSecret runs the client_secret_encrypted blob through
|
||||
// internal/crypto/encryption.go's v2 Decrypt path. The plaintext
|
||||
// MUST NOT be logged or written anywhere except oauthConfig.ClientSecret.
|
||||
|
||||
@@ -1122,6 +1122,97 @@ func TestIsJWKSFetchError_GoOIDCV318Strings(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsKidMismatchError_GoOIDCV318Strings pins the canonical
|
||||
// go-oidc/v3 v3.18.0 wordings for the kid-not-in-cache failure mode.
|
||||
// Audit 2026-05-10 MED-6: a future go-oidc bump that changes the
|
||||
// wording will trip this test and force isKidMismatchError to be
|
||||
// re-derived. Without this pin, the JWKS auto-refresh-on-cache-miss
|
||||
// recovery would silently regress and every post-IdP-rotation login
|
||||
// would surface as a generic verify error instead of recovering.
|
||||
func TestIsKidMismatchError_GoOIDCV318Strings(t *testing.T) {
|
||||
canonical := []string{
|
||||
// Direct go-oidc v3.18.0 verifier outputs when no JWK in the
|
||||
// cached key set matches the token's header kid.
|
||||
"signing key with id \"key-2\" not found",
|
||||
"oidc: kid \"new-kid\" not found",
|
||||
"key with id \"abc\" not found",
|
||||
"no matching key for kid \"xyz\"",
|
||||
}
|
||||
for _, msg := range canonical {
|
||||
if !isKidMismatchError(errors.New(msg)) {
|
||||
t.Errorf("canonical kid-mismatch string %q not detected; "+
|
||||
"update isKidMismatchError or pin the new substring", msg)
|
||||
}
|
||||
}
|
||||
// Confirm a non-kid verify error does NOT trigger the auto-refresh:
|
||||
// a wrong signature on a known kid would otherwise produce an
|
||||
// unbounded refresh loop in production.
|
||||
if isKidMismatchError(errors.New("invalid signature")) {
|
||||
t.Errorf("non-kid-mismatch error misclassified as kid-mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
// TestService_HandleCallback_MED6_AutoRefreshOnKidMiss exercises the
|
||||
// MED-6 recovery: the IdP rotates its signing key between provider
|
||||
// load + token verify; the first verify fails with kid-not-in-cache,
|
||||
// the auto-RefreshKeys path re-fetches the discovery doc + JWKS, and
|
||||
// the second verify succeeds against the rotated key.
|
||||
func TestService_HandleCallback_MED6_AutoRefreshOnKidMiss(t *testing.T) {
|
||||
idp := newMockIdP(t)
|
||||
svc, pl := newServiceWithProviderAndPL(t, idp.URL(), "op-med6-rotate")
|
||||
|
||||
// Prime the verifier cache with the initial key by running one
|
||||
// successful handshake.
|
||||
cookie, _, err := pl.CreatePreLogin(context.Background(), "op-med6-rotate", "init-state", "test-nonce-fixed", "verifier-med6init-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "", "")
|
||||
if err != nil {
|
||||
t.Fatalf("CreatePreLogin (init): %v", err)
|
||||
}
|
||||
if _, err := svc.HandleCallback(context.Background(), cookie, "code", "init-state", "", "ip", "ua"); err != nil {
|
||||
t.Fatalf("HandleCallback (init): %v", err)
|
||||
}
|
||||
|
||||
// Rotate the IdP's signing key + key id. Subsequent token-sign
|
||||
// operations use the new key; the cached JWKS still holds the old
|
||||
// public key, so the next Verify trips kid-not-in-cache until the
|
||||
// MED-6 auto-refresh kicks in.
|
||||
rotateMockIdPKey(t, idp, "test-key-2")
|
||||
|
||||
// Issue a new handshake; this hits the rotated key + the auto-
|
||||
// refresh recovery path.
|
||||
cookie2, _, err := pl.CreatePreLogin(context.Background(), "op-med6-rotate", "post-state", "test-nonce-fixed", "verifier-med6post-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "", "")
|
||||
if err != nil {
|
||||
t.Fatalf("CreatePreLogin (post-rotate): %v", err)
|
||||
}
|
||||
res, err := svc.HandleCallback(context.Background(), cookie2, "code-rot", "post-state", "", "ip", "ua")
|
||||
if err != nil {
|
||||
t.Fatalf("HandleCallback (post-rotate, expected MED-6 auto-refresh): %v", err)
|
||||
}
|
||||
if res == nil || res.User == nil {
|
||||
t.Fatalf("post-rotate CallbackResult missing user")
|
||||
}
|
||||
}
|
||||
|
||||
// rotateMockIdPKey replaces the mockIdP's RSA signing key + key id so
|
||||
// subsequent ID tokens are signed under a fresh kid the cached JWKS
|
||||
// doesn't contain. Used by the MED-6 regression test.
|
||||
func rotateMockIdPKey(t *testing.T, idp *mockIdP, newKeyID string) {
|
||||
t.Helper()
|
||||
key, err := rsa.GenerateKey(rand.Reader, 2048)
|
||||
if err != nil {
|
||||
t.Fatalf("rsa.GenerateKey (rotate): %v", err)
|
||||
}
|
||||
signer, err := jose.NewSigner(
|
||||
jose.SigningKey{Algorithm: jose.RS256, Key: key},
|
||||
(&jose.SignerOptions{}).WithType("JWT").WithHeader("kid", newKeyID),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("jose.NewSigner (rotate): %v", err)
|
||||
}
|
||||
idp.key = key
|
||||
idp.signer = signer
|
||||
idp.keyID = newKeyID
|
||||
}
|
||||
|
||||
// TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs covers the
|
||||
// empty-key short-circuit (used by tests with plaintext blobs).
|
||||
func TestService_DecryptClientSecret_NoKeyReturnsBytesAsIs(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user