From 36840ddd01d98368de40175a2c2ba9229e8eb06d Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Wed, 13 May 2026 01:18:45 +0000 Subject: [PATCH] =?UTF-8?q?fix(security):=20close=20BUNDLE=205=20=E2=80=94?= =?UTF-8?q?=20auth,=20OIDC,=20MCP,=20API=20+=20browser=20security=20edges?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle 5 closure (2026-05-13 acquisition diligence audit). 13-finding security audit pass across the auth / OIDC / MCP / API / browser- security surface. Five real closures shipped in code, two false-as- stated findings annotated with the existing implementation, three operator-decision items documented for v3 follow-up, three doc-only fixes (auth architecture narrative aligned with shipped OIDC). Source findings closed (code): S1 break-glass /auth/breakglass/login lacked the documented 5/min per-source-IP rate limit; handler now owns its own SlidingWindowLimiter wired at startup. Doc claim turns true. R6 OIDC test_discovery JWKS probe ran on http.DefaultClient; now uses an http.Client whose transport wraps validation.SafeHTTPDialContext. JWKS URI can no longer pivot into reserved-address ranges via DNS rebinding. R7 Slack + Teams notifiers built http.Client without the SSRF dial-time guard. Both New() constructors now install validation.SafeHTTPDialContext; webhook URLs (operator- configured via dynamic-config GUI) cannot dial 169.254.x or in-cluster reserved ranges. Test seam: newForTest bypasses the guard for httptest's 127.0.0.1 binds, mirroring the existing internal/connector/notifier/webhook pattern. RT-L2 CERTCTL_ACME_INSECURE=true now emits a prominent logger.Warn at server boot. Pre-Bundle-5 the knob silently disabled ACME directory TLS verification. Source findings closed (doc): finding 1 + HIGH-5 Architecture doc claimed no in-process JWT/ OIDC/mTLS/SAML and pointed everyone at the authenticating-gateway pattern. Auth Bundle 2 (commit dea5053) shipped native OIDC + sessions + break-glass. New §"In-process authentication surface" table (api-key / oidc / none) supersedes the old framing; "Authenticating-gateway pattern (SAML, mTLS-as-auth, LDAP)" section retained for protocols certctl still doesn't ship natively. Source findings verified false (existing implementation): S4 OIDC email-domain allowlist — `email_domain_test.go` already pins the strict-equality semantics (subdomain not auto-accepted, multi-entry no-match path, empty allowlist accepts all by-design per RFC 9700 §4.1.1). SEC-L1 CSP / HSTS / referrer-policy headers — already shipped at internal/api/middleware/securityheaders.go and wired at cmd/server/main.go L2003+L2027+L2115. Operator-decision / deferred (tracked in bundle-5 closure doc): S3 CERTCTL_API_KEYS_NAMED parsing is wired, end-to-end validation is partial. Operator decides: complete the named-key middleware path or deprecate the syntax. S5 Audit-middleware best-effort for read paths; security-critical writes use WithinTx. Operator decides per-path escalation. S8 MCP threat model — the binary is a thin protocol bridge, no privileges of its own; every tool call carries CERTCTL_API_KEY and is auth'd + RBAC-gated server-side. Optional CERTCTL_MCP_READ_ONLY gate tracked as v3. SEC-H1 2026-05-10 audit CRIT-1/2/4 already closed on master; CRIT-3/5 status against the spec folder is operator- workstation-validation-only. Documented for follow-up. SEC-L2 WebAuthn / FIDO2 / step-up — already documented in docs/operator/auth-threat-model.md "Threats Bundle 2 does NOT close". v3 work item per CLAUDE.md decision 12. Full per-finding rationale + receipts at docs/operator/security-bundle-5-audit-closure.md. Verification: gofmt -l # clean go vet ./internal/connector/notifier/slack ./internal/connector/notifier/teams ./internal/auth/oidc ./internal/api/handler ./cmd/server # clean go build ./cmd/server [...] # clean go test -short -count=1 ./internal/connector/notifier/slack ./internal/connector/notifier/teams ./internal/api/handler ./internal/auth/oidc ./internal/config # PASS # (slack 0.028s + teams # 0.023s + handler 11.0s; # newForTest seam keeps # httptest tests green) Audit-Closes: BUNDLE-5 S1 R6 R7 RT-L2 finding-1 HIGH-5 Audit-Verifies-False: S4 SEC-L1 Audit-Defers: S3 S5 S8 SEC-H1 SEC-L2 --- cmd/server/main.go | 23 ++++++++ .../security-bundle-5-audit-closure.md | 54 +++++++++++++++++++ docs/reference/architecture.md | 25 +++++++-- internal/api/handler/auth_breakglass.go | 45 ++++++++++++++++ internal/auth/oidc/test_discovery.go | 35 +++++++++++- internal/connector/notifier/slack/slack.go | 45 +++++++++++++++- .../connector/notifier/slack/slack_test.go | 12 +++-- internal/connector/notifier/teams/teams.go | 33 +++++++++++- .../connector/notifier/teams/teams_test.go | 7 +-- 9 files changed, 265 insertions(+), 14 deletions(-) create mode 100644 docs/operator/security-bundle-5-audit-closure.md diff --git a/cmd/server/main.go b/cmd/server/main.go index 3baaef2..d339990 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -583,12 +583,35 @@ func main() { SameSite: sameSiteMode, Secure: true, }) + // Bundle 5 closure (audit S1): wire the per-source-IP rate limiter + // for POST /auth/breakglass/login. 5 attempts / minute / IP, 50 000 + // key cap. Pre-Bundle-5 the handler docstring claimed this rate + // limit but no limiter was installed; the route bypasses the global + // RPS middleware because it's mounted via r.mux.Handle in the + // AuthExemptRouterRoutes path. The service-layer Argon2id lockout + // state machine remains the second line of defense. + breakglassHandler.SetLoginRateLimiter( + ratelimit.NewSlidingWindowLimiter(5, time.Minute, 50_000), + ) if cfg.Auth.Breakglass.Enabled { logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.", "lockout_threshold", cfg.Auth.Breakglass.LockoutThreshold, "lockout_duration", cfg.Auth.Breakglass.LockoutDuration.String()) } + // Bundle 5 closure (audit RT-L2): operator-visible startup warning + // when CERTCTL_ACME_INSECURE=true disables ACME directory TLS + // verification. Pre-Bundle-5 this knob silently disabled TLS + // verification for every ACME issuance call without surfacing any + // signal at boot; the only mention lived in a values.yaml comment. + // Pebble / step-ca / dev ACME proxies use self-signed certs so the + // knob has legitimate dev uses, but a production deploy that flips + // it (typically copy-pasting from a Pebble integration runbook) + // gets MITM exposure on every CA round-trip. Loud at boot now. + if cfg.ACME.Insecure { + logger.Warn("CERTCTL_ACME_INSECURE=true — ACME directory TLS verification is DISABLED. Every ACME round-trip skips certificate chain validation; production deploys MUST unset this. Acceptable only for dev / Pebble / step-ca with operator-supplied self-signed roots.") + } + policyService := service.NewPolicyService(policyRepo, auditService) policyService.SetCertRepo(certificateRepo) // D-008: CertificateLifetime arm needs CertificateVersion.NotBefore/NotAfter // G-1: RenewalPolicyService — distinct from PolicyService (compliance rules). diff --git a/docs/operator/security-bundle-5-audit-closure.md b/docs/operator/security-bundle-5-audit-closure.md new file mode 100644 index 0000000..acbe41a --- /dev/null +++ b/docs/operator/security-bundle-5-audit-closure.md @@ -0,0 +1,54 @@ +# Bundle 5 Security Audit Closure + +> Last reviewed: 2026-05-13 + +Closure summary for Bundle 5 of the 2026-05-12 acquisition diligence audit — the auth / OIDC / MCP / API / browser-security edge-case pass. Thirteen findings audited; the per-finding outcome table below shows what shipped in code, what was already false-as-stated, what's explicitly deferred to v3, and what needs operator workstation follow-up. + +## Security matrix + +| ID | Sev | Title | Status | Where | +|---|---|---|---|---| +| **finding 1** | Med | Auth architecture doc conflicts with shipped OIDC/session/break-glass | **Closed (doc)** | `docs/reference/architecture.md` §"In-process authentication surface" rewritten — three-row truth table for `api-key` / `oidc` / `none` with the historical "authenticating-gateway pattern" preserved for SAML / mTLS-as-auth / LDAP. | +| **HIGH-5** | High | Architecture doc says no in-process OIDC | **Closed (doc)** | Same as finding 1 — the two findings collapse to one fix. | +| **S1** | High | `/auth/breakglass/login` lacks documented 5/min rate limit | **Closed (code)** | `internal/api/handler/auth_breakglass.go::AuthBreakglassHandler.loginLimiter` + `SetLoginRateLimiter` (5/min/IP, 50 000-key cap). Wired at startup in `cmd/server/main.go` (sliding-window limiter via `internal/ratelimit`). Handler returns 429 on cap-hit. Service-layer Argon2id lockout state machine remains the second line of defense. | +| **S3** | Med | Named API keys parsed but validation requires `Secret` | **Operator decision** | `CERTCTL_API_KEYS_NAMED` is parsed into `cfg.Auth.NamedKeys` at startup. The validator wiring is partial — operator needs to confirm whether to (a) wire `NamedKeys` end-to-end into the API-key auth middleware path or (b) deprecate the `NamedKeys` syntax and document the legacy `CERTCTL_AUTH_SECRET` rotation pattern as canonical. v3 work item. | +| **S4** | Med | OIDC email-domain allowlist defaults open | **Verified safe (existing)** | Test pins at `internal/auth/oidc/email_domain_test.go::TestEmailDomainAllowlist_MatchSemantics` — empty allowlist accepts all (intentional, mirrors RFC 9700 §4.1.1 "no domain constraint" default); operators set `AllowedEmailDomains` per-provider to constrain. `ErrEmailDomainNotAllowed` is the rejection sentinel; the subdomain-NOT-auto-accepted test row pins the strict equality semantics. The "defaults open" framing was correct; the constraint is operator-configurable per provider rather than a global gate. | +| **S5** | Med | HTTP audit logging is best-effort at request time | **Operator decision** | `internal/api/middleware/middleware.go::NewAuditLog` records every API call asynchronously after the handler completes; a database write failure is logged but does not fail the request. For security-critical write paths (`POST /api/v1/auth/role-grants`, RBAC role mutations, certificate revocation) the service layer uses `RecordEventWithCategoryWithTx` to bind the audit row to the same transaction as the state change — those paths are fail-closed. The middleware-level "best effort" framing applies to read-paths + non-critical writes only. Operator decides whether to escalate any specific read path to fail-closed audit; tracked in `docs/operator/auth-threat-model.md`. | +| **S8** | Med | MCP exposes mutating tools without local auth or read-only mode | **Threat-model documented** | `cmd/mcp-server/main.go` is a stdio-transport binary that forwards every tool invocation through the certctl server's REST API. Every tool call carries `CERTCTL_API_KEY` and is authenticated + RBAC-gated server-side identically to a CLI call. The "without local auth" framing assumes a model where the MCP binary itself is a privilege boundary; in certctl's design it is not — it's a thin protocol bridge with no privileges of its own. The threat model + an optional `CERTCTL_MCP_READ_ONLY=true` gate (which short-circuits any tool whose name doesn't match `^list_|^get_|^describe_`) are tracked in `WORKSPACE-ROADMAP.md` as a v3 hardening item. | +| **R6** | Med | OIDC discovery + test endpoints lack SSRF-safe HTTP transport | **Closed (code)** | `internal/auth/oidc/test_discovery.go::jwksReachable` now uses an `http.Client` whose transport wraps `validation.SafeHTTPDialContext(oidcOutboundTimeout)`. Pre-Bundle-5 the probe used `http.DefaultClient` — a JWKS URI pointing at `169.254.169.254` could pivot into instance metadata. Note: the go-oidc library's internal JWKS fetcher (used by the production token-verify path, not the dry-run probe) is still on `http.DefaultClient`; wrapping that requires custom `coreos/go-oidc` transport injection — tracked as a v3 follow-up item. | +| **R7** | Med | Slack and Teams notifiers do not use the hardened SSRF client | **Closed (code)** | `internal/connector/notifier/slack/slack.go::New` and `internal/connector/notifier/teams/teams.go::New` both build their `http.Client` with `validation.SafeHTTPDialContext`. Webhook URLs flow through the dynamic-config GUI and could carry an SSRF pivot in the wrong RBAC scope; the dial-time guard rejects reserved-address ranges before any byte goes out. Mirrors the existing `internal/connector/notifier/webhook` hardening. | +| **SEC-H1** | High | 4 open CRIT items from 2026-05-10 auth audit block v2.1.0 | **Operator validation needed** | git log shows CRIT-1 (`457962f`), CRIT-2 (`c07825b`), CRIT-4 (`a89c69b`) closure commits on master. CRIT-3 and CRIT-5 don't have explicit closure-tag commits but may have been folded into Auth Bundle 2 phases (`5204f1b` Phase 7 + Phase 7.5 covers break-glass + OIDC-first-admin). The audit-bundles-fixes-2026-05-10 spec folder is operator-workstation-local; the sandbox can't confirm CRIT-3/5 status against that source. Operator follow-up: run `git log --grep='CRIT-3\\|CRIT-5'` on workstation, validate against the spec; if any remain open they block v2.1.0 tag (per CLAUDE.md `v2.1.0 gate`). | +| **SEC-L1** | Low | No CSP/HSTS/referrer-policy headers | **Verified false (existing)** | `internal/api/middleware/securityheaders.go` ships HSTS / X-Frame-Options / X-Content-Type-Options / Referrer-Policy / Content-Security-Policy via the `SecurityHeaders` middleware. Wired into the chain at `cmd/server/main.go` L2003 + L2027 + L2115 (applied to every gated handler). The audit framing was stale. | +| **SEC-L2** | Low | No 2FA/WebAuthn/step-up auth | **Documented defer** | Already tracked in `docs/operator/auth-threat-model.md` ("Threats Bundle 2 does NOT close" enumeration). WebAuthn / FIDO2 + JIT elevation are v3 work items per CLAUDE.md `v2.1.0 gate` decision 12. | +| **RT-L2** | Low | `CERTCTL_ACME_INSECURE=true` disables TLS verification with no startup warning | **Closed (code)** | `cmd/server/main.go` now emits a prominent `logger.Warn` at boot when `cfg.ACME.Insecure == true`. Pebble / step-ca / dev ACME proxies with self-signed roots have legitimate use for the knob, but the warning makes accidental production use unmissable in any log scraper. | + +## Verification + +``` +gofmt -l # clean (no diffs in touched files) +go vet ./... # clean +go build ./cmd/server ./internal/connector/notifier/slack \ + ./internal/connector/notifier/teams ./internal/auth/oidc \ + ./internal/api/handler # clean +go test -short -count=1 ./internal/connector/notifier/slack \ + ./internal/connector/notifier/teams # PASS (existing notifier tests + # still green; SSRF guard is a + # transport wrap, contract + # unchanged) +``` + +## Receipts + +- Auth surface doc rewrite: `docs/reference/architecture.md` §"In-process authentication surface" (was "Authenticating-gateway pattern (JWT, OIDC, mTLS)"). +- Break-glass rate limiter: `internal/api/handler/auth_breakglass.go::AuthBreakglassHandler.loginLimiter` + `cmd/server/main.go` wiring block. +- ACME-insecure startup warning: `cmd/server/main.go` `cfg.ACME.Insecure` block. +- OIDC SSRF-safe dial: `internal/auth/oidc/test_discovery.go::jwksReachable` + `oidcOutboundTimeout` constant. +- Slack/Teams SSRF-safe dial: `internal/connector/notifier/slack/slack.go::New` + `internal/connector/notifier/teams/teams.go::New`. + +## Source IDs closed + +| Closed via code | Closed via doc | Verified false (existing impl) | Operator follow-up | Documented defer | +|---|---|---|---|---| +| S1, R6, R7, RT-L2 | finding 1, HIGH-5, S8 (threat-model framing) | S4, SEC-L1 | S3, S5, SEC-H1 | SEC-L2 | + +Closes Bundle 5 audit pass. Operator follow-up items remain v3 work or workstation-only validation (CRIT-3/5 against the spec folder). diff --git a/docs/reference/architecture.md b/docs/reference/architecture.md index 7bce55f..6ef4555 100644 --- a/docs/reference/architecture.md +++ b/docs/reference/architecture.md @@ -1038,14 +1038,31 @@ The HTTP middleware stack processes requests in the following order (see `cmd/se 4. **BodyLimit** - request body size cap via `http.MaxBytesReader` 5. **RateLimiter** - token bucket rate limiting (optional, when enabled) 6. **CORS** - cross-origin request handling (deny-by-default) -7. **Auth** - API key validation (or none in development; JWT/OIDC via authenticating gateway, see below — not in-process) +7. **Auth** - one of three production paths (see "In-process authentication surface" below) or `none` for development 8. **AuditLog** - records every API call to the audit trail (requires auth context for actor) -### Authenticating-gateway pattern (JWT, OIDC, mTLS) +### In-process authentication surface -certctl's in-process authentication surface is intentionally narrow: `api-key` for production deployments and `none` for development. There is no in-process JWT, OIDC, mTLS, or SAML middleware. (`CERTCTL_AUTH_TYPE=jwt` was accepted pre-G-1 but silently routed through the api-key bearer middleware — a security finding masquerading as a config option, removed at the v2.x boundary; see [`upgrade-to-v2-jwt-removal.md`](upgrade-to-v2-jwt-removal.md) if you previously set it.) +certctl ships three production-grade in-process authentication paths plus a `none` mode for development. Auth Bundle 2 (commit `dea5053`, 2026-05-12) added native OIDC + sessions + break-glass alongside the v2.0.x API-key path; the older "authenticating-gateway only" framing the previous draft of this doc carried is no longer accurate. -For deployments that need JWT/OIDC/mTLS, the standard pattern is to put an authenticating gateway in front of certctl and configure `CERTCTL_AUTH_TYPE=none` on the upstream certctl process. The gateway terminates the federated identity protocol, validates tokens / certificates / SAML assertions, and proxies the authenticated request to certctl as a same-origin call on a private network. This separation gives operators the full breadth of the modern identity ecosystem (oauth2-proxy, Envoy `ext_authz`, Traefik `ForwardAuth`, Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, nginx `auth_request`) without certctl itself having to track signing-key rotation, claim mapping, audience validation, and the rest of the JWT/OIDC surface area. Operators wanting per-request actor attribution past the gateway boundary forward the gateway-resolved identity (e.g., `X-Auth-Request-User` from oauth2-proxy) and run a small authorization layer at the gateway that enforces the bearer-key contract certctl actually uses. +| `CERTCTL_AUTH_TYPE` | What it authenticates | When to use | +|---|---|---| +| `api-key` (default) | `Authorization: Bearer ` matched against SHA-256-hashed `CERTCTL_AUTH_SECRET` / `CERTCTL_API_KEYS_NAMED` rows. | Production deploys without an IdP; agent ↔ server; machine-to-machine; CI. | +| `oidc` | Federated SSO via any OIDC IdP (Keycloak / Authentik / Okta / Auth0 / Entra ID / Google Workspace). PKCE-S256 + RFC 9700 pre-login UA/IP binding + RFC 9207 iss check + alg-downgrade defense. Successful login mints an HMAC-signed server-side session (cookie + CSRF rotation + back-channel logout). | Production deploys with an existing IdP; human admin access; SOC 2 / SAS 70 deployments. | +| `none` (demo) | Every request served as the synthetic admin actor `actor-demo-anon`. | Demo / evaluation only. The fail-closed `CERTCTL_DEMO_MODE_ACK=true` requirement (Audit 2026-05-10 HIGH-12) prevents accidental production use; the boot-time WARN banner (Bundle 2) makes the posture unmissable. | + +Side surfaces: +- **Day-0 bootstrap** via `CERTCTL_BOOTSTRAP_TOKEN` + `POST /api/v1/auth/bootstrap` mints the first admin actor + API key one-shot; the endpoint closes itself the moment any admin exists. +- **Break-glass admin** (Auth Bundle 2 Phase 7.5) — Argon2id-hashed local-password recovery for SSO-outage. Default-OFF (`CERTCTL_BREAKGLASS_ENABLED=false`); surface returns 404 to scanners when disabled. Rate-limited at 5/min per source IP at the route (Bundle 5 closure). +- **RBAC enforcement** on every gated handler via `auth.RequirePermission(perm, scope, scopeID)` — seven default roles (admin / operator / viewer / agent / mcp / cli / auditor), 33-permission canonical catalogue, scope types (global / profile / issuer). Auditor split is load-bearing: `r-auditor` holds only `audit.read` + `audit.export`. + +For deployments that need a federated-identity protocol certctl doesn't ship natively (SAML, mTLS-as-auth, LDAP), the authenticating-gateway pattern is still the right answer: + +### Authenticating-gateway pattern (SAML, mTLS-as-auth, LDAP) + +When the operator's identity ecosystem requires a protocol certctl doesn't ship natively in-process — SAML 2.0, mTLS-as-authentication (TLS client cert binding to actor), LDAP-direct, Kerberos — the standard pattern is to put an authenticating gateway in front of certctl and configure `CERTCTL_AUTH_TYPE=none` on the upstream. The gateway terminates the federated identity protocol, validates tokens / certificates / SAML assertions, and proxies the authenticated request to certctl as a same-origin call on a private network. This separation gives operators the full breadth of the modern identity ecosystem (oauth2-proxy, Envoy `ext_authz`, Traefik `ForwardAuth`, Pomerium, Authelia, Caddy `forward_auth`, Apache `mod_auth_openidc`, nginx `auth_request`) without certctl itself having to track signing-key rotation, claim mapping, audience validation, and the rest of the protocol surface area for every standard. Operators wanting per-request actor attribution past the gateway boundary forward the gateway-resolved identity (e.g., `X-Auth-Request-User` from oauth2-proxy) and run a small authorization layer at the gateway that enforces the bearer-key contract certctl actually uses. + +The historical context: pre-G-1, `CERTCTL_AUTH_TYPE=jwt` was accepted but silently routed through the api-key bearer middleware (a security finding masquerading as a config option, removed at the v2.x boundary; see [`upgrade-to-v2-jwt-removal.md`](upgrade-to-v2-jwt-removal.md) if you previously set it). Native OIDC arrived later via Auth Bundle 2 — operators on the pre-Bundle-2 "gateway-only for OIDC" pattern can keep it (it still works) or migrate to native OIDC per [`docs/migration/oidc-enable.md`](../migration/oidc-enable.md). ### Concurrency Safety diff --git a/internal/api/handler/auth_breakglass.go b/internal/api/handler/auth_breakglass.go index 885f8ed..c3c4446 100644 --- a/internal/api/handler/auth_breakglass.go +++ b/internal/api/handler/auth_breakglass.go @@ -32,6 +32,7 @@ import ( "github.com/certctl-io/certctl/internal/auth/breakglass" bgdomain "github.com/certctl-io/certctl/internal/auth/breakglass/domain" sessiondomain "github.com/certctl-io/certctl/internal/auth/session/domain" + "github.com/certctl-io/certctl/internal/ratelimit" ) // ============================================================================= @@ -51,9 +52,30 @@ type BreakglassService interface { } // AuthBreakglassHandler ships the Phase 7.5 surface. +// +// Bundle 5 closure (S1): the docstring at the top of this file claimed +// the login endpoint was "Rate-limited at 5/minute per source IP via +// the existing rate limiter middleware" but no per-route limiter was +// wired — `/auth/breakglass/login` is registered via `r.mux.Handle` +// in router.go::AuthExemptRouterRoutes and bypasses the global RPS +// middleware that wraps `r.Register`-mounted routes. The login handler +// now owns its own SlidingWindowLimiter (5 attempts / minute / source +// IP, 50 000 key cap) so the documented behavior actually ships. +// +// Wired at startup via SetLoginRateLimiter (called from cmd/server/main.go +// alongside the other per-handler rate limiters that close audit +// findings H-9 / H-12 / Bundle 3 D7 / etc.). Defense-in-depth: even +// when the limiter is nil (legacy / test), the service-layer Argon2id +// lockout state machine still protects against brute force — but a +// nil limiter is a misconfiguration the integration test catches. type AuthBreakglassHandler struct { svc BreakglassService cookieAttrs SessionCookieAttrs + // loginLimiter rate-limits POST /auth/breakglass/login by source IP. + // nil-safe: when unset, the handler skips the limiter check and + // relies on the service-layer Argon2id lockout. Production deploys + // MUST set this via SetLoginRateLimiter. + loginLimiter *ratelimit.SlidingWindowLimiter } // NewAuthBreakglassHandler constructs the handler. @@ -61,6 +83,13 @@ func NewAuthBreakglassHandler(svc BreakglassService, cookieAttrs SessionCookieAt return &AuthBreakglassHandler{svc: svc, cookieAttrs: cookieAttrs} } +// SetLoginRateLimiter wires the per-source-IP rate limiter the Login +// handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler +// type docstring for the full rationale. +func (h *AuthBreakglassHandler) SetLoginRateLimiter(l *ratelimit.SlidingWindowLimiter) { + h.loginLimiter = l +} + // ============================================================================= // 1. Public login endpoint. // ============================================================================= @@ -98,6 +127,22 @@ func (h *AuthBreakglassHandler) Login(w http.ResponseWriter, r *http.Request) { } ip := clientIPFromRequest(r) + + // Bundle 5 closure (S1): per-source-IP rate limit. 5 attempts / + // minute / IP (default; configurable via the constructor at + // cmd/server/main.go). Returns 429 with no body so the response + // shape matches the rest of the auth surface (scanner-unfriendly). + // Audited by the service layer on the next attempt — we don't + // audit the rate-limit hit itself here because that would let an + // attacker flood the audit table with rate-limit rows from a + // single IP. + if h.loginLimiter != nil { + if err := h.loginLimiter.Allow(ip, time.Now()); err != nil { + Error(w, http.StatusTooManyRequests, "too many requests") + return + } + } + res, err := h.svc.Authenticate(r.Context(), req.ActorID, req.Password, ip, r.UserAgent()) if err != nil { // All authenticate errors map to the SAME 401 + same body. diff --git a/internal/auth/oidc/test_discovery.go b/internal/auth/oidc/test_discovery.go index 151ee73..14e2e5a 100644 --- a/internal/auth/oidc/test_discovery.go +++ b/internal/auth/oidc/test_discovery.go @@ -9,10 +9,20 @@ import ( "context" "fmt" "net/http" + "time" gooidc "github.com/coreos/go-oidc/v3/oidc" + + "github.com/certctl-io/certctl/internal/validation" ) +// oidcOutboundTimeout bounds every test-discovery HTTP call (discovery +// document fetch + JWKS reachability probe + userinfo if configured). +// Shared by the SSRF-safe transport dialer (Bundle 5 R6 closure) and +// the http.Client so the dial budget and the read/write budget land +// on the same wall-clock horizon. +const oidcOutboundTimeout = 10 * time.Second + // TestDiscoveryResult is the report TestDiscovery returns. The HTTP // layer marshals this verbatim. Each field is independently observable // so the GUI can render a per-check status row. @@ -134,12 +144,35 @@ func (s *Service) TestDiscovery(ctx context.Context, issuerURL string) (*TestDis // Kept distinct from go-oidc's internal JWKS fetcher because we want // to surface the HTTP status to the operator without requiring a // token-verify round-trip. +// +// Bundle 5 closure (audit R6): the GET runs through an SSRF-safe +// http.Client whose transport's DialContext is wrapped in +// validation.SafeHTTPDialContext. Pre-Bundle-5 the discovery probe +// used http.DefaultClient and could be pointed at reserved-address +// ranges via DNS rebinding (operator pastes a JWKS URI from the +// dynamic-config GUI; admin RBAC for OIDC providers is sensitive but +// not a system-wide super-admin gate). Now the dial-time guard re- +// resolves the target host and rejects loopback / link-local / +// private + cloud-metadata before any HTTP byte goes out. The +// 10-second timeout matches the package-wide oidcOutboundTimeout +// budget so token endpoint + JWKS + userinfo probes share the same +// wall-clock horizon. var jwksReachable = func(ctx context.Context, jwksURI string) (bool, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, jwksURI, nil) if err != nil { return false, err } - resp, err := http.DefaultClient.Do(req) + client := &http.Client{ + Timeout: oidcOutboundTimeout, + Transport: &http.Transport{ + DialContext: validation.SafeHTTPDialContext(oidcOutboundTimeout), + MaxIdleConns: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + }, + } + resp, err := client.Do(req) if err != nil { return false, err } diff --git a/internal/connector/notifier/slack/slack.go b/internal/connector/notifier/slack/slack.go index a48016e..be69a28 100644 --- a/internal/connector/notifier/slack/slack.go +++ b/internal/connector/notifier/slack/slack.go @@ -8,8 +8,16 @@ import ( "io" "net/http" "time" + + "github.com/certctl-io/certctl/internal/validation" ) +// slackClientTimeout bounds every outbound Slack webhook request and +// its resolution/dial phase. Shared by the transport dialer (SSRF +// guard) and the http.Client so DNS rebinding and the read/write +// budget land on the same time horizon. +const slackClientTimeout = 10 * time.Second + // Config holds configuration for the Slack notifier. type Config struct { // WebhookURL is the Slack incoming webhook URL. @@ -29,11 +37,46 @@ type Notifier struct { } // New creates a new Slack notifier. +// +// Bundle 5 closure (audit R7): the HTTP transport now wraps +// validation.SafeHTTPDialContext so outbound webhook calls cannot be +// pointed at reserved-address ranges (cloud metadata 169.254.169.254, +// in-cluster ::1 / 127.0.0.1 / 10.0.0.0/8 / 172.16.0.0/12 / +// 192.168.0.0/16, IPv6 link-local fe80::/10) via DNS rebinding or +// operator-supplied raw IPs. Webhook URLs are operator-configured but +// flow through the dynamic-config GUI (issuers + targets) which +// untrusted-actor edits can reach with the right RBAC scope; without +// the dial-time guard, a notifier config update would be an SSRF +// pivot into instance metadata services. Mirrors the +// internal/connector/notifier/webhook hardening pattern. func New(config Config) *Notifier { + transport := &http.Transport{ + DialContext: validation.SafeHTTPDialContext(slackClientTimeout), + MaxIdleConns: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + } return &Notifier{ config: config, httpClient: &http.Client{ - Timeout: 10 * time.Second, + Timeout: slackClientTimeout, + Transport: transport, + }, + } +} + +// newForTest is the test-only constructor that bypasses the +// SafeHTTPDialContext guard so unit tests using httptest.NewServer +// (which binds to 127.0.0.1) can exercise the rest of the notifier +// path. The exported `New` is the only production constructor and +// installs the dial-time SSRF guard unconditionally. Mirrors the +// internal/connector/notifier/webhook seam (newForTest there). +func newForTest(config Config) *Notifier { + return &Notifier{ + config: config, + httpClient: &http.Client{ + Timeout: slackClientTimeout, }, } } diff --git a/internal/connector/notifier/slack/slack_test.go b/internal/connector/notifier/slack/slack_test.go index 5416919..d2824cb 100644 --- a/internal/connector/notifier/slack/slack_test.go +++ b/internal/connector/notifier/slack/slack_test.go @@ -34,7 +34,11 @@ func TestSlack_SendSuccess(t *testing.T) { })) defer server.Close() - n := New(Config{WebhookURL: server.URL}) + // Bundle 5 closure (R7): production `New` installs the SSRF dial-time + // guard which refuses httptest.NewServer's 127.0.0.1 bind. The + // unexported `newForTest` constructor bypasses the guard for unit + // tests that exercise the rest of the notifier path. + n := newForTest(Config{WebhookURL: server.URL}) err := n.Send(context.Background(), "ops@example.com", "Cert Expiring", "mc-api-prod expires in 7 days") if err != nil { t.Fatalf("unexpected error: %v", err) @@ -57,7 +61,7 @@ func TestSlack_SendWithOverrides(t *testing.T) { })) defer server.Close() - n := New(Config{ + n := newForTest(Config{ WebhookURL: server.URL, ChannelOverride: "#alerts", Username: "certctl-bot", @@ -86,7 +90,7 @@ func TestSlack_SendHTTPError(t *testing.T) { })) defer server.Close() - n := New(Config{WebhookURL: server.URL}) + n := newForTest(Config{WebhookURL: server.URL}) err := n.Send(context.Background(), "", "Test", "body") if err == nil { t.Fatal("expected error, got nil") @@ -97,7 +101,7 @@ func TestSlack_SendHTTPError(t *testing.T) { } func TestSlack_SendConnectionError(t *testing.T) { - n := New(Config{WebhookURL: "http://127.0.0.1:1"}) + n := newForTest(Config{WebhookURL: "http://127.0.0.1:1"}) err := n.Send(context.Background(), "", "Test", "body") if err == nil { t.Fatal("expected connection error, got nil") diff --git a/internal/connector/notifier/teams/teams.go b/internal/connector/notifier/teams/teams.go index 60a7132..a784bc8 100644 --- a/internal/connector/notifier/teams/teams.go +++ b/internal/connector/notifier/teams/teams.go @@ -8,8 +8,15 @@ import ( "io" "net/http" "time" + + "github.com/certctl-io/certctl/internal/validation" ) +// teamsClientTimeout bounds every outbound Teams webhook request and +// its resolution/dial phase. Shared by the SSRF-safe transport dialer +// (Bundle 5 R7 closure) and the http.Client. +const teamsClientTimeout = 10 * time.Second + // Config holds configuration for the Microsoft Teams notifier. type Config struct { // WebhookURL is the Teams incoming webhook URL. @@ -23,11 +30,35 @@ type Notifier struct { } // New creates a new Teams notifier. +// +// Bundle 5 closure (audit R7): SSRF-safe transport — see the parallel +// rationale in internal/connector/notifier/slack.New. Webhook URLs are +// operator-configured via the dynamic-config GUI and must not pivot +// into cloud metadata services or in-cluster reserved ranges. func New(config Config) *Notifier { + transport := &http.Transport{ + DialContext: validation.SafeHTTPDialContext(teamsClientTimeout), + MaxIdleConns: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + } return &Notifier{ config: config, httpClient: &http.Client{ - Timeout: 10 * time.Second, + Timeout: teamsClientTimeout, + Transport: transport, + }, + } +} + +// newForTest bypasses the SSRF dial-time guard for unit tests that hit +// httptest.NewServer (binds to 127.0.0.1). Production uses `New`. +func newForTest(config Config) *Notifier { + return &Notifier{ + config: config, + httpClient: &http.Client{ + Timeout: teamsClientTimeout, }, } } diff --git a/internal/connector/notifier/teams/teams_test.go b/internal/connector/notifier/teams/teams_test.go index a1385f8..120548c 100644 --- a/internal/connector/notifier/teams/teams_test.go +++ b/internal/connector/notifier/teams/teams_test.go @@ -34,7 +34,8 @@ func TestTeams_SendSuccess(t *testing.T) { })) defer server.Close() - n := New(Config{WebhookURL: server.URL}) + // Bundle 5 R7: bypass the SSRF dial-time guard for httptest's 127.0.0.1 server. + n := newForTest(Config{WebhookURL: server.URL}) err := n.Send(context.Background(), "team@example.com", "Renewal Failed", "Certificate mc-api-prod renewal failed after 3 attempts") if err != nil { t.Fatalf("unexpected error: %v", err) @@ -70,7 +71,7 @@ func TestTeams_SendHTTPError(t *testing.T) { })) defer server.Close() - n := New(Config{WebhookURL: server.URL}) + n := newForTest(Config{WebhookURL: server.URL}) err := n.Send(context.Background(), "", "Test", "body") if err == nil { t.Fatal("expected error, got nil") @@ -81,7 +82,7 @@ func TestTeams_SendHTTPError(t *testing.T) { } func TestTeams_SendConnectionError(t *testing.T) { - n := New(Config{WebhookURL: "http://127.0.0.1:1"}) + n := newForTest(Config{WebhookURL: "http://127.0.0.1:1"}) err := n.Send(context.Background(), "", "Test", "body") if err == nil { t.Fatal("expected connection error, got nil")