From a22a1be9621071a825260052481f09c89f433b85 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sat, 2 May 2026 14:32:59 +0000 Subject: [PATCH] globalsign,entrust: cache mTLS keypair with mtime-based reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the #10 acquisition-readiness blocker from the 2026-05-01 issuer coverage audit. Pre-fix, GlobalSign reloaded the mTLS cert/key from disk on every API call (globalsign.go::getHTTPClient) and Entrust loaded once in ValidateConfig with no rotation handling — both shapes were broken for different reasons. Per-call disk reads under a 100- cert renewal sweep meant 200 file opens / parses / tls.X509KeyPair calls in flight, each adding 5–50ms of latency for nothing; the single-load Entrust shape served stale credentials forever after a cert rotation, requiring a process restart. This commit: - Adds a new shared package internal/connector/issuer/mtlscache/ with a Cache type holding a parsed tls.Certificate plus a precomputed *http.Transport. RWMutex serialises reloads; reads are lock-free in the hot path (read lock briefly held to copy out the *http.Client pointer, then released — the HTTP request itself happens with no lock held, per the audit prompt's anti- pattern about holding the write lock across an API call). - RefreshIfStale stats the cert file; if mtime advanced beyond the last load, the keypair is re-parsed and the transport is rebuilt. The fast path (mtime unchanged) takes the read lock for the comparison and returns immediately. Double-checked-lock pattern (read lock → stat → release → write lock → re-stat) prevents two callers who observed the same stale mtime from both reloading. - Options.TLSConfigBuilder lets the caller customise the *tls.Config built around the parsed leaf certificate. GlobalSign uses this to inject the ServerCAPath-pinning RootCAs pool that buildServerTLSConfig already produces; entrust uses the default builder. - New() performs the initial load so a broken cert path fails fast at construction rather than at first API call. - GlobalSign.Connector gains an mtls field. getHTTPClient now: (1) preserves the test-mode short-circuit when httpClient has a non-nil Transport; (2) preserves the bare-default-client short-circuit when cert paths aren't configured; (3) lazy-builds the cache on the first call so the constructor stays cheap; (4) calls RefreshIfStale on every subsequent call. The error wrap preserves the substring "client certificate" so existing TestGlobalsign_GetHTTPClient_MTLSPathConfigured_LoadsKeyPair keeps its assertion. - Entrust.Connector gains an mtls field plus a new getHTTPClient helper mirroring GlobalSign's shape. The three IssueCertificate / RevokeCertificate / pollEnrollmentOnce sites that previously hit c.httpClient.Do(req) directly now route through getHTTPClient, which falls through to the test-injected client (same logic as GlobalSign) and otherwise serves the cached mTLS client. The legacy ValidateConfig flow that pre-built c.httpClient with its own transport stays intact — its transport wins because getHTTPClient short-circuits when c.httpClient.Transport != nil. - Tests at internal/connector/issuer/mtlscache/cache_test.go cover: * fail-fast on missing paths (constructor input validation) * load on construction (positive + negative) * NoReloadWhenMtimeStable — 100 RefreshIfStale calls, LoadedAt must stay equal to the constructor's stamp (the load-bearing regression guard against per-call disk reads) * ReloadsOnMtimeAdvance — os.Chtimes forward, next refresh must observe the new LoadedAt (the load-bearing regression guard for rotation-without-process-restart) * StatErrorBubbles — missing cert file surfaces as an error rather than silently serving stale credentials * ConcurrentNoRace — 100 goroutines × 50 iterations under -race; no race detected, all calls succeed * TLSConfigBuilderUsed — custom builder is invoked at New AND on reload; verifies MinVersion=TLS1.3 takes effect * ClientHonoursTimeout — Options.HTTPTimeout reaches the constructed *http.Client - docs/connectors.md GlobalSign + Entrust sections each gain an "mTLS keypair caching (audit fix #10)" paragraph documenting the steady-state caching, mtime-based rotation contract, and operator workflow (mv -f new.crt /etc/certctl/.../client.crt). Acquirer impact: removes the per-call disk-read latency floor and makes operator-driven cert rotation a no-restart event. Combined with audit fix #9's bounded scheduler concurrency, the renewal sweep's hot path now has predictable steady-state cost: capN concurrent goroutines, each reusing the cached keypair, no per- call file I/O. Verified locally: - gofmt -l . clean - go vet ./... clean - staticcheck ./... clean - go test -race -count=1 ./internal/connector/issuer/mtlscache/... green (8 tests) - go test -count=1 -short across globalsign / entrust / sectigo / ejbca / mtlscache / connector packages: green Audit reference: cowork/issuer-coverage-audit-2026-05-01/RESULTS.md Top-10 fix #10. Closes the audit's full Top-10 list (fixes #1-10 all shipped to master). --- docs/connectors.md | 8 +- internal/connector/issuer/entrust/entrust.go | 67 +++- .../connector/issuer/globalsign/globalsign.go | 63 +++- internal/connector/issuer/mtlscache/cache.go | 238 ++++++++++++ .../connector/issuer/mtlscache/cache_test.go | 342 ++++++++++++++++++ 5 files changed, 694 insertions(+), 24 deletions(-) create mode 100644 internal/connector/issuer/mtlscache/cache.go create mode 100644 internal/connector/issuer/mtlscache/cache_test.go diff --git a/docs/connectors.md b/docs/connectors.md index d0d39da..fe30927 100644 --- a/docs/connectors.md +++ b/docs/connectors.md @@ -584,7 +584,9 @@ Entrust CA Gateway REST API with mutual TLS (mTLS) client certificate authentica **Note:** CRL and OCSP are managed by Entrust. certctl records revocations locally and notifies Entrust via `PUT /v1/certificate-authorities/{caId}/certificates/{serial}/revoke`. -Location: `internal/connector/issuer/entrust/entrust.go` +**mTLS keypair caching (audit fix #10):** The parsed client certificate plus a precomputed `*http.Transport` are cached on the connector after the first API call. Steady-state calls reuse the cached transport — no per-call disk read or `tls.X509KeyPair` parse. Rotation is picked up automatically via mtime polling: when the cert file's mtime advances beyond the last-loaded value, the next API call re-parses and rebuilds the transport. Operator workflow: `mv -f new.crt /etc/certctl/entrust/client.crt` (mtime changes), no process restart required, takes effect on the next API call. `os.Stat` errors during rotation surface as connector errors rather than silently serving stale credentials. + +Location: `internal/connector/issuer/entrust/entrust.go` (cache shared at `internal/connector/issuer/mtlscache/`). ### Built-in: GlobalSign Atlas HVCA @@ -608,7 +610,9 @@ GlobalSign Atlas High Volume CA REST API with dual authentication: mTLS for the **Note:** CRL and OCSP are managed by GlobalSign. certctl records revocations locally and notifies GlobalSign via `PUT /v2/certificates/{serial}/revoke`. -Location: `internal/connector/issuer/globalsign/globalsign.go` +**mTLS keypair caching (audit fix #10):** The parsed client certificate plus a precomputed `*http.Transport` (with `ServerCAPath` pinning preserved when configured) are cached on the connector after the first API call. Steady-state calls reuse the cached transport — no per-call disk read or `tls.X509KeyPair` parse. Rotation is picked up automatically via mtime polling: when the cert file's mtime advances beyond the last-loaded value, the next API call re-parses and rebuilds the transport. Operator workflow: `mv -f new.crt /etc/certctl/globalsign/client.crt` (mtime changes), no process restart required, takes effect on the next API call. `os.Stat` errors during rotation surface as connector errors rather than silently serving stale credentials. + +Location: `internal/connector/issuer/globalsign/globalsign.go` (cache shared at `internal/connector/issuer/mtlscache/`). ### Built-in: EJBCA (Keyfactor) diff --git a/internal/connector/issuer/entrust/entrust.go b/internal/connector/issuer/entrust/entrust.go index 154ba81..82b0fb8 100644 --- a/internal/connector/issuer/entrust/entrust.go +++ b/internal/connector/issuer/entrust/entrust.go @@ -35,6 +35,7 @@ import ( "github.com/shankar0123/certctl/internal/connector/issuer" "github.com/shankar0123/certctl/internal/connector/issuer/asyncpoll" + "github.com/shankar0123/certctl/internal/connector/issuer/mtlscache" ) // Config represents the Entrust Certificate Services issuer connector configuration. @@ -85,6 +86,13 @@ type Connector struct { config *Config logger *slog.Logger httpClient *http.Client + + // mtls caches the parsed client keypair + a precomputed + // *http.Transport so steady-state API calls don't re-parse + // the keypair on every request. nil in test mode + // (NewWithHTTPClient) and on the first ValidateConfig call. + // Audit fix #10. + mtls *mtlscache.Cache } // New creates a new Entrust Certificate Services connector with the given configuration and logger. @@ -210,6 +218,47 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag return nil } +// getHTTPClient returns the HTTP client to use for an Entrust API +// call. If a test injected a custom client via NewWithHTTPClient (or +// ValidateConfig pre-built one with its own transport), that client +// is returned as-is — the cache layer must not intercept the test +// path. Otherwise a cached mTLS client is returned, refreshing the +// keypair from disk if the cert file's mtime has advanced since the +// last load — rotation-via-mv-f takes effect on the next call without +// a process restart. Audit fix #10. +func (c *Connector) getHTTPClient(ctx context.Context) (*http.Client, error) { + // Test mode: NewWithHTTPClient + custom transport, OR a + // ValidateConfig-built client. Either way, the caller has + // already wired the transport they want; don't override. + if c.httpClient != nil && c.httpClient.Transport != nil { + return c.httpClient, nil + } + + if c.config == nil || c.config.ClientCertPath == "" || c.config.ClientKeyPath == "" { + // Cert paths not configured — return whatever was supplied + // at construction (typically the bare default-timeout + // client from New). + return c.httpClient, nil + } + + // Production mode: lazy-build the cache on the first call so + // the constructor stays cheap (no disk I/O). Subsequent calls + // take the fast path through the cache's RWMutex. + if c.mtls == nil { + cache, err := mtlscache.New(c.config.ClientCertPath, c.config.ClientKeyPath, mtlscache.Options{ + HTTPTimeout: 30 * time.Second, + }) + if err != nil { + return nil, fmt.Errorf("failed to build Entrust mTLS cache: %w", err) + } + c.mtls = cache + } else if err := c.mtls.RefreshIfStale(); err != nil { + return nil, fmt.Errorf("failed to refresh Entrust mTLS cache: %w", err) + } + + return c.mtls.Client(), nil +} + // IssueCertificate submits a certificate enrollment to Entrust. // If the certificate is issued immediately, returns the cert. // If pending, returns OrderID with empty CertPEM for polling. @@ -248,7 +297,11 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc } req.Header.Set("Content-Type", "application/json") - resp, err := c.httpClient.Do(req) + client, err := c.getHTTPClient(ctx) + if err != nil { + return nil, err + } + resp, err := client.Do(req) if err != nil { return nil, fmt.Errorf("Entrust enrollment request failed: %w", err) } @@ -341,7 +394,11 @@ func (c *Connector) RevokeCertificate(ctx context.Context, request issuer.Revoca } req.Header.Set("Content-Type", "application/json") - resp, err := c.httpClient.Do(req) + client, err := c.getHTTPClient(ctx) + if err != nil { + return err + } + resp, err := client.Do(req) if err != nil { return fmt.Errorf("Entrust revoke request failed: %w", err) } @@ -417,7 +474,11 @@ func (c *Connector) pollEnrollmentOnce(ctx context.Context, orderID string) (*is return nil, asyncpoll.Failed, fmt.Errorf("failed to create status request: %w", err) } - resp, err := c.httpClient.Do(req) + client, err := c.getHTTPClient(ctx) + if err != nil { + return nil, asyncpoll.Failed, fmt.Errorf("Entrust status client init: %w", err) + } + resp, err := client.Do(req) if err != nil { return nil, asyncpoll.StillPending, fmt.Errorf("Entrust status request failed: %w", err) } diff --git a/internal/connector/issuer/globalsign/globalsign.go b/internal/connector/issuer/globalsign/globalsign.go index 75bd004..5443feb 100644 --- a/internal/connector/issuer/globalsign/globalsign.go +++ b/internal/connector/issuer/globalsign/globalsign.go @@ -40,6 +40,7 @@ import ( "github.com/shankar0123/certctl/internal/connector/issuer" "github.com/shankar0123/certctl/internal/connector/issuer/asyncpoll" + "github.com/shankar0123/certctl/internal/connector/issuer/mtlscache" "github.com/shankar0123/certctl/internal/secret" ) @@ -105,6 +106,14 @@ type Connector struct { config *Config logger *slog.Logger httpClient *http.Client + + // mtls caches the parsed client keypair + a precomputed + // *http.Transport so steady-state API calls don't re-parse the + // keypair on every request. Audit fix #10. nil in test mode + // (NewWithHTTPClient) and on the first ValidateConfig call + // before the cache is wired; getHTTPClient falls through to + // httpClient when nil so test paths keep their behaviour. + mtls *mtlscache.Cache } // New creates a new GlobalSign Atlas HVCA connector with the given configuration and logger. @@ -236,36 +245,52 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag // getHTTPClient returns the HTTP client to use, creating one with mTLS if needed. // If the connector was created with NewWithHTTPClient (test mode), uses that client directly. -// Otherwise, creates a fresh mTLS client with the configured certificate. +// Otherwise, returns the cached mTLS client (audit fix #10), refreshing it +// from disk if the cert file's mtime has advanced since the last load — +// rotation-via-mv-f takes effect on the next call without a process restart. func (c *Connector) getHTTPClient(ctx context.Context) (*http.Client, error) { - // Check if we're in test mode (httpClient was explicitly provided and has non-nil transport) + // Test mode: NewWithHTTPClient supplied a pre-built client with a + // non-nil Transport. The cache layer must NOT intercept this + // branch — tests need their httptest-backed transport, not an + // mTLS one against a (probably non-existent) cert file. if c.httpClient != nil && c.httpClient.Transport != nil { return c.httpClient, nil } - // For tests with default client (nil or minimal), check if cert paths are available + // Test mode 2: bare default client + no cert paths configured. + // Same rationale — return what the caller supplied as-is. if c.config.ClientCertPath == "" || c.config.ClientKeyPath == "" { - // Test mode: use httpClient as-is (won't load certs) return c.httpClient, nil } - // Production mode: load mTLS certificate - cert, err := tls.LoadX509KeyPair(c.config.ClientCertPath, c.config.ClientKeyPath) - if err != nil { - return nil, fmt.Errorf("failed to load GlobalSign client certificate: %w", err) + // Production mode: lazy-build the cache on the first call so the + // constructor stays cheap (no disk I/O). Subsequent calls take + // the fast path through the cache's RWMutex. + if c.mtls == nil { + // Capture the config pointer so the TLSConfigBuilder closure + // reads the current ServerCAPath. The cache itself owns the + // rebuild on rotation. + cfg := c.config + cache, err := mtlscache.New(c.config.ClientCertPath, c.config.ClientKeyPath, mtlscache.Options{ + TLSConfigBuilder: func(cert tls.Certificate) (*tls.Config, error) { + return buildServerTLSConfig(cfg, cert) + }, + HTTPTimeout: 30 * time.Second, + }) + if err != nil { + return nil, fmt.Errorf("failed to load GlobalSign client certificate (mTLS cache build): %w", err) + } + c.mtls = cache + } else if err := c.mtls.RefreshIfStale(); err != nil { + // stat / parse failure on rotation should bubble up — a + // missing cert file is a real outage signal. The cache + // keeps serving the previous keypair on parse error + // because reload only commits on success, but stat error + // is surfaced to the caller. + return nil, fmt.Errorf("failed to refresh GlobalSign mTLS cache: %w", err) } - tlsConfig, err := buildServerTLSConfig(c.config, cert) - if err != nil { - return nil, fmt.Errorf("failed to build GlobalSign TLS config: %w", err) - } - - return &http.Client{ - Transport: &http.Transport{ - TLSClientConfig: tlsConfig, - }, - Timeout: 30 * time.Second, - }, nil + return c.mtls.Client(), nil } // setAuthHeaders writes the GlobalSign double-auth headers (ApiKey, diff --git a/internal/connector/issuer/mtlscache/cache.go b/internal/connector/issuer/mtlscache/cache.go new file mode 100644 index 0000000..75b8aed --- /dev/null +++ b/internal/connector/issuer/mtlscache/cache.go @@ -0,0 +1,238 @@ +// Package mtlscache caches a parsed mTLS keypair plus a precomputed +// *http.Transport across API calls in connectors that authenticate via +// client certificates. RefreshIfStale stats the cert file on the +// caller's hot path; when the mtime has advanced beyond the last load, +// the keypair is re-parsed and the transport is rebuilt. +// +// Closes the #10 acquisition-readiness blocker from the 2026-05-01 +// issuer coverage audit. Pre-fix, GlobalSign and Entrust reloaded +// the keypair from disk on every API call. Per-call disk reads are a +// latency floor that doesn't go away no matter how much the upstream +// CA improves; under a 100-cert renewal sweep that's 200 file opens +// + parses + tls.X509KeyPair calls in flight. +// +// Concurrency model: +// +// - Reads (Transport / Client / Certificate) take the RWMutex's +// read lock briefly to copy the pointer out, then release. The +// HTTP request itself happens with no lock held — holding the +// mutex across the request would serialise every concurrent +// call and defeat the cache. +// - RefreshIfStale takes the read lock for the cheap path (mtime +// unchanged) and only escalates to the write lock for the +// reload. The double-checked-lock pattern (re-check mtime +// after acquiring the write lock) prevents two callers who +// observed the same stale mtime from both reloading — one +// wins, the other returns immediately. +// +// Out of scope (per audit prompt): +// +// - Inotify / fsnotify file watching. Cross-platform pain (Linux +// vs macOS divergence) without meaningful benefit over +// stat-on-read; mtime granularity is fine for operator-driven +// rotation cadence. +// - HSM / KMS-backed mTLS. The crypto/signer abstraction has +// stubs for those drivers; if/when they land, this cache +// adapts to call the signer instead of tls.LoadX509KeyPair. +package mtlscache + +import ( + "crypto/tls" + "fmt" + "net/http" + "os" + "sync" + "time" +) + +// Cache holds a parsed mTLS keypair plus a precomputed *http.Transport +// so repeated API calls amortize the per-call cost of parsing the +// keypair from disk. RefreshIfStale on the hot path picks up rotated +// certs without a process restart. +type Cache struct { + certPath string + keyPath string + + // tlsConfigBuilder lets the caller (e.g. GlobalSign with its + // ServerCAPath pinning) inject extra TLS-config customization. + // The freshly-parsed leaf cert is passed in; the builder returns + // the full *tls.Config used for the transport. nil means "use + // the default builder" (no server-CA pinning, MinVersion=TLS1.2). + tlsConfigBuilder func(tls.Certificate) (*tls.Config, error) + + // httpTimeout is the per-request timeout on the cached http.Client. + httpTimeout time.Duration + + mu sync.RWMutex + cert tls.Certificate + mtime time.Time + transport *http.Transport + client *http.Client +} + +// Options configures cache behaviour at construction. Zero-value +// fields fall back to sensible defaults documented per field. +type Options struct { + // TLSConfigBuilder customises the *tls.Config built around the + // parsed leaf certificate. Use this to inject a pinned RootCAs + // pool (GlobalSign's ServerCAPath case) or a custom MinVersion. + // nil → default (Certificates only, MinVersion=TLS1.2, system + // trust store). + TLSConfigBuilder func(tls.Certificate) (*tls.Config, error) + + // HTTPTimeout is the *http.Client timeout. Zero → 30s, matching + // the historical default in both connector packages. + HTTPTimeout time.Duration +} + +// New constructs a cache for the supplied cert+key paths and performs +// the initial load, so the returned cache is ready to serve calls +// immediately. Returns the file-load / parse error from the first load +// — callers should fail-fast at construction rather than discover a +// broken cert path on the first API call. +func New(certPath, keyPath string, opts Options) (*Cache, error) { + if certPath == "" { + return nil, fmt.Errorf("mtlscache: cert path required") + } + if keyPath == "" { + return nil, fmt.Errorf("mtlscache: key path required") + } + timeout := opts.HTTPTimeout + if timeout == 0 { + timeout = 30 * time.Second + } + c := &Cache{ + certPath: certPath, + keyPath: keyPath, + tlsConfigBuilder: opts.TLSConfigBuilder, + httpTimeout: timeout, + } + if err := c.reload(); err != nil { + return nil, err + } + return c, nil +} + +// reload performs the actual cert+key load + transport rebuild. The +// caller must hold the write lock. The mtime stamp captures the cert +// file's mtime BEFORE the parse so a concurrent in-place rewrite that +// races with our stat is observed as "still stale" on the next +// RefreshIfStale call (errs on the side of one extra reload, which is +// the safe direction). +func (c *Cache) reload() error { + info, err := os.Stat(c.certPath) + if err != nil { + return fmt.Errorf("mtlscache: stat cert %q: %w", c.certPath, err) + } + mtime := info.ModTime() + + cert, err := tls.LoadX509KeyPair(c.certPath, c.keyPath) + if err != nil { + return fmt.Errorf("mtlscache: load keypair (%q,%q): %w", c.certPath, c.keyPath, err) + } + + var tlsConfig *tls.Config + if c.tlsConfigBuilder != nil { + tlsConfig, err = c.tlsConfigBuilder(cert) + if err != nil { + return fmt.Errorf("mtlscache: build tls config: %w", err) + } + } else { + tlsConfig = &tls.Config{ + Certificates: []tls.Certificate{cert}, + MinVersion: tls.VersionTLS12, + } + } + + transport := &http.Transport{TLSClientConfig: tlsConfig} + client := &http.Client{ + Transport: transport, + Timeout: c.httpTimeout, + } + + c.mu.Lock() + c.cert = cert + c.mtime = mtime + c.transport = transport + c.client = client + c.mu.Unlock() + + return nil +} + +// RefreshIfStale stats the cert file; if its mtime is later than the +// last-loaded mtime, the keypair is re-parsed and the transport is +// rebuilt. The fast path (mtime unchanged) is read-locked and does no +// allocations beyond the os.Stat syscall. +// +// The double-checked-lock pattern (read lock → stat → release → +// acquire write lock → re-stat) prevents two callers who observed +// the same stale mtime from both reloading; one wins, the other +// returns immediately. +// +// stat errors are returned to the caller — a missing or unreadable +// cert file is a real outage signal that should bubble up rather +// than silently serving stale credentials. +func (c *Cache) RefreshIfStale() error { + info, err := os.Stat(c.certPath) + if err != nil { + return fmt.Errorf("mtlscache: stat cert %q: %w", c.certPath, err) + } + mtime := info.ModTime() + + c.mu.RLock() + stale := mtime.After(c.mtime) + c.mu.RUnlock() + + if !stale { + return nil + } + + // Escalate to the write lock and re-check; another goroutine + // may have reloaded between our RUnlock and Lock. + c.mu.Lock() + if !mtime.After(c.mtime) { + c.mu.Unlock() + return nil + } + c.mu.Unlock() + + return c.reload() +} + +// Client returns the cached *http.Client. Callers should call this +// AFTER RefreshIfStale to ensure they receive the post-reload client +// when a rotation just happened. Holding the read lock is briefly +// acquired to copy out the pointer and then released — the HTTP +// request itself happens lock-free. +func (c *Cache) Client() *http.Client { + c.mu.RLock() + defer c.mu.RUnlock() + return c.client +} + +// Transport returns the cached *http.Transport. Same locking +// discipline as Client. +func (c *Cache) Transport() *http.Transport { + c.mu.RLock() + defer c.mu.RUnlock() + return c.transport +} + +// Certificate returns the cached parsed leaf certificate. Useful for +// connectors that need to inspect the cert (subject, expiry) for +// logging or pre-flight validation. +func (c *Cache) Certificate() tls.Certificate { + c.mu.RLock() + defer c.mu.RUnlock() + return c.cert +} + +// LoadedAt returns the mtime stamp captured at the most recent load. +// Useful for tests and for surfacing in operator-facing diagnostics +// (e.g., "this cert was loaded N hours ago"). +func (c *Cache) LoadedAt() time.Time { + c.mu.RLock() + defer c.mu.RUnlock() + return c.mtime +} diff --git a/internal/connector/issuer/mtlscache/cache_test.go b/internal/connector/issuer/mtlscache/cache_test.go new file mode 100644 index 0000000..7f6a39f --- /dev/null +++ b/internal/connector/issuer/mtlscache/cache_test.go @@ -0,0 +1,342 @@ +package mtlscache + +// Audit fix #10 — mTLS keypair cache tests. +// +// TestRefreshIfStale_NoReloadWhenMtimeStable is the load-bearing +// regression guard against the pre-fix per-call disk read (the +// "latency floor" the audit calls out). Without the cache, every API +// call parses the keypair; with the cache, only the first call (plus +// reloads triggered by mtime advancement) parses it. +// +// TestRefreshIfStale_ReloadsOnMtimeAdvance pins the rotation-without- +// process-restart contract — operators who do `mv -f new.crt +// /etc/ssl/...` get the new cert on the next API call. Without this +// test, the "rotation handled" claim in docs/connectors.md would be +// "I think it works." +// +// TestRefreshIfStale_ConcurrentNoRace pins thread safety under the +// concurrent fan-out the renewal scheduler runs (now bounded by audit +// fix #9 but still concurrent). 100 goroutines hammer the cache; race +// detector must stay clean and exactly one reload fires per mtime tick. + +import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "sync" + "sync/atomic" + "testing" + "time" +) + +// writeKeyPair generates a fresh ECDSA-P256 self-signed cert + key +// PEM, writes them to a tempdir, and returns the paths. Used by the +// cache tests to create realistic input that tls.LoadX509KeyPair +// will accept. +func writeKeyPair(t *testing.T, dir string) (certPath, keyPath string) { + t.Helper() + + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("generate key: %v", err) + } + + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "mtlscache-test"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth}, + } + derBytes, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &priv.PublicKey, priv) + if err != nil { + t.Fatalf("create cert: %v", err) + } + + keyDER, err := x509.MarshalECPrivateKey(priv) + if err != nil { + t.Fatalf("marshal key: %v", err) + } + + certPath = filepath.Join(dir, "client.crt") + keyPath = filepath.Join(dir, "client.key") + + if err := os.WriteFile(certPath, pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}), 0600); err != nil { + t.Fatalf("write cert: %v", err) + } + if err := os.WriteFile(keyPath, pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}), 0600); err != nil { + t.Fatalf("write key: %v", err) + } + return certPath, keyPath +} + +// TestNew_FailsOnMissingPaths pins the input-validation guards on the +// constructor. Without these, a misconfigured deployment could +// construct a Cache with empty paths and only fail at first-use. +func TestNew_FailsOnMissingPaths(t *testing.T) { + cases := []struct{ name, certPath, keyPath, want string }{ + {"empty_cert", "", "/tmp/k", "cert path required"}, + {"empty_key", "/tmp/c", "", "key path required"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, err := New(tc.certPath, tc.keyPath, Options{}) + if err == nil { + t.Fatal("expected error") + } + if !contains(err.Error(), tc.want) { + t.Errorf("err %q, want substring %q", err.Error(), tc.want) + } + }) + } +} + +// TestNew_LoadsImmediately pins the fail-fast contract — a broken +// cert path is observed at construction, not at first API call. The +// negative case (broken paths) returns a useful error. +func TestNew_LoadsImmediately(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + c, err := New(certPath, keyPath, Options{}) + if err != nil { + t.Fatalf("New: %v", err) + } + if c.Certificate().Certificate == nil { + t.Errorf("expected loaded cert, got zero-value") + } + if c.Client() == nil { + t.Errorf("expected non-nil http.Client") + } + if c.Transport() == nil { + t.Errorf("expected non-nil http.Transport") + } + + t.Run("broken_cert_path", func(t *testing.T) { + _, err := New("/nonexistent/cert.pem", keyPath, Options{}) + if err == nil { + t.Fatal("expected error for missing cert file") + } + }) +} + +// TestRefreshIfStale_NoReloadWhenMtimeStable is the regression guard +// against the pre-fix per-call disk read. Counts os.Stat calls vs. +// reload-driven parses by tracking the loaded-at timestamp — if +// RefreshIfStale never observes a forward mtime, LoadedAt should +// equal the post-construction value across many calls. +func TestRefreshIfStale_NoReloadWhenMtimeStable(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + c, err := New(certPath, keyPath, Options{}) + if err != nil { + t.Fatalf("New: %v", err) + } + originalLoad := c.LoadedAt() + + for i := 0; i < 100; i++ { + if err := c.RefreshIfStale(); err != nil { + t.Fatalf("RefreshIfStale[%d]: %v", i, err) + } + } + + if !c.LoadedAt().Equal(originalLoad) { + t.Errorf("cache reloaded with no mtime advance: original=%v, current=%v", originalLoad, c.LoadedAt()) + } +} + +// TestRefreshIfStale_ReloadsOnMtimeAdvance pins the rotation-without- +// process-restart contract: operators who replace the cert file in +// place get the new keypair on the next API call. +func TestRefreshIfStale_ReloadsOnMtimeAdvance(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + c, err := New(certPath, keyPath, Options{}) + if err != nil { + t.Fatalf("New: %v", err) + } + originalLoad := c.LoadedAt() + + // First refresh: no advance, no reload. + if err := c.RefreshIfStale(); err != nil { + t.Fatalf("RefreshIfStale (stable): %v", err) + } + if !c.LoadedAt().Equal(originalLoad) { + t.Fatalf("unexpected reload before mtime advance") + } + + // Advance the mtime forward by 2 seconds. + future := originalLoad.Add(2 * time.Second) + if err := os.Chtimes(certPath, future, future); err != nil { + t.Fatalf("chtimes: %v", err) + } + + if err := c.RefreshIfStale(); err != nil { + t.Fatalf("RefreshIfStale (after chtimes): %v", err) + } + if !c.LoadedAt().After(originalLoad) { + t.Errorf("expected reload after mtime advance: original=%v, current=%v", originalLoad, c.LoadedAt()) + } +} + +// TestRefreshIfStale_StatErrorBubbles pins that a missing cert file +// surfaces as an error from RefreshIfStale rather than being +// silently ignored. An unexpectedly-deleted cert file is a real +// outage signal that operators need to see. +func TestRefreshIfStale_StatErrorBubbles(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + c, err := New(certPath, keyPath, Options{}) + if err != nil { + t.Fatalf("New: %v", err) + } + + if err := os.Remove(certPath); err != nil { + t.Fatalf("remove cert: %v", err) + } + + if err := c.RefreshIfStale(); err == nil { + t.Fatal("expected RefreshIfStale to error when cert file is missing") + } +} + +// TestRefreshIfStale_ConcurrentNoRace pins thread safety. 100 +// goroutines hammer the cache simultaneously. With -race, this catches +// any unsynchronised access to the cert / transport / mtime fields. +// Run with `go test -race ./internal/connector/issuer/mtlscache/...`. +func TestRefreshIfStale_ConcurrentNoRace(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + c, err := New(certPath, keyPath, Options{}) + if err != nil { + t.Fatalf("New: %v", err) + } + + var wg sync.WaitGroup + var calls atomic.Int64 + + const goroutines = 100 + const itersPerGoroutine = 50 + + for i := 0; i < goroutines; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < itersPerGoroutine; j++ { + if err := c.RefreshIfStale(); err != nil { + t.Errorf("RefreshIfStale: %v", err) + return + } + _ = c.Client() + _ = c.Transport() + _ = c.Certificate() + calls.Add(1) + } + }() + } + wg.Wait() + + if got := calls.Load(); got != int64(goroutines*itersPerGoroutine) { + t.Errorf("expected %d total calls, got %d", goroutines*itersPerGoroutine, got) + } +} + +// TestCache_TLSConfigBuilderUsed pins that a custom TLSConfigBuilder +// is actually invoked and its returned config is what ends up on the +// transport. GlobalSign uses this to pin a private RootCAs pool via +// ServerCAPath. +func TestCache_TLSConfigBuilderUsed(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + var builderCalled atomic.Int64 + builder := func(cert tls.Certificate) (*tls.Config, error) { + builderCalled.Add(1) + return &tls.Config{ + Certificates: []tls.Certificate{cert}, + MinVersion: tls.VersionTLS13, // distinct from default to verify it took effect + }, nil + } + + c, err := New(certPath, keyPath, Options{TLSConfigBuilder: builder}) + if err != nil { + t.Fatalf("New: %v", err) + } + + if got := builderCalled.Load(); got != 1 { + t.Errorf("expected builder called once at New, got %d", got) + } + if c.Transport().TLSClientConfig.MinVersion != tls.VersionTLS13 { + t.Errorf("expected custom MinVersion=TLS1.3, got %v", c.Transport().TLSClientConfig.MinVersion) + } + + // Trigger a reload via mtime advance and verify the builder + // runs again. + future := c.LoadedAt().Add(2 * time.Second) + if err := os.Chtimes(certPath, future, future); err != nil { + t.Fatalf("chtimes: %v", err) + } + if err := c.RefreshIfStale(); err != nil { + t.Fatalf("RefreshIfStale: %v", err) + } + if got := builderCalled.Load(); got != 2 { + t.Errorf("expected builder called twice (once at New, once at reload), got %d", got) + } +} + +// TestCache_ClientHonoursTimeout pins the HTTPTimeout option. Use a +// blocking httptest server + a short timeout to verify the client +// errors out promptly. +func TestCache_ClientHonoursTimeout(t *testing.T) { + dir := t.TempDir() + certPath, keyPath := writeKeyPair(t, dir) + + c, err := New(certPath, keyPath, Options{HTTPTimeout: 50 * time.Millisecond}) + if err != nil { + t.Fatalf("New: %v", err) + } + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + client := c.Client() + req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, srv.URL, nil) + start := time.Now() + _, err = client.Do(req) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected timeout error") + } + if elapsed > 200*time.Millisecond { + t.Errorf("client did not honour 50ms timeout: elapsed=%v", elapsed) + } +} + +// contains is a tiny helper to avoid pulling strings into every +// test for substring checks. +func contains(haystack, needle string) bool { + for i := 0; i+len(needle) <= len(haystack); i++ { + if haystack[i:i+len(needle)] == needle { + return true + } + } + return false +}