mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 20:01:31 +00:00
e3196e7b50
Final PR in the six-commit M-2 sequence (PR-A: CertificateService cluster, cdc9d03; PR-B: IssuerService + TargetService, eb14236; PR-C: Policy/Profile/Owner/Team, 2497be4; PR-D: Job/Notification/Audit, ccd89c3; PR-E: AgentService, 283ec27; PR-F: this commit). PR-A through PR-E collapsed the service-layer shim methods and deleted every in-production context.Background() / context.TODO() call from internal/service/; this PR completes the sweep across the non-service tiers (HTTP middleware + ACME connector) and wires the contextcheck linter so regressions fail CI. Three narrow edits land the D-3 pattern (context.WithoutCancel for subsidiary async writes and deferred shutdown contexts): - internal/api/middleware/audit.go -- async audit goroutine now runs on auditCtx := context.WithoutCancel(r.Context()) instead of context.Background(). Preserves request-scoped values (trace ID, auth) while detaching from the request's cancellation so the audit write does not get killed when the response completes. Goroutine is still tracked via a.wg (M-1 shutdown drain) so Flush(ctx) behaviour is unchanged. CWE-770 Missing Release (goroutine leak potential) + CWE-400 Resource Exhaustion (missed cancellation propagation). - internal/api/middleware/middleware.go -- Recovery panic path now logs via slog.ErrorContext(ctx, ...) instead of log.Printf. Request-scoped trace/auth metadata now carries through the panic log, matching every other request log. D-3 non-bypass: the context is r.Context() captured before the defer, so even a panic mid-handler propagates the ctx's trace ID into the ERROR log line. - internal/connector/issuer/acme/acme.go (HTTP-01 challenge server shutdown) -- defer shutdown context derived from context.WithTimeout(context.WithoutCancel(ctx), 5s) instead of context.Background(). Preserves parent ctx values, detaches from parent cancellation so Shutdown always gets its full 5-second budget even when the parent was cancelled.
Matches the same pattern applied in ACME's solveAuthorizationsDNS01 and solveAuthorizationsDNSPersist01. Linter wiring: .golangci.yml adds `contextcheck` to the enabled set. golangci-lint v2.11.4 now fails CI on any function that takes a context.Context parameter but calls into context.Background() or context.TODO() instead of propagating -- regression guard for all five prior PRs. Verification (CI parity, GOCACHE=/tmp/gocache GOMODCACHE=/tmp/gomodcache GOLANGCI_LINT_CACHE=/tmp/lintcache): - go build ./... -> 0 - go vet ./... -> 0 - golangci-lint run (contextcheck enabled) -> 0 issues - go test -race -short ./internal/api/middleware/... -> PASS - go test -race -short ./internal/scheduler/... -> PASS - go test -race -short ./internal/connector/issuer/acme/... -> PASS - go test -race -short ./internal/service/... -> PASS - rg "context\.(Background|TODO)\(\)" internal/service/ internal/scheduler/ internal/connector/ internal/api/middleware/ -> 0 non-test hits (one pedagogical godoc reference in audit.go documenting why context.Background() would be wrong remains intentional) Wire-format invariants preserved: 0 API routes, 0 SQL migrations, 0 frontend bytes, 0 OpenAPI bytes, 0 connector interface signature changes, 0 new env vars, 0 new external dependencies (pure context stdlib). The AuditRecorder interface signature, the body-hash algorithm (SHA-256 16 hex chars), the excluded-path short-circuit, the actor-extraction path, the responseWriter status-capture wrapper, the AuditServiceAdapter, and all 116 API routes under /api/v1/, /.well-known/est/, /scep, /health, /auth are byte-identical. M-2 aggregate across PR-A through PR-F: 57 files, +635 / -613 (PR-A 12f +227/-237, PR-B 9f +150/-146, PR-C 17f +156/-148, PR-D 11f +67/-63, PR-E 4f +9/-15, PR-F 4f +26/-4). With M-2 closed, 8 of 10 Medium findings resolved; M-9, M-10, L-1..L-4, I-1..I-8 remain post-v2.1.0 hardening batch. Audit complete. Commit:1f6cf0eafa. Sections: 12. Findings: 2/7/10/4/6.
234 lines
8.8 KiB
Go
234 lines
8.8 KiB
Go
package middleware
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// AuditRecorder is the sink the audit middleware hands each observed API call
// to. Declaring it here, on the consumer side, keeps this package from
// importing the service package directly (dependency inversion).
//
// RecordAPICall may block on I/O such as a database write. The middleware
// calls it from a goroutine it tracks, which lets AuditMiddleware.Flush wait
// for in-flight recordings during graceful shutdown.
type AuditRecorder interface {
	RecordAPICall(
		ctx context.Context,
		method, path, actor, bodyHash string,
		status int,
		latencyMs int64,
	) error
}
|
|
|
|
// AuditConfig carries the tunables for the API audit logging middleware.
type AuditConfig struct {
	// ExcludePaths lists path prefixes that bypass audit logging entirely,
	// for example "/health" or "/ready".
	ExcludePaths []string

	// Logger receives errors raised by the audit middleware. A failed audit
	// recording is reported here and must never break the request itself.
	// NewAuditLog substitutes slog.Default() when this is nil.
	Logger *slog.Logger
}
|
|
|
|
// ErrAuditFlushTimeout is the sentinel AuditMiddleware.Flush returns when the
// supplied context is cancelled, or its deadline passes, while audit
// recordings are still in flight. It plays the same role as
// scheduler.ErrSchedulerShutdownTimeout, giving callers one consistent way to
// detect a graceful-shutdown timeout across subsystems. Match it with
// errors.Is: Flush wraps it together with the context's own error.
var ErrAuditFlushTimeout = errors.New("audit middleware flush timeout")
|
|
|
|
// AuditMiddleware is the handle returned by NewAuditLog. It wraps the audit
|
|
// logging HTTP middleware and tracks the goroutines spawned to record each API
|
|
// call, so that callers can drain them during graceful shutdown (M-1, CWE-662
|
|
// / CWE-400). The goroutines themselves still run detached from the request
|
|
// context — the shutdown-drain signal flows through this struct's WaitGroup
|
|
// instead of the per-request context.
|
|
type AuditMiddleware struct {
|
|
recorder AuditRecorder
|
|
logger *slog.Logger
|
|
excludeSet map[string]bool
|
|
|
|
// wg tracks every audit-recording goroutine spawned by Middleware so Flush
|
|
// can block until they complete before the DB pool is torn down.
|
|
wg sync.WaitGroup
|
|
}
|
|
|
|
// NewAuditLog constructs the API audit logging middleware. The returned
|
|
// *AuditMiddleware exposes the HTTP middleware via the Middleware method value
|
|
// (same func(http.Handler) http.Handler shape) and a Flush method that the
|
|
// process shutdown path must call after the HTTP server has stopped accepting
|
|
// new requests but before the audit recorder's backing store (e.g., the
|
|
// database connection pool) is closed.
|
|
//
|
|
// The middleware records method, path, authenticated actor, request body hash,
|
|
// response status, and latency. Recording is best-effort — individual failures
|
|
// are logged and do not affect the HTTP response. Shutdown is NOT best-effort:
|
|
// Flush must succeed (or time out, returning ErrAuditFlushTimeout) so that
|
|
// in-flight events are not lost when the audit recorder's connection pool is
|
|
// closed out from under the goroutines.
|
|
func NewAuditLog(recorder AuditRecorder, cfg AuditConfig) *AuditMiddleware {
|
|
excludeSet := make(map[string]bool, len(cfg.ExcludePaths))
|
|
for _, p := range cfg.ExcludePaths {
|
|
excludeSet[p] = true
|
|
}
|
|
|
|
logger := cfg.Logger
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
|
|
return &AuditMiddleware{
|
|
recorder: recorder,
|
|
logger: logger,
|
|
excludeSet: excludeSet,
|
|
}
|
|
}
|
|
|
|
// Middleware is the http.Handler wrapper. It has the standard
|
|
// func(http.Handler) http.Handler middleware signature so it can be composed
|
|
// into an existing middleware chain via a method value (auditMiddleware.Middleware).
|
|
func (a *AuditMiddleware) Middleware(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
// Skip excluded paths (health, readiness probes)
|
|
for prefix := range a.excludeSet {
|
|
if strings.HasPrefix(r.URL.Path, prefix) {
|
|
next.ServeHTTP(w, r)
|
|
return
|
|
}
|
|
}
|
|
|
|
start := time.Now()
|
|
|
|
// Hash request body for audit (don't store raw bodies — security + size concerns)
|
|
bodyHash := ""
|
|
if r.Body != nil && r.Body != http.NoBody {
|
|
hasher := sha256.New()
|
|
body, err := io.ReadAll(r.Body)
|
|
if err == nil && len(body) > 0 {
|
|
hasher.Write(body)
|
|
bodyHash = hex.EncodeToString(hasher.Sum(nil))[:16] // truncated hash
|
|
// Restore the body for downstream handlers
|
|
r.Body = io.NopCloser(strings.NewReader(string(body)))
|
|
}
|
|
}
|
|
|
|
// Extract actor from auth context
|
|
actor := "anonymous"
|
|
if user, ok := GetUser(r.Context()); ok && user != "" {
|
|
actor = user
|
|
}
|
|
|
|
// Wrap response writer to capture status code
|
|
wrapped := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
|
|
|
|
next.ServeHTTP(wrapped, r)
|
|
|
|
latency := time.Since(start).Milliseconds()
|
|
|
|
// Snapshot request-derived inputs so the goroutine does not race with
|
|
// the http.Server reusing r after this handler returns.
|
|
method := r.Method
|
|
path := r.URL.Path
|
|
status := wrapped.statusCode
|
|
|
|
// Derive a detached context that preserves request-scoped values
|
|
// (trace IDs, auth info carried via context keys) but is not cancelled
|
|
// when the HTTP server finalizes the request. Using r.Context()
|
|
// directly would cause the async audit write to observe ctx.Done()
|
|
// as soon as the response completes; using context.Background() would
|
|
// discard useful observability metadata. WithoutCancel gives us both
|
|
// (M-2 / D-3).
|
|
auditCtx := context.WithoutCancel(r.Context())
|
|
|
|
// Record audit event asynchronously (best-effort, don't block response).
|
|
// SECURITY: We intentionally use r.URL.Path (not r.URL.String() or r.RequestURI)
|
|
// to prevent query parameters from being recorded in the immutable audit trail.
|
|
// Query strings may contain cursor tokens, API keys passed as params, or other
|
|
// sensitive filter values. Since the audit trail is append-only with no deletion
|
|
// capability, any sensitive data recorded would persist permanently.
|
|
//
|
|
// The goroutine is tracked in a.wg so AuditMiddleware.Flush can drain
|
|
// in-flight recordings during graceful shutdown. Without this (M-1,
|
|
// CWE-662 / CWE-400), SIGTERM would close the DB pool while recordings
|
|
// were still mid-flight, silently dropping audit events.
|
|
a.wg.Add(1)
|
|
go func() {
|
|
defer a.wg.Done()
|
|
if err := a.recorder.RecordAPICall(
|
|
auditCtx,
|
|
method,
|
|
path,
|
|
actor,
|
|
bodyHash,
|
|
status,
|
|
latency,
|
|
); err != nil {
|
|
a.logger.Error("failed to record API audit event",
|
|
"error", err,
|
|
"method", method,
|
|
"path", path,
|
|
)
|
|
}
|
|
}()
|
|
})
|
|
}
|
|
|
|
// Flush blocks until every audit-recording goroutine spawned by Middleware has
|
|
// completed, or until ctx is cancelled / its deadline elapses. It must be
|
|
// called from the process shutdown path after http.Server.Shutdown has
|
|
// returned (so no new requests are being accepted) but before the backing
|
|
// audit recorder's resources (DB pool, etc.) are torn down.
|
|
//
|
|
// On timeout or cancellation Flush returns ErrAuditFlushTimeout wrapped with
|
|
// any context error; in-flight goroutines continue to run and may still write
|
|
// to the recorder once they unblock — the caller is responsible for deciding
|
|
// whether to proceed with teardown anyway or surface the error.
|
|
//
|
|
// Flush mirrors the idiom used by scheduler.Scheduler.WaitForCompletion so
|
|
// that the two subsystems drain identically at shutdown.
|
|
func (a *AuditMiddleware) Flush(ctx context.Context) error {
|
|
done := make(chan struct{})
|
|
go func() {
|
|
a.wg.Wait()
|
|
close(done)
|
|
}()
|
|
|
|
select {
|
|
case <-done:
|
|
a.logger.Info("audit middleware flush complete")
|
|
return nil
|
|
case <-ctx.Done():
|
|
a.logger.Warn("audit middleware flush did not complete before context cancellation",
|
|
"error", ctx.Err(),
|
|
)
|
|
return fmt.Errorf("%w: %w", ErrAuditFlushTimeout, ctx.Err())
|
|
}
|
|
}
|
|
|
|
// AuditServiceAdapter bridges the AuditService to the AuditRecorder
// interface, so the middleware never has to import the service package.
type AuditServiceAdapter struct {
	// recordFn is the callback each translated API call is forwarded to.
	recordFn func(
		ctx context.Context,
		actor, actorType, action, resourceType, resourceID string,
		details map[string]any,
	) error
}
|
|
|
|
// NewAuditServiceAdapter creates an adapter that bridges the middleware AuditRecorder
|
|
// interface to the service layer's RecordEvent method.
|
|
func NewAuditServiceAdapter(recordFn func(ctx context.Context, actor string, actorType string, action string, resourceType string, resourceID string, details map[string]interface{}) error) *AuditServiceAdapter {
|
|
return &AuditServiceAdapter{recordFn: recordFn}
|
|
}
|
|
|
|
// RecordAPICall implements AuditRecorder by translating API call data into an audit event.
|
|
func (a *AuditServiceAdapter) RecordAPICall(ctx context.Context, method, path, actor string, bodyHash string, status int, latencyMs int64) error {
|
|
details := map[string]interface{}{
|
|
"method": method,
|
|
"path": path,
|
|
"body_hash": bodyHash,
|
|
"status": status,
|
|
"latency_ms": latencyMs,
|
|
}
|
|
|
|
action := fmt.Sprintf("api_%s", strings.ToLower(method))
|
|
return a.recordFn(ctx, actor, "User", action, "api", path, details)
|
|
}
|