mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:01:32 +00:00
fix(bundle-5): Operational Liveness + Bootstrap — 4 audit findings closed
Closes Audit-2026-04-25 H-006 (High), H-007 (High), M-011 (Medium),
L-006 (Low — verified-already-closed via C-1 master closure in v2.0.54).
Hardens the orchestrator-facing surface — k8s probes, agent enrollment,
shutdown audit drain, scheduler config plumbing.
What changed
- internal/api/handler/health.go — split contract:
* /health stays shallow 200 (k8s liveness — process alive)
* /ready accepts *sql.DB; runs db.PingContext(2s); 503 on failure
* Nil DB path returns 200 + db=not_configured (test fixtures)
- internal/api/handler/agent_bootstrap.go (NEW) — verifyBootstrapToken:
* empty expected = warn-mode pass-through
* non-empty = `Authorization: Bearer <token>` required
* crypto/subtle.ConstantTimeCompare; length-mismatch path runs dummy
compare to keep timing uniform
* ErrBootstrapTokenInvalid sentinel
- internal/api/handler/agents.go — RegisterAgent calls verifyBootstrapToken
BEFORE body parse so unauth probes don't even allocate a JSON decoder
- internal/config/config.go — two new env vars:
* CERTCTL_AGENT_BOOTSTRAP_TOKEN (Auth.AgentBootstrapToken)
* CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS (Server.AuditFlushTimeoutSeconds)
- cmd/server/main.go — 4 changes:
* pass *sql.DB into NewHealthHandler (H-006)
* pass cfg.Auth.AgentBootstrapToken into NewAgentHandler (H-007)
* configurable shutdown audit-flush timeout (M-011)
* one-shot startup WARN when bootstrap token unset (deprecation)
- new tests: agent_bootstrap_test.go (full deny/accept/warn-mode coverage,
constant-time compare path, length-mismatch); health_test.go extended
with /ready DB-probe failure (503), nil-DB pass-through, and shallow /health checks
L-006 verified
- cmd/server/main.go:557 already calls
sched.SetShortLivedExpiryCheckInterval(cfg.Scheduler.ShortLivedExpiryCheckInterval)
per the C-1 master closure in v2.0.54. Bundle 5 confirms; no code change.
Threat model: TB-1 (operator/orchestrator), TB-2 (Agent↔Server).
- CWE-754 (Improper Check for Unusual or Exceptional Conditions) for H-006
- CWE-306 (Missing Authentication for Critical Function) + CWE-288 (Authentication Bypass Using an Alternate Path or Channel) for H-007
Verification
- go vet ./... → clean
- go build ./... → clean
- go test -short -count=1 ./... → all packages pass
- targeted Bundle-5 regressions → all pass
- npx tsc --noEmit (web) → clean
- npx vitest run (web) → in-flight (sandbox 45s
ceiling exceeded; no failure markers in dot stream; no frontend
changes in this bundle so no regression risk)
- python3 yaml.safe_load(api/openapi.yaml) → 89 paths
Backward compatibility
- Bootstrap token defaults to empty (warn-mode) — existing demo
deployments unaffected. Server logs deprecation WARN; v2.2.0 will
require it.
- Audit flush timeout default 30s preserves prior behaviour.
- Helm chart already routes readiness probe to /ready (no chart change
needed); now /ready actually probes the DB.
Bundle 5 of the 2026-04-25 comprehensive audit.
This commit is contained in:
+33
-4
@@ -69,6 +69,19 @@ func main() {
|
||||
"server_host", cfg.Server.Host,
|
||||
"server_port", cfg.Server.Port)
|
||||
|
||||
// Bundle-5 / Audit H-007: deprecation WARN when the agent bootstrap
|
||||
// token is unset. Pre-Bundle-5 there was no token at all; the v2.0.x
|
||||
// default keeps the warn-mode pass-through so existing demo deploys
|
||||
// keep working, but operators must set CERTCTL_AGENT_BOOTSTRAP_TOKEN
|
||||
// before v2.2.0 lands. This is a one-shot startup line — the
|
||||
// per-request path stays silent so a busy registration endpoint
|
||||
// doesn't flood the log.
|
||||
if cfg.Auth.AgentBootstrapToken == "" {
|
||||
logger.Warn("agent bootstrap token unset (CERTCTL_AGENT_BOOTSTRAP_TOKEN) — agents may self-register without authentication; this default will become deny-by-default in v2.2.0; generate one with: openssl rand -hex 32")
|
||||
} else {
|
||||
logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)")
|
||||
}
|
||||
|
||||
// Initialize database connection pool
|
||||
db, err := postgres.NewDB(cfg.Database.URL)
|
||||
if err != nil {
|
||||
@@ -433,7 +446,7 @@ func main() {
|
||||
certificateHandler := handler.NewCertificateHandler(certificateService)
|
||||
issuerHandler := handler.NewIssuerHandler(issuerService)
|
||||
targetHandler := handler.NewTargetHandler(targetService)
|
||||
agentHandler := handler.NewAgentHandler(agentService)
|
||||
agentHandler := handler.NewAgentHandler(agentService, cfg.Auth.AgentBootstrapToken)
|
||||
jobHandler := handler.NewJobHandler(jobService)
|
||||
policyHandler := handler.NewPolicyHandler(policyService)
|
||||
// G-1: RenewalPolicyHandler — /api/v1/renewal-policies CRUD. Value-returning
|
||||
@@ -448,7 +461,9 @@ func main() {
|
||||
notificationHandler := handler.NewNotificationHandler(notificationService)
|
||||
statsHandler := handler.NewStatsHandler(statsService)
|
||||
metricsHandler := handler.NewMetricsHandler(statsService, time.Now())
|
||||
healthHandler := handler.NewHealthHandler(cfg.Auth.Type)
|
||||
// Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB
|
||||
// connectivity via PingContext. /health stays shallow (liveness signal).
|
||||
healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db)
|
||||
// U-3 ride-along (cat-u-no_version_endpoint, P2): the version handler
|
||||
// answers GET /api/v1/version with build identity (ldflags Version,
|
||||
// VCS commit/dirty/timestamp, Go runtime version). Wired through the
|
||||
@@ -945,8 +960,22 @@ func main() {
|
||||
sig := <-sigChan
|
||||
logger.Info("received shutdown signal", "signal", sig.String())
|
||||
|
||||
// Graceful shutdown
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
// Graceful shutdown.
|
||||
//
|
||||
// Bundle-5 / Audit M-011: pre-Bundle-5 the timeout was hard-coded
|
||||
// 30s, so high-volume operators couldn't extend the audit-flush
|
||||
// window without forking the binary. Now configurable via
|
||||
// CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS (default 30s preserves prior
|
||||
// behaviour). The same context governs HTTP server shutdown +
|
||||
// scheduler completion + audit flush. WARN-log on deadline exceeded;
|
||||
// never exit hard — operator gets visibility, server still completes
|
||||
// shutdown.
|
||||
shutdownTimeout := time.Duration(cfg.Server.AuditFlushTimeoutSeconds) * time.Second
|
||||
if shutdownTimeout <= 0 {
|
||||
shutdownTimeout = 30 * time.Second
|
||||
}
|
||||
logger.Info("graceful shutdown budget", "timeout_seconds", int(shutdownTimeout/time.Second))
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout)
|
||||
defer shutdownCancel()
|
||||
|
||||
cancel() // Stop scheduler
|
||||
|
||||
Reference in New Issue
Block a user