fix: end-to-end certificate lifecycle bugs + integration test environment

Fixes 12 production bugs preventing the full issuance→deployment flow
from working with ACME (Pebble/Let's Encrypt) and step-ca issuers:

ACME connector (acme.go):
- Save orderURI before WaitOrder overwrites it (Go crypto/acme bug)
- Add CreateOrderCert fallback via WaitOrder+FetchCert
- Remove defer-reset in ValidateConfig that caused nil pointer panic
- Add Insecure TLS option for self-signed ACME servers (Pebble)

step-ca connector (stepca.go, jwe.go):
- Real JWE provisioner key loading + decryption (was using ephemeral keys)
- Fix JWT audience (/1.0/sign), sha claim (key fingerprint), kid header
- Custom root CA trust via RootCertPath config
- Remove hardcoded 90-day validity default (let step-ca decide)

NGINX target connector (nginx.go):
- Use sh -c for validate/reload commands (shell interpretation)
- Use filepath.Dir instead of fragile string slicing
- Add private key file writing (agent-mode keys were never deployed)
- Make chain_path write conditional

Server/service layer:
- TriggerRenewalWithActor now creates actual Job records (was no-op)
- createDeploymentJobs falls back to DB query when cert.TargetIDs empty
- ProcessPendingJobs skips agent-routed deployment jobs
- Agent cert pickup path parsing: len(parts)<4 → len(parts)<3
- Health/ready/auth-info endpoints bypass auth middleware
- Write timeout 15s→120s for ACME issuance
- Cert fingerprint computed on CSR submission

Integration test environment (deploy/test/):
- 10-phase test script covering Local CA, ACME, step-ca, revocation,
  discovery, renewal, and API spot checks
- Docker Compose with 7 containers (server, agent, postgres, nginx,
  pebble, challtestsrv, step-ca) on isolated network
- TLS verification checks SAN (not just Subject CN) for modern CA compat

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-02 17:02:20 -04:00
parent 2238f28610
commit b059ec930f
19 changed files with 2102 additions and 84 deletions
+33 -6
View File
@@ -112,6 +112,7 @@ func main() {
DNSPresentScript: os.Getenv("CERTCTL_ACME_DNS_PRESENT_SCRIPT"),
DNSCleanUpScript: os.Getenv("CERTCTL_ACME_DNS_CLEANUP_SCRIPT"),
DNSPersistIssuerDomain: os.Getenv("CERTCTL_ACME_DNS_PERSIST_ISSUER_DOMAIN"),
Insecure: cfg.ACME.Insecure,
}, logger)
logger.Info("initialized ACME issuer connector")
@@ -119,6 +120,7 @@ func main() {
// Uses the native /sign API with JWK provisioner authentication.
stepcaConnector := stepcaissuer.New(&stepcaissuer.Config{
CAURL: os.Getenv("CERTCTL_STEPCA_URL"),
RootCertPath: os.Getenv("CERTCTL_STEPCA_ROOT_CERT"),
ProvisionerName: os.Getenv("CERTCTL_STEPCA_PROVISIONER"),
ProvisionerKeyPath: os.Getenv("CERTCTL_STEPCA_KEY_PATH"),
ProvisionerPassword: os.Getenv("CERTCTL_STEPCA_PASSWORD"),
@@ -261,6 +263,8 @@ func main() {
certificateService.SetRevocationSvc(revocationSvc)
certificateService.SetCAOperationsSvc(caOperationsSvc)
certificateService.SetTargetRepo(targetRepo)
certificateService.SetJobRepo(jobRepo)
certificateService.SetKeygenMode(cfg.Keygen.Mode)
renewalService := service.NewRenewalService(certificateRepo, jobRepo, renewalPolicyRepo, profileRepo, auditService, notificationService, issuerRegistry, cfg.Keygen.Mode)
renewalService.SetTargetRepo(targetRepo)
deploymentService := service.NewDeploymentService(jobRepo, targetRepo, agentRepo, certificateRepo, auditService, notificationService)
@@ -504,13 +508,28 @@ func main() {
if _, err := os.Stat(webDir + "/index.html"); err != nil {
webDir = "./web"
}
// Health/ready routes bypass the full middleware stack (no auth required).
// These are registered on the inner router without auth, but the outer
// middleware chain wraps everything. Route them directly to the inner router.
noAuthHandler := middleware.Chain(apiRouter,
middleware.RequestID,
structuredLogger,
middleware.Recovery,
)
if _, err := os.Stat(webDir + "/index.html"); err == nil {
fileServer := http.FileServer(http.Dir(webDir))
finalHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
path := r.URL.Path
// API, health, and EST routes go to the API handler
if path == "/health" || path == "/ready" ||
(len(path) >= 8 && path[:8] == "/api/v1/") ||
// Health/ready and auth/info bypass auth middleware.
// Health/ready: Docker/K8s health probes don't carry Bearer tokens.
// auth/info: React app calls this before login to detect auth mode.
if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" {
noAuthHandler.ServeHTTP(w, r)
return
}
// All other API and EST routes go through the full middleware stack (with auth)
if (len(path) >= 8 && path[:8] == "/api/v1/") ||
(len(path) >= 16 && path[:16] == "/.well-known/est") {
apiHandler.ServeHTTP(w, r)
return
@@ -525,7 +544,15 @@ func main() {
})
logger.Info("dashboard available at /", "web_dir", webDir)
} else {
finalHandler = apiHandler
// No dashboard: route health/auth-info without auth, everything else through full stack
finalHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
path := r.URL.Path
if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" {
noAuthHandler.ServeHTTP(w, r)
return
}
apiHandler.ServeHTTP(w, r)
})
logger.Info("dashboard directory not found, serving API only")
}
@@ -534,9 +561,9 @@ func main() {
httpServer := &http.Server{
Addr: addr,
Handler: finalHandler,
ReadTimeout: 15 * time.Second,
ReadTimeout: 30 * time.Second,
ReadHeaderTimeout: 5 * time.Second,
WriteTimeout: 15 * time.Second,
WriteTimeout: 120 * time.Second, // Must accommodate ACME issuance (order + challenge + finalize)
IdleTimeout: 60 * time.Second,
}