From cf632c0af42e148715a2ed1edf7355820c90d4ad Mon Sep 17 00:00:00 2001 From: Shankar Date: Thu, 2 Apr 2026 17:02:20 -0400 Subject: [PATCH] fix: end-to-end certificate lifecycle bugs + integration test environment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes 12 production bugs preventing the full issuance→deployment flow from working with ACME (Pebble/Let's Encrypt) and step-ca issuers: ACME connector (acme.go): - Save orderURI before WaitOrder overwrites it (Go crypto/acme bug) - Add CreateOrderCert fallback via WaitOrder+FetchCert - Remove defer-reset in ValidateConfig that caused nil pointer panic - Add Insecure TLS option for self-signed ACME servers (Pebble) step-ca connector (stepca.go, jwe.go): - Real JWE provisioner key loading + decryption (was using ephemeral keys) - Fix JWT audience (/1.0/sign), sha claim (key fingerprint), kid header - Custom root CA trust via RootCertPath config - Remove hardcoded 90-day validity default (let step-ca decide) NGINX target connector (nginx.go): - Use sh -c for validate/reload commands (shell interpretation) - Use filepath.Dir instead of fragile string slicing - Add private key file writing (agent-mode keys were never deployed) - Make chain_path write conditional Server/service layer: - TriggerRenewalWithActor now creates actual Job records (was no-op) - createDeploymentJobs falls back to DB query when cert.TargetIDs empty - ProcessPendingJobs skips agent-routed deployment jobs - Agent cert pickup path parsing: len(parts)<4 → len(parts)<3 - Health/ready/auth-info endpoints bypass auth middleware - Write timeout 15s→120s for ACME issuance - Cert fingerprint computed on CSR submission Integration test environment (deploy/test/): - 10-phase test script covering Local CA, ACME, step-ca, revocation, discovery, renewal, and API spot checks - Docker Compose with 7 containers (server, agent, postgres, nginx, pebble, challtestsrv, step-ca) on isolated network - TLS verification 
checks SAN (not just Subject CN) for modern CA compat Co-Authored-By: Claude Opus 4.6 --- cmd/server/main.go | 39 +- deploy/docker-compose.test.yml | 303 ++++++++ deploy/test/nginx-entrypoint.sh | 27 + deploy/test/nginx.conf | 42 ++ deploy/test/pebble-config.json | 16 + deploy/test/run-test.sh | 764 +++++++++++++++++++++ deploy/test/setup-trust.sh | 140 ++++ go.mod | 6 +- internal/api/handler/agents.go | 4 +- internal/config/config.go | 6 + internal/connector/issuer/acme/acme.go | 80 ++- internal/connector/issuer/stepca/jwe.go | 300 ++++++++ internal/connector/issuer/stepca/stepca.go | 148 +++- internal/connector/target/nginx/nginx.go | 46 +- internal/service/agent.go | 17 +- internal/service/certificate.go | 53 ++ internal/service/job.go | 10 + internal/service/renewal.go | 55 +- migrations/seed_test.sql | 130 ++++ 19 files changed, 2102 insertions(+), 84 deletions(-) create mode 100644 deploy/docker-compose.test.yml create mode 100755 deploy/test/nginx-entrypoint.sh create mode 100644 deploy/test/nginx.conf create mode 100644 deploy/test/pebble-config.json create mode 100755 deploy/test/run-test.sh create mode 100755 deploy/test/setup-trust.sh create mode 100644 internal/connector/issuer/stepca/jwe.go create mode 100644 migrations/seed_test.sql diff --git a/cmd/server/main.go b/cmd/server/main.go index 3947bbf..18ba773 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -112,6 +112,7 @@ func main() { DNSPresentScript: os.Getenv("CERTCTL_ACME_DNS_PRESENT_SCRIPT"), DNSCleanUpScript: os.Getenv("CERTCTL_ACME_DNS_CLEANUP_SCRIPT"), DNSPersistIssuerDomain: os.Getenv("CERTCTL_ACME_DNS_PERSIST_ISSUER_DOMAIN"), + Insecure: cfg.ACME.Insecure, }, logger) logger.Info("initialized ACME issuer connector") @@ -119,6 +120,7 @@ func main() { // Uses the native /sign API with JWK provisioner authentication. 
stepcaConnector := stepcaissuer.New(&stepcaissuer.Config{ CAURL: os.Getenv("CERTCTL_STEPCA_URL"), + RootCertPath: os.Getenv("CERTCTL_STEPCA_ROOT_CERT"), ProvisionerName: os.Getenv("CERTCTL_STEPCA_PROVISIONER"), ProvisionerKeyPath: os.Getenv("CERTCTL_STEPCA_KEY_PATH"), ProvisionerPassword: os.Getenv("CERTCTL_STEPCA_PASSWORD"), @@ -261,6 +263,8 @@ func main() { certificateService.SetRevocationSvc(revocationSvc) certificateService.SetCAOperationsSvc(caOperationsSvc) certificateService.SetTargetRepo(targetRepo) + certificateService.SetJobRepo(jobRepo) + certificateService.SetKeygenMode(cfg.Keygen.Mode) renewalService := service.NewRenewalService(certificateRepo, jobRepo, renewalPolicyRepo, profileRepo, auditService, notificationService, issuerRegistry, cfg.Keygen.Mode) renewalService.SetTargetRepo(targetRepo) deploymentService := service.NewDeploymentService(jobRepo, targetRepo, agentRepo, certificateRepo, auditService, notificationService) @@ -504,13 +508,28 @@ func main() { if _, err := os.Stat(webDir + "/index.html"); err != nil { webDir = "./web" } + // Health/ready routes bypass the full middleware stack (no auth required). + // These are registered on the inner router without auth, but the outer + // middleware chain wraps everything. Route them directly to the inner router. + noAuthHandler := middleware.Chain(apiRouter, + middleware.RequestID, + structuredLogger, + middleware.Recovery, + ) + if _, err := os.Stat(webDir + "/index.html"); err == nil { fileServer := http.FileServer(http.Dir(webDir)) finalHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { path := r.URL.Path - // API, health, and EST routes go to the API handler - if path == "/health" || path == "/ready" || - (len(path) >= 8 && path[:8] == "/api/v1/") || + // Health/ready and auth/info bypass auth middleware. + // Health/ready: Docker/K8s health probes don't carry Bearer tokens. + // auth/info: React app calls this before login to detect auth mode. 
+ if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" { + noAuthHandler.ServeHTTP(w, r) + return + } + // All other API and EST routes go through the full middleware stack (with auth) + if (len(path) >= 8 && path[:8] == "/api/v1/") || (len(path) >= 16 && path[:16] == "/.well-known/est") { apiHandler.ServeHTTP(w, r) return @@ -525,7 +544,15 @@ func main() { }) logger.Info("dashboard available at /", "web_dir", webDir) } else { - finalHandler = apiHandler + // No dashboard: route health/auth-info without auth, everything else through full stack + finalHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + if path == "/health" || path == "/ready" || path == "/api/v1/auth/info" { + noAuthHandler.ServeHTTP(w, r) + return + } + apiHandler.ServeHTTP(w, r) + }) logger.Info("dashboard directory not found, serving API only") } @@ -534,9 +561,9 @@ func main() { httpServer := &http.Server{ Addr: addr, Handler: finalHandler, - ReadTimeout: 15 * time.Second, + ReadTimeout: 30 * time.Second, ReadHeaderTimeout: 5 * time.Second, - WriteTimeout: 15 * time.Second, + WriteTimeout: 120 * time.Second, // Must accommodate ACME issuance (order + challenge + finalize) IdleTimeout: 60 * time.Second, } diff --git a/deploy/docker-compose.test.yml b/deploy/docker-compose.test.yml new file mode 100644 index 0000000..b4b895a --- /dev/null +++ b/deploy/docker-compose.test.yml @@ -0,0 +1,303 @@ +# ============================================================================= +# certctl Testing Environment — Docker Compose +# ============================================================================= +# +# Spins up the full certctl platform with real CA backends for manual QA: +# +# 1. PostgreSQL 16 — database (clean, no demo data) +# 2. certctl-server — control plane API + web dashboard on :8443 +# 3. certctl-agent — polls for work, deploys certs to NGINX +# 4. step-ca — private CA (JWK provisioner, auto-bootstraps) +# 5. 
Pebble — ACME test server (simulates Let's Encrypt) +# 6. pebble-challtestsrv — DNS/HTTP challenge test server for Pebble +# 7. NGINX — TLS target server on :8080 (HTTP) / :8444 (HTTPS) +# +# Usage: +# cd deploy +# docker compose -f docker-compose.test.yml up --build +# +# Dashboard: http://localhost:8443 +# API key: test-key-2026 +# NGINX: https://localhost:8444 (self-signed placeholder until cert deployed) +# +# See docs/test-env.md for the full walkthrough. +# ============================================================================= + +services: + + # --------------------------------------------------------------------------- + # Database + # --------------------------------------------------------------------------- + postgres: + image: postgres:16-alpine + container_name: certctl-test-postgres + environment: + POSTGRES_DB: certctl + POSTGRES_USER: certctl + POSTGRES_PASSWORD: testpass + volumes: + - test_postgres_data:/var/lib/postgresql/data + - ../migrations/000001_initial_schema.up.sql:/docker-entrypoint-initdb.d/001_schema.sql + - ../migrations/000002_agent_metadata.up.sql:/docker-entrypoint-initdb.d/002_agent_metadata.sql + - ../migrations/000003_certificate_profiles.up.sql:/docker-entrypoint-initdb.d/003_certificate_profiles.sql + - ../migrations/000004_agent_groups.up.sql:/docker-entrypoint-initdb.d/004_agent_groups.sql + - ../migrations/000005_revocation.up.sql:/docker-entrypoint-initdb.d/005_revocation.sql + - ../migrations/000006_discovery.up.sql:/docker-entrypoint-initdb.d/006_discovery.sql + - ../migrations/000007_network_discovery.up.sql:/docker-entrypoint-initdb.d/007_network_discovery.sql + - ../migrations/000008_verification.up.sql:/docker-entrypoint-initdb.d/008_verification.sql + - ../migrations/seed.sql:/docker-entrypoint-initdb.d/010_seed.sql + - ../migrations/seed_test.sql:/docker-entrypoint-initdb.d/015_seed_test.sql + # No seed_demo.sql — start with a clean database for real testing + networks: + certctl-test: + ipv4_address: 
10.30.50.2 + healthcheck: + test: ["CMD-SHELL", "pg_isready -U certctl -d certctl"] + interval: 5s + timeout: 5s + retries: 5 + restart: unless-stopped + + # --------------------------------------------------------------------------- + # Pebble — ACME test server (simulates Let's Encrypt) + # --------------------------------------------------------------------------- + # Pebble is the official ACME test server from Let's Encrypt (RFC 8555). + # It validates challenges via the companion challtestsrv. + # Root CA cert available at https://pebble:15000/roots/0 (management API). + pebble-challtestsrv: + image: ghcr.io/letsencrypt/pebble-challtestsrv:latest + container_name: certctl-test-challtestsrv + # ENTRYPOINT is /app (the binary). command: provides only the FLAGS. + # Matches the official Pebble docker-compose format. + # -doh "" disables DoH (default :8443 would conflict with certctl server). + # defaultIPv4 must point to the certctl-server (10.30.50.6) because that's where + # the ACME HTTP-01 challenge server runs (port 80 inside the container). + # Pebble resolves domains via challtestsrv, then connects to this IP to validate. + command: -defaultIPv4 10.30.50.6 -defaultIPv6 "" -doh "" + networks: + certctl-test: + ipv4_address: 10.30.50.3 + restart: unless-stopped + + pebble: + image: ghcr.io/letsencrypt/pebble:latest + container_name: certctl-test-pebble + depends_on: + - pebble-challtestsrv + environment: + PEBBLE_VA_NOSLEEP: 1 + PEBBLE_VA_ALWAYS_VALID: 0 + # ENTRYPOINT is /app (the binary). command: provides only the FLAGS. 
+ command: + - -config + - /test/config/pebble-config.json + - -dnsserver + - "10.30.50.3:8053" + - -strict + volumes: + - ./test/pebble-config.json:/test/config/pebble-config.json:ro + networks: + certctl-test: + ipv4_address: 10.30.50.4 + restart: unless-stopped + + # --------------------------------------------------------------------------- + # step-ca — Private CA (Smallstep) + # --------------------------------------------------------------------------- + # Auto-bootstraps on first run: generates root CA + JWK provisioner "admin". + # Root cert: /home/step/certs/root_ca.crt (inside stepca_data volume) + # Provisioner key: /home/step/secrets/provisioner_key (encrypted JWK) + step-ca: + image: smallstep/step-ca:latest + container_name: certctl-test-stepca + environment: + DOCKER_STEPCA_INIT_NAME: "certctl-test-ca" + DOCKER_STEPCA_INIT_DNS_NAMES: "step-ca,localhost" + DOCKER_STEPCA_INIT_PROVISIONER_NAME: "admin" + DOCKER_STEPCA_INIT_PASSWORD: "password123" + DOCKER_STEPCA_INIT_ADDRESS: ":9000" + volumes: + - stepca_data:/home/step + networks: + certctl-test: + ipv4_address: 10.30.50.5 + healthcheck: + test: ["CMD", "curl", "-fk", "https://localhost:9000/health"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 10 + restart: unless-stopped + + # --------------------------------------------------------------------------- + # certctl Server (Control Plane) + # --------------------------------------------------------------------------- + # Connects to PostgreSQL, Pebble (ACME), step-ca, and Local CA. + # + # TLS trust problem: Pebble and step-ca use self-signed root CAs that + # aren't in Alpine's trust store. The ACME and step-ca connectors use + # Go's default http.Client (no InsecureSkipVerify), so they need the + # CA certs in the system trust store. 
+ # + # Solution: setup-trust.sh runs as root, fetches Pebble CA from its + # management API, copies step-ca root cert from the shared volume, + # runs update-ca-certificates, then execs the server binary. + certctl-server: + build: + context: .. + dockerfile: Dockerfile + container_name: certctl-test-server + depends_on: + postgres: + condition: service_healthy + pebble: + condition: service_started + step-ca: + condition: service_healthy + # Run as root so update-ca-certificates can write to /etc/ssl/certs. + # Container isolation provides the security boundary. + user: "0:0" + entrypoint: ["/bin/sh", "/app/setup-trust.sh"] + environment: + # Database + CERTCTL_DATABASE_URL: postgres://certctl:testpass@postgres:5432/certctl?sslmode=disable + + # Server + CERTCTL_SERVER_HOST: 0.0.0.0 + CERTCTL_SERVER_PORT: 8443 + CERTCTL_LOG_LEVEL: debug + + # Auth — API key required (production-like) + CERTCTL_AUTH_TYPE: api-key + CERTCTL_AUTH_SECRET: test-key-2026 + + # Key generation — agent-side (production-like) + CERTCTL_KEYGEN_MODE: agent + + # Local CA issuer (iss-local) — self-signed mode (no CA cert/key paths) + # This is the simplest issuer, always available. 
+ + # ACME issuer (iss-acme-staging) — pointed at Pebble + CERTCTL_ACME_DIRECTORY_URL: https://pebble:14000/dir + CERTCTL_ACME_EMAIL: test@certctl.dev + CERTCTL_ACME_CHALLENGE_TYPE: http-01 + CERTCTL_ACME_INSECURE: "true" + + # step-ca issuer (iss-stepca) + CERTCTL_STEPCA_URL: https://step-ca:9000 + CERTCTL_STEPCA_ROOT_CERT: /stepca-data/certs/root_ca.crt + CERTCTL_STEPCA_PROVISIONER: admin + CERTCTL_STEPCA_PASSWORD: password123 + CERTCTL_STEPCA_KEY_PATH: /stepca-data/secrets/provisioner_key + + # Network scanning + CERTCTL_NETWORK_SCAN_ENABLED: "true" + + # Post-deployment TLS verification + CERTCTL_VERIFY_DEPLOYMENT: "true" + CERTCTL_VERIFY_TIMEOUT: "10s" + CERTCTL_VERIFY_DELAY: "3s" + ports: + - "8443:8443" + volumes: + - ./test/setup-trust.sh:/app/setup-trust.sh:ro + # step-ca data volume (root cert at /certs/root_ca.crt, key at /secrets/provisioner_key) + - stepca_data:/stepca-data:ro + networks: + certctl-test: + ipv4_address: 10.30.50.6 + healthcheck: + # /health bypasses the auth middleware (see cmd/server/main.go), so the Bearer token is not required; it is sent anyway and is harmless + test: ["CMD", "curl", "-f", "-H", "Authorization: Bearer test-key-2026", "http://localhost:8443/health"] + interval: 10s + timeout: 5s + start_period: 30s + retries: 10 + restart: unless-stopped + + # --------------------------------------------------------------------------- + # NGINX — TLS Target Server + # --------------------------------------------------------------------------- + # The agent deploys certificates here via the shared nginx_certs volume. + # nginx-entrypoint.sh generates a self-signed placeholder cert so NGINX + # can boot before the agent deploys a real cert. + # + # Ports: 8080 (HTTP) / 8444 (HTTPS) — offset to avoid conflict with server.
+ nginx: + image: nginx:alpine + container_name: certctl-test-nginx + entrypoint: ["/bin/sh", "/entrypoint.sh"] + volumes: + - ./test/nginx.conf:/etc/nginx/nginx.conf:ro + - ./test/nginx-entrypoint.sh:/entrypoint.sh:ro + - nginx_certs:/etc/nginx/certs + ports: + - "8080:80" + - "8444:443" + networks: + certctl-test: + ipv4_address: 10.30.50.7 + healthcheck: + test: ["CMD-SHELL", "curl -fk https://localhost/health || exit 1"] + interval: 10s + timeout: 5s + start_period: 15s + retries: 5 + restart: unless-stopped + + # --------------------------------------------------------------------------- + # certctl Agent + # --------------------------------------------------------------------------- + # Polls the server for work, generates ECDSA P-256 keys locally, + # deploys certs to NGINX via the shared volume, and discovers existing + # certs in the NGINX cert directory. + certctl-agent: + build: + context: .. + dockerfile: Dockerfile.agent + container_name: certctl-test-agent + depends_on: + certctl-server: + condition: service_healthy + environment: + CERTCTL_SERVER_URL: http://certctl-server:8443 + CERTCTL_API_KEY: test-key-2026 + CERTCTL_AGENT_NAME: test-agent-01 + CERTCTL_AGENT_ID: agent-test-01 + CERTCTL_KEYGEN_MODE: agent + CERTCTL_LOG_LEVEL: debug + CERTCTL_DISCOVERY_DIRS: /nginx-certs + volumes: + - agent_keys:/var/lib/certctl/keys + - nginx_certs:/nginx-certs + networks: + certctl-test: + ipv4_address: 10.30.50.8 + restart: unless-stopped + +# ============================================================================= +# Network +# ============================================================================= +# Static IPs are required because: +# - Pebble needs to know the challtestsrv DNS server address (10.30.50.3) +# - challtestsrv resolves all domains to certctl-server (10.30.50.6) for HTTP-01 challenges +# - Avoids DNS race conditions during startup +networks: + certctl-test: + driver: bridge + ipam: + config: + - subnet: 10.30.50.0/24 + +# 
============================================================================= +# Volumes +# ============================================================================= +volumes: + test_postgres_data: + driver: local + stepca_data: + driver: local + agent_keys: + driver: local + nginx_certs: + driver: local diff --git a/deploy/test/nginx-entrypoint.sh b/deploy/test/nginx-entrypoint.sh new file mode 100755 index 0000000..1736aba --- /dev/null +++ b/deploy/test/nginx-entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# Generate a self-signed placeholder certificate so NGINX can boot +# before the certctl agent deploys a real certificate. +# Once the agent deploys, it overwrites these files and reloads NGINX. + +CERT_DIR="/etc/nginx/certs" +mkdir -p "$CERT_DIR" + +# Make cert directory world-writable so the certctl-agent container +# (which shares this volume) can overwrite the placeholder certs. +chmod 777 "$CERT_DIR" + +if [ ! -f "$CERT_DIR/cert.pem" ]; then + echo "Generating self-signed placeholder certificate..." + apk add --no-cache openssl > /dev/null 2>&1 + openssl req -x509 -nodes -days 1 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ + -keyout "$CERT_DIR/key.pem" \ + -out "$CERT_DIR/cert.pem" \ + -subj "/CN=placeholder.certctl.test" \ + 2>/dev/null + # Make placeholder certs writable by the agent container + chmod 666 "$CERT_DIR/cert.pem" "$CERT_DIR/key.pem" + echo "Placeholder certificate generated." +fi + +# Start NGINX in foreground +exec nginx -g "daemon off;" diff --git a/deploy/test/nginx.conf b/deploy/test/nginx.conf new file mode 100644 index 0000000..6a30ae5 --- /dev/null +++ b/deploy/test/nginx.conf @@ -0,0 +1,42 @@ +# NGINX configuration for certctl test environment. +# The agent deploys certificates to /etc/nginx/certs/ and reloads NGINX. +# On startup, NGINX uses a self-signed placeholder so it can boot before any cert is deployed. + +# Generate a self-signed placeholder on container start (see entrypoint in compose). 
+# Once the agent deploys a real cert, it overwrites these files and reloads. + +events { + worker_connections 1024; +} + +http { + # HTTP → redirect to HTTPS (optional, for realism) + server { + listen 80; + server_name _; + return 301 https://$host$request_uri; + } + + # HTTPS server — serves whatever cert the agent has deployed + server { + listen 443 ssl; + server_name _; + + ssl_certificate /etc/nginx/certs/cert.pem; + ssl_certificate_key /etc/nginx/certs/key.pem; + + # Modern TLS settings + ssl_protocols TLSv1.2 TLSv1.3; + ssl_prefer_server_ciphers off; + + location / { + default_type text/plain; + return 200 'certctl test environment — NGINX is serving TLS\n'; + } + + location /health { + default_type text/plain; + return 200 'ok\n'; + } + } +} diff --git a/deploy/test/pebble-config.json b/deploy/test/pebble-config.json new file mode 100644 index 0000000..375f125 --- /dev/null +++ b/deploy/test/pebble-config.json @@ -0,0 +1,16 @@ +{ + "pebble": { + "listenAddress": "0.0.0.0:14000", + "managementListenAddress": "0.0.0.0:15000", + "certificate": "test/certs/localhost/cert.pem", + "privateKey": "test/certs/localhost/key.pem", + "httpPort": 80, + "tlsPort": 443, + "ocspResponderURL": "", + "externalAccountBindingRequired": false, + "retryAfter": { + "authz": 3, + "order": 5 + } + } +} diff --git a/deploy/test/run-test.sh b/deploy/test/run-test.sh new file mode 100755 index 0000000..7e061cb --- /dev/null +++ b/deploy/test/run-test.sh @@ -0,0 +1,764 @@ +#!/usr/bin/env bash +# ============================================================================= +# certctl End-to-End Test Script +# ============================================================================= +# +# Automates the full lifecycle test from docs/test-env.md: +# 1. Bring up all 7 containers (build from source) +# 2. Wait for every service to be healthy +# 3. Verify pre-seeded data (agents, issuers, targets) +# 4. Issue a certificate via Local CA → deploy to NGINX → verify TLS +# 5. 
Issue a certificate via ACME/Pebble → verify +# 6. Issue a certificate via step-ca → verify +# 7. Test revocation + CRL +# 8. Test discovery +# 9. Test renewal (re-issue step-ca cert, check version history) +# 10. Print summary +# +# Usage: +# cd certctl/deploy +# ./test/run-test.sh # full run (build + test) +# ./test/run-test.sh --no-build # skip docker build, reuse existing images +# ./test/run-test.sh --no-teardown # leave containers running after test +# +# Requirements: docker (with Compose v2), curl, openssl, python3 (used for JSON parsing; jq is not needed) +# ============================================================================= + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- +COMPOSE_FILE="docker-compose.test.yml" +API_URL="http://localhost:8443" +API_KEY="test-key-2026" +NGINX_TLS="localhost:8444" +AUTH_HEADER="Authorization: Bearer ${API_KEY}" + +# Flags +BUILD=true +TEARDOWN=true +for arg in "$@"; do + case "$arg" in + --no-build) BUILD=false ;; + --no-teardown) TEARDOWN=false ;; + esac +done + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +PASS=0 +FAIL=0 +SKIP=0 + +pass() { + PASS=$((PASS + 1)) + echo -e " ${GREEN}PASS${NC} $1" +} + +fail() { + FAIL=$((FAIL + 1)) + echo -e " ${RED}FAIL${NC} $1" + if [ -n "${2:-}" ]; then + echo -e " ${RED}$2${NC}" + fi +} + +skip() { + SKIP=$((SKIP + 1)) + echo -e " ${YELLOW}SKIP${NC} $1" +} + +info() { + echo -e "${CYAN}==>${NC} $1" +} + +header() { + echo "" + echo -e "${BOLD}─── $1 ───${NC}" +} + +# API helper: GET endpoint, print JSON body. Returns non-zero on HTTP error (curl -f).
+api_get() { + local path="$1" + curl -sf -H "${AUTH_HEADER}" "${API_URL}${path}" 2>/dev/null +} + +# API helper: POST with optional JSON body +api_post() { + local path="$1" + local body="${2:-}" + if [ -n "$body" ]; then + curl -sf -X POST -H "${AUTH_HEADER}" -H "Content-Type: application/json" \ + -d "$body" "${API_URL}${path}" 2>/dev/null + else + curl -sf -X POST -H "${AUTH_HEADER}" "${API_URL}${path}" 2>/dev/null + fi +} + +# Wait for an HTTP endpoint to respond without an HTTP error (curl -f). Polls every 3s for up to max_wait seconds (default 120). +wait_for_http() { + local url="$1" + local label="$2" + local max_wait="${3:-120}" + local elapsed=0 + local interval=3 + + while [ $elapsed -lt $max_wait ]; do + if curl -sf -H "${AUTH_HEADER}" "$url" >/dev/null 2>&1; then + return 0 + fi + sleep $interval + elapsed=$((elapsed + interval)) + done + return 1 +} + +# Extract a field from JSON using python3 (no jq dependency) +json_field() { + python3 -c "import sys,json; d=json.load(sys.stdin); print($1)" 2>/dev/null +} + +# Wait until every job for a certificate reaches a terminal state (Completed, Failed, or Cancelled) +# Usage: wait_for_jobs_done CERT_ID [MAX_WAIT_SECONDS] (default 180) +# Returns 0 if at least one job Completed, 1 if all jobs failed or on timeout +wait_for_jobs_done() { + local cert_id="$1" + local max_wait="${2:-180}" + local elapsed=0 + local interval=5 + + while [ $elapsed -lt $max_wait ]; do + local jobs_json + jobs_json=$(api_get "/api/v1/jobs" 2>/dev/null || echo '{"data":[]}') + + # Check if all jobs for this cert are in terminal state + # API returns jobs under "data" key (not "jobs") + local pending + pending=$(echo "$jobs_json" | python3 -c " +import sys, json +data = json.load(sys.stdin) +jobs = data.get('data') or data.get('jobs') or [] +active = [j for j in jobs if j.get('certificate_id') == '$cert_id' + and j.get('status') not in ('Completed', 'Failed', 'Cancelled')] +print(len(active)) +" 2>/dev/null || echo "99") + + if [ "$pending" = "0" ]; then + # Check how many jobs exist and their terminal states + local job_counts + job_counts=$(echo "$jobs_json" | python3 -c " +import sys, json +data =
json.load(sys.stdin) +jobs = data.get('data') or data.get('jobs') or [] +mine = [j for j in jobs if j.get('certificate_id') == '$cert_id'] +completed = len([j for j in mine if j.get('status') == 'Completed']) +failed = len([j for j in mine if j.get('status') in ('Failed', 'Cancelled')]) +print(f'{len(mine)} {completed} {failed}') +" 2>/dev/null || echo "0 0 0") + local total_jobs completed_jobs failed_jobs + total_jobs=$(echo "$job_counts" | cut -d' ' -f1) + completed_jobs=$(echo "$job_counts" | cut -d' ' -f2) + failed_jobs=$(echo "$job_counts" | cut -d' ' -f3) + + if [ "$completed_jobs" -gt 0 ]; then + return 0 # At least one job completed successfully + fi + if [ "$total_jobs" -gt 0 ] && [ "$failed_jobs" -gt 0 ]; then + return 1 # All jobs are in terminal state but none completed — all failed + fi + fi + + sleep $interval + elapsed=$((elapsed + interval)) + done + return 1 +} + +# Get the TLS cert subject from NGINX for a given SNI +get_tls_subject() { + local sni="$1" + echo | openssl s_client -connect "$NGINX_TLS" -servername "$sni" 2>/dev/null \ + | openssl x509 -noout -subject 2>/dev/null \ + | sed 's/subject=//' | sed 's/^ *//' +} + +get_tls_issuer() { + local sni="$1" + echo | openssl s_client -connect "$NGINX_TLS" -servername "$sni" 2>/dev/null \ + | openssl x509 -noout -issuer 2>/dev/null \ + | sed 's/issuer=//' | sed 's/^ *//' +} + +# Get the TLS cert SANs from NGINX for a given SNI +# Modern CAs (including Let's Encrypt / Pebble) put domains only in SAN, not Subject CN. 
+get_tls_san() { + local sni="$1" + echo | openssl s_client -connect "$NGINX_TLS" -servername "$sni" 2>/dev/null \ + | openssl x509 -noout -ext subjectAltName 2>/dev/null \ + | grep -i "DNS:" | sed 's/^ *//' +} + +# Check if NGINX is serving a cert that matches the given domain (checks Subject then SAN) +check_tls_identity() { + local domain="$1" + local subject issuer san + subject=$(get_tls_subject "$domain") + issuer=$(get_tls_issuer "$domain") + san=$(get_tls_san "$domain") + if echo "$subject" | grep -qi "$domain" || echo "$san" | grep -qi "$domain"; then + echo "MATCH" + echo "Subject: $subject" + echo "SAN: $san" + echo "Issuer: $issuer" + else + echo "NO_MATCH" + echo "Subject: $subject" + echo "SAN: $san" + echo "Issuer: $issuer" + fi +} + +# SQL exec in the postgres container +psql_exec() { + docker exec certctl-test-postgres psql -U certctl -d certctl -tAc "$1" 2>/dev/null +} + +# --------------------------------------------------------------------------- +# Cleanup trap +# --------------------------------------------------------------------------- +cleanup() { + if [ "$TEARDOWN" = true ]; then + info "Tearing down test environment..." + docker compose -f "$COMPOSE_FILE" down -v >/dev/null 2>&1 || true + else + info "Leaving containers running (--no-teardown)" + fi +} + +# --------------------------------------------------------------------------- +# PHASE 0: Environment Check +# --------------------------------------------------------------------------- +header "Phase 0: Environment Check" + +# Make sure we're in the deploy directory +if [ ! 
-f "$COMPOSE_FILE" ]; then + echo -e "${RED}ERROR: $COMPOSE_FILE not found.${NC}" + echo "Run this script from the certctl/deploy directory:" + echo " cd certctl/deploy && ./test/run-test.sh" + exit 1 +fi + +for cmd in docker curl openssl python3; do + if command -v "$cmd" >/dev/null 2>&1; then + pass "$cmd available" + else + fail "$cmd not found" "Install $cmd and try again" + exit 1 + fi +done + +if docker compose version >/dev/null 2>&1; then + pass "docker compose available" +else + fail "docker compose not available" "Install Docker Compose v2+" + exit 1 +fi + +# --------------------------------------------------------------------------- +# PHASE 1: Start the Stack +# --------------------------------------------------------------------------- +header "Phase 1: Start Test Environment" + +# Teardown any previous run +info "Cleaning up previous test environment..." +docker compose -f "$COMPOSE_FILE" down -v >/dev/null 2>&1 || true + +# Set the cleanup trap AFTER the initial teardown +trap cleanup EXIT + +if [ "$BUILD" = true ]; then + info "Building and starting containers (this takes 2-5 minutes on first run)..." + docker compose -f "$COMPOSE_FILE" up --build -d 2>&1 | tail -5 +else + info "Starting containers (--no-build)..." + docker compose -f "$COMPOSE_FILE" up -d 2>&1 | tail -5 +fi + +# --------------------------------------------------------------------------- +# PHASE 2: Wait for Services +# --------------------------------------------------------------------------- +header "Phase 2: Waiting for Services" + +info "Waiting for PostgreSQL..." +if docker compose -f "$COMPOSE_FILE" exec -T postgres pg_isready -U certctl -d certctl >/dev/null 2>&1 || + wait_for_http "${API_URL}/health" "postgres" 60; then + pass "PostgreSQL ready" +else + fail "PostgreSQL not ready after 60s" +fi + +info "Waiting for certctl server..." 
+if wait_for_http "${API_URL}/health" "server" 120; then + pass "certctl server healthy" + # Show trust setup + connector init for debugging + echo " --- Server startup (trust setup) ---" + docker logs certctl-test-server 2>&1 | grep -E "trust|Added|Extract|provisioner|Pre-launch|key file|WARNING|CERTCTL_" | head -15 + echo " ---" +else + fail "certctl server not healthy after 120s" + echo "" + echo "Server logs:" + docker logs certctl-test-server --tail 30 + exit 1 +fi + +info "Waiting for NGINX..." +if wait_for_http "http://localhost:8080" "nginx" 30; then + pass "NGINX healthy" +else + # NGINX might not respond to plain curl on /health without the right path + # Check docker health instead + if docker inspect certctl-test-nginx --format='{{.State.Health.Status}}' 2>/dev/null | grep -q healthy; then + pass "NGINX healthy (docker healthcheck)" + else + skip "NGINX health check inconclusive (will verify via TLS later)" + fi +fi + +# Give the agent a few seconds to register and send first heartbeat +info "Waiting for agent heartbeat (up to 45s)..." 
+AGENT_READY=false +for i in $(seq 1 15); do + AGENT_STATUS=$(api_get "/api/v1/agents/agent-test-01" 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "") + if [ "$AGENT_STATUS" = "online" ]; then + AGENT_READY=true + break + fi + sleep 3 +done +if [ "$AGENT_READY" = true ]; then + pass "Agent online" +else + skip "Agent not yet online (may be slow to heartbeat — continuing)" +fi + +# --------------------------------------------------------------------------- +# PHASE 3: Verify Pre-Seeded Data +# --------------------------------------------------------------------------- +header "Phase 3: Verify Pre-Seeded Data" + +# Agents +AGENT_COUNT=$(api_get "/api/v1/agents" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$AGENT_COUNT" -ge 2 ]; then + pass "Agents: $AGENT_COUNT found (agent-test-01 + server-scanner)" +else + fail "Agents: expected >= 2, got $AGENT_COUNT" +fi + +# Issuers +ISSUER_COUNT=$(api_get "/api/v1/issuers" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$ISSUER_COUNT" -ge 3 ]; then + pass "Issuers: $ISSUER_COUNT found (iss-local, iss-acme-staging, iss-stepca)" +else + fail "Issuers: expected >= 3, got $ISSUER_COUNT" "Check seed_test.sql loaded correctly" +fi + +# Targets +TARGET_COUNT=$(api_get "/api/v1/targets" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$TARGET_COUNT" -ge 1 ]; then + pass "Targets: $TARGET_COUNT found (target-test-nginx)" +else + fail "Targets: expected >= 1, got $TARGET_COUNT" "seed_test.sql may have failed after iss-local" +fi + +# Profile +PROFILE_RESP=$(api_get "/api/v1/profiles" 2>/dev/null || echo '{"total":0}') +PROFILE_COUNT=$(echo "$PROFILE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$PROFILE_COUNT" -ge 1 ]; then + pass "Profiles: 
$PROFILE_COUNT found (prof-test-tls)" +else + fail "Profiles: expected >= 1, got $PROFILE_COUNT" +fi + +# Bail if seed data is broken +if [ "$ISSUER_COUNT" -lt 3 ] || [ "$TARGET_COUNT" -lt 1 ]; then + echo "" + echo -e "${RED}Seed data is incomplete. Cannot continue.${NC}" + echo "Check PostgreSQL logs: docker logs certctl-test-postgres" + exit 1 +fi + +# --------------------------------------------------------------------------- +# PHASE 4: Local CA Issuance +# --------------------------------------------------------------------------- +header "Phase 4: Local CA Certificate Issuance" + +info "Creating certificate record mc-local-test..." +CREATE_RESP=$(api_post "/api/v1/certificates" '{ + "id": "mc-local-test", + "name": "local-test-cert", + "common_name": "local.certctl.test", + "sans": ["local.certctl.test"], + "issuer_id": "iss-local", + "owner_id": "owner-test-admin", + "team_id": "team-test-ops", + "renewal_policy_id": "rp-default", + "certificate_profile_id": "prof-test-tls", + "environment": "development" +}' 2>/dev/null || echo "ERROR") + +if echo "$CREATE_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert d.get('id')=='mc-local-test'" 2>/dev/null; then + pass "Certificate record created" +else + fail "Certificate creation failed" "$CREATE_RESP" +fi + +info "Linking certificate to NGINX target..." +psql_exec "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-local-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" +pass "Target mapping inserted" + +info "Triggering issuance..." 
+RENEW_RESP=$(api_post "/api/v1/certificates/mc-local-test/renew" 2>/dev/null || echo "ERROR") +if echo "$RENEW_RESP" | grep -q "renewal_triggered\|status"; then + pass "Issuance triggered" +else + fail "Trigger failed" "$RENEW_RESP" +fi + +# Verify a job was created (this is the bug fix check) +sleep 2 +JOB_COUNT=$(api_get "/api/v1/jobs" | python3 -c " +import sys, json +data = json.load(sys.stdin) +jobs = [j for j in (data.get('data') or data.get('jobs') or []) if j.get('certificate_id') == 'mc-local-test'] +print(len(jobs)) +" 2>/dev/null || echo "0") + +if [ "$JOB_COUNT" -gt 0 ]; then + pass "Job created ($JOB_COUNT jobs for mc-local-test)" +else + fail "No jobs created — TriggerRenewalWithActor bug still present" +fi + +info "Waiting for issuance + deployment (up to 180s)..." +if wait_for_jobs_done "mc-local-test" 180; then + pass "All jobs completed" +else + fail "Jobs did not complete within 180s" + echo " Current jobs:" + api_get "/api/v1/jobs" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -30 +fi + +info "Reloading NGINX to pick up deployed certificate..." +docker exec certctl-test-nginx nginx -s reload 2>/dev/null || true +sleep 3 + +info "Verifying TLS certificate on NGINX..." 
+TLS_CHECK=$(check_tls_identity "local.certctl.test") +TLS_RESULT=$(echo "$TLS_CHECK" | head -1) +if [ "$TLS_RESULT" = "MATCH" ]; then + pass "NGINX serving cert for local.certctl.test" + echo "$TLS_CHECK" | tail -n +2 | while read -r line; do echo -e " $line"; done +else + fail "NGINX not serving expected cert" "$(echo "$TLS_CHECK" | tail -n +2 | tr '\n' ', ')" +fi + +# Check cert status in API +CERT_STATUS=$(api_get "/api/v1/certificates/mc-local-test" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "unknown") +if [ "$CERT_STATUS" = "Active" ]; then + pass "Certificate status: Active" +else + skip "Certificate status: $CERT_STATUS (expected Active — may need more time)" +fi + +# --------------------------------------------------------------------------- +# PHASE 5: ACME (Pebble) Issuance +# --------------------------------------------------------------------------- +header "Phase 5: ACME (Pebble) Certificate Issuance" + +info "Creating certificate record mc-acme-test..." +CREATE_RESP=$(api_post "/api/v1/certificates" '{ + "id": "mc-acme-test", + "name": "acme-test-cert", + "common_name": "acme.certctl.test", + "sans": ["acme.certctl.test"], + "issuer_id": "iss-acme-staging", + "owner_id": "owner-test-admin", + "team_id": "team-test-ops", + "renewal_policy_id": "rp-default", + "certificate_profile_id": "prof-test-tls", + "environment": "staging" +}' 2>/dev/null || echo "ERROR") + +if echo "$CREATE_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert d.get('id')=='mc-acme-test'" 2>/dev/null; then + pass "Certificate record created" +else + fail "Certificate creation failed" "$CREATE_RESP" +fi + +info "Linking to target and triggering issuance..." 
+psql_exec "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-acme-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" +RENEW_RESP=$(api_post "/api/v1/certificates/mc-acme-test/renew" 2>/dev/null || echo "ERROR") +if echo "$RENEW_RESP" | grep -q "renewal_triggered\|status"; then + pass "Issuance triggered" +else + fail "Trigger failed" "$RENEW_RESP" +fi + +info "Waiting for ACME issuance + deployment (up to 180s)..." +if wait_for_jobs_done "mc-acme-test" 180; then + pass "All jobs completed" + + info "Reloading NGINX to pick up deployed certificate..." + docker exec certctl-test-nginx nginx -s reload 2>/dev/null || true + sleep 3 + + TLS_CHECK=$(check_tls_identity "acme.certctl.test") + TLS_RESULT=$(echo "$TLS_CHECK" | head -1) + if [ "$TLS_RESULT" = "MATCH" ]; then + pass "NGINX serving cert for acme.certctl.test" + echo "$TLS_CHECK" | tail -n +2 | while read -r line; do echo -e " $line"; done + else + fail "NGINX not serving expected ACME cert" "$(echo "$TLS_CHECK" | tail -n +2 | tr '\n' ', ')" + fi +else + fail "ACME jobs did not complete within 180s" + info "Checking ACME job status..." + api_get "/api/v1/jobs" 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +for j in data.get('data', []): + if j.get('certificate_id') == 'mc-acme-test': + print(f\" Job {j['id']}: type={j['type']} status={j['status']} error={j.get('last_error','')}\")" 2>/dev/null || true + echo " Server logs (last 20 lines):" + docker logs certctl-test-server --tail 20 2>&1 | grep -i "acme\|error\|fail\|CSR" | head -10 || true +fi + +# --------------------------------------------------------------------------- +# PHASE 6: step-ca Issuance +# --------------------------------------------------------------------------- +header "Phase 6: step-ca (Private CA) Certificate Issuance" + +info "Creating certificate record mc-stepca-test..." 
+CREATE_RESP=$(api_post "/api/v1/certificates" '{ + "id": "mc-stepca-test", + "name": "stepca-test-cert", + "common_name": "stepca.certctl.test", + "sans": ["stepca.certctl.test"], + "issuer_id": "iss-stepca", + "owner_id": "owner-test-admin", + "team_id": "team-test-ops", + "renewal_policy_id": "rp-default", + "certificate_profile_id": "prof-test-tls", + "environment": "staging" +}' 2>/dev/null || echo "ERROR") + +if echo "$CREATE_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert d.get('id')=='mc-stepca-test'" 2>/dev/null; then + pass "Certificate record created" +else + fail "Certificate creation failed" "$CREATE_RESP" +fi + +info "Linking to target and triggering issuance..." +psql_exec "INSERT INTO certificate_target_mappings (certificate_id, target_id) VALUES ('mc-stepca-test', 'target-test-nginx') ON CONFLICT DO NOTHING;" +RENEW_RESP=$(api_post "/api/v1/certificates/mc-stepca-test/renew" 2>/dev/null || echo "ERROR") +if echo "$RENEW_RESP" | grep -q "renewal_triggered\|status"; then + pass "Issuance triggered" +else + fail "Trigger failed" "$RENEW_RESP" +fi + +info "Waiting for step-ca issuance + deployment (up to 120s)..." +if wait_for_jobs_done "mc-stepca-test" 120; then + pass "All jobs completed" +else + fail "Jobs did not complete in time" + info "Checking step-ca job status..." 
+ api_get "/api/v1/jobs" 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +for j in data.get('data', []): + if j.get('certificate_id') == 'mc-stepca-test': + print(f\" Job {j['id']}: type={j['type']} status={j['status']} error={j.get('last_error','')}\")" 2>/dev/null || true + echo " Server logs (step-ca related):" + docker logs certctl-test-server --tail 30 2>&1 | grep -i "stepca\|step-ca\|provisioner\|jwe\|decrypt\|CSR.*fail\|error" | head -10 || true +fi + +# --------------------------------------------------------------------------- +# PHASE 7: Revocation +# --------------------------------------------------------------------------- +header "Phase 7: Revocation" + +info "Revoking mc-local-test (reason: superseded)..." +REVOKE_RESP=$(api_post "/api/v1/certificates/mc-local-test/revoke" '{"reason": "superseded"}' 2>/dev/null || echo "ERROR") +if echo "$REVOKE_RESP" | grep -qi "revoked\|status"; then + pass "Certificate revoked" +else + fail "Revocation failed" "$REVOKE_RESP" +fi + +info "Checking CRL..." +CRL_RESP=$(api_get "/api/v1/crl" 2>/dev/null || echo '{"total":0}') +CRL_TOTAL=$(echo "$CRL_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$CRL_TOTAL" -ge 1 ]; then + pass "CRL contains $CRL_TOTAL revoked certificate(s)" +else + fail "CRL empty after revocation" +fi + +CERT_STATUS=$(api_get "/api/v1/certificates/mc-local-test" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "unknown") +if [ "$CERT_STATUS" = "Revoked" ]; then + pass "Certificate status updated to Revoked" +else + fail "Certificate status: $CERT_STATUS (expected Revoked)" +fi + +# --------------------------------------------------------------------------- +# PHASE 8: Discovery +# --------------------------------------------------------------------------- +header "Phase 8: Certificate Discovery" + +info "Checking discovered certificates..." 
+DISC_RESP=$(api_get "/api/v1/discovered-certificates" 2>/dev/null || echo '{"total":0}') +DISC_TOTAL=$(echo "$DISC_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$DISC_TOTAL" -ge 1 ]; then + pass "Discovered $DISC_TOTAL certificate(s) on filesystem" +else + skip "No discovered certificates yet (agent scan may not have run)" +fi + +SUMMARY_RESP=$(api_get "/api/v1/discovery-summary" 2>/dev/null || echo '{}') +echo -e " Discovery summary: $SUMMARY_RESP" + +# --------------------------------------------------------------------------- +# PHASE 9: Renewal (re-issue ACME cert) +# --------------------------------------------------------------------------- +header "Phase 9: Renewal" + +# Try mc-stepca-test first (mc-local-test was revoked in Phase 7). +# Fall back to mc-acme-test if step-ca cert isn't Active. +RENEWAL_CERT="" +for candidate in mc-stepca-test mc-acme-test; do + STATUS=$(api_get "/api/v1/certificates/$candidate" 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "unknown") + if [ "$STATUS" = "Active" ]; then + RENEWAL_CERT="$candidate" + break + fi +done + +if [ -z "$RENEWAL_CERT" ]; then + skip "Cannot test renewal — no certificate in Active state" +else + info "Using $RENEWAL_CERT for renewal test..." + info "Triggering renewal on $RENEWAL_CERT..." + RENEW_RESP=$(api_post "/api/v1/certificates/$RENEWAL_CERT/renew" 2>/dev/null || echo "ERROR") + if echo "$RENEW_RESP" | grep -q "renewal_triggered\|status"; then + pass "Renewal triggered" + else + skip "Renewal trigger returned: $RENEW_RESP" + fi + + info "Waiting for renewal to complete (up to 180s)..." + if wait_for_jobs_done "$RENEWAL_CERT" 180; then + pass "Renewal jobs completed" + + info "Reloading NGINX to pick up renewed certificate..." 
+ docker exec certctl-test-nginx nginx -s reload 2>/dev/null || true + sleep 3 + + # Verify version history shows multiple versions + VERSIONS=$(api_get "/api/v1/certificates/$RENEWAL_CERT/versions" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d) if isinstance(d, list) else d.get('total', 0))" 2>/dev/null || echo 0) + if [ "$VERSIONS" -ge 2 ]; then + pass "Certificate has $VERSIONS versions (original + renewal)" + else + skip "Expected 2+ versions, got $VERSIONS" + fi + else + skip "Renewal jobs did not complete within 180s" + fi +fi + +# --------------------------------------------------------------------------- +# PHASE 10: API Spot Checks +# --------------------------------------------------------------------------- +header "Phase 10: API Spot Checks" + +# Health +if api_get "/health" >/dev/null 2>&1; then + pass "GET /health returns 200" +else + fail "GET /health failed" +fi + +# Metrics +METRICS_RESP=$(api_get "/api/v1/metrics" 2>/dev/null || echo "ERROR") +if echo "$METRICS_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin); assert 'gauge' in d" 2>/dev/null; then + pass "GET /api/v1/metrics returns valid JSON" +else + fail "Metrics endpoint broken" +fi + +# Stats summary +STATS_RESP=$(api_get "/api/v1/stats/summary" 2>/dev/null || echo "ERROR") +if echo "$STATS_RESP" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then + pass "GET /api/v1/stats/summary returns valid JSON" +else + fail "Stats summary endpoint broken" +fi + +# Audit trail +AUDIT_RESP=$(api_get "/api/v1/audit" 2>/dev/null || echo '{"total":0}') +AUDIT_TOTAL=$(echo "$AUDIT_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +if [ "$AUDIT_TOTAL" -gt 0 ]; then + pass "Audit trail: $AUDIT_TOTAL events recorded" +else + fail "Audit trail empty" +fi + +# Jobs summary +JOBS_RESP=$(api_get "/api/v1/jobs" 2>/dev/null || echo '{"total":0}') +JOBS_TOTAL=$(echo "$JOBS_RESP" | python3 -c "import 
sys,json; print(json.load(sys.stdin).get('total',0))" 2>/dev/null || echo 0) +pass "Total jobs created: $JOBS_TOTAL" + +# Prometheus +PROM_RESP=$(curl -sf -H "${AUTH_HEADER}" "${API_URL}/api/v1/metrics/prometheus" 2>/dev/null || echo "") +if echo "$PROM_RESP" | grep -q "certctl_certificate_total"; then + pass "Prometheus metrics endpoint working" +else + fail "Prometheus metrics endpoint broken" +fi + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +header "Test Summary" + +TOTAL=$((PASS + FAIL + SKIP)) +echo "" +echo -e " ${GREEN}Passed: $PASS${NC}" +echo -e " ${RED}Failed: $FAIL${NC}" +echo -e " ${YELLOW}Skipped: $SKIP${NC}" +echo -e " Total: $TOTAL" +echo "" + +if [ "$FAIL" -eq 0 ]; then + echo -e "${GREEN}${BOLD}All tests passed.${NC}" + exit 0 +else + echo -e "${RED}${BOLD}$FAIL test(s) failed.${NC}" + echo "" + echo "Useful debug commands:" + echo " docker logs certctl-test-server --tail 50" + echo " docker logs certctl-test-agent --tail 50" + echo " docker compose -f $COMPOSE_FILE ps" + exit 1 +fi diff --git a/deploy/test/setup-trust.sh b/deploy/test/setup-trust.sh new file mode 100755 index 0000000..690d1da --- /dev/null +++ b/deploy/test/setup-trust.sh @@ -0,0 +1,140 @@ +#!/bin/sh +# This script runs inside the certctl-server container at startup. +# It fetches CA certificates from Pebble and step-ca, adds them to the +# system trust store, then starts the certctl server. +# +# Why: The ACME connector and step-ca connector use Go's default http.Client +# with no InsecureSkipVerify. They rely on the system trust store to verify +# TLS connections. Pebble and step-ca both use self-signed root CAs that +# aren't in Alpine's default CA bundle, so we must add them manually. +# +# This script runs as root (user: "0:0" in docker-compose) so that +# update-ca-certificates can write to /etc/ssl/certs/. 
+
+set -e
+
+echo "=== certctl trust store setup ==="
+
+# --- Pebble CA cert (fetched from management API) ---
+# Pebble's management API serves the root CA at /roots/0.
+# We use -k because we can't verify Pebble's TLS cert yet (chicken-and-egg).
+# We use -f so an HTTP error response makes curl fail instead of handing us
+# an error body, and we only accept a response that contains a PEM
+# certificate. Otherwise a non-certificate payload would be installed as a
+# "CA" and the warning below would never fire.
+echo "Fetching Pebble root CA from management API..."
+PEBBLE_CA=""
+for i in 1 2 3 4 5 6 7 8 9 10; do
+    if PEBBLE_CA=$(curl -skf https://pebble:15000/roots/0 2>/dev/null); then
+        case "$PEBBLE_CA" in
+            *"-----BEGIN CERTIFICATE-----"*)
+                echo "$PEBBLE_CA" > /usr/local/share/ca-certificates/pebble-ca.crt
+                echo " Added: Pebble test CA"
+                break
+                ;;
+        esac
+    fi
+    # Nothing usable this attempt: clear the variable so the post-loop check
+    # reports the failure even if curl returned a non-certificate body.
+    PEBBLE_CA=""
+    echo " Waiting for Pebble (attempt $i/10)..."
+    sleep 2
+done
+
+if [ -z "$PEBBLE_CA" ]; then
+    echo " WARNING: Could not fetch Pebble CA. ACME issuance will fail."
+fi
+
+# --- step-ca root cert (from shared volume) ---
+# The step-ca container writes its root CA to /home/step/certs/root_ca.crt.
+# We mount the step-ca data volume at /stepca-data inside this container.
+STEPCA_ROOT="/stepca-data/certs/root_ca.crt"
+echo "Waiting for step-ca root cert..."
+for i in 1 2 3 4 5 6 7 8 9 10; do
+    if [ -f "$STEPCA_ROOT" ]; then
+        cp "$STEPCA_ROOT" /usr/local/share/ca-certificates/step-ca-root.crt
+        echo " Added: step-ca root CA"
+        break
+    fi
+    echo " Waiting for step-ca root cert (attempt $i/10)..."
+    sleep 2
+done
+
+if [ ! -f "$STEPCA_ROOT" ]; then
+    echo " WARNING: step-ca root cert not found at $STEPCA_ROOT"
+    echo " step-ca issuance may fail until the cert is available."
+fi
+
+# --- step-ca provisioner key (extracted from ca.json) ---
+# When step-ca auto-bootstraps via DOCKER_STEPCA_INIT_* env vars, the
+# encrypted provisioner key (JWE) is NOT written as a separate file.
+# Instead, it's embedded in ca.json under:
+# authority.provisioners[0].encryptedKey
+# We extract it here and write to /tmp so the certctl server can read it.
+# The stepca_data volume is mounted :ro, so we can't write there.
+STEPCA_CA_JSON="/stepca-data/config/ca.json" +STEPCA_KEY_EXTRACTED="/tmp/step-ca-provisioner-key" +echo "Extracting step-ca provisioner key from ca.json..." +for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -f "$STEPCA_CA_JSON" ]; then + # Extract the encryptedKey value using grep+sed (no jq in Alpine base) + # The field looks like: "encryptedKey": "eyJhbGciOi..." + ENCRYPTED_KEY=$(grep -o '"encryptedKey":"[^"]*"' "$STEPCA_CA_JSON" | head -1 | sed 's/"encryptedKey":"//;s/"$//') + if [ -z "$ENCRYPTED_KEY" ]; then + # Try with spaces around colon (JSON formatting varies) + ENCRYPTED_KEY=$(grep -o '"encryptedKey" *: *"[^"]*"' "$STEPCA_CA_JSON" | head -1 | sed 's/"encryptedKey" *: *"//;s/"$//') + fi + if [ -n "$ENCRYPTED_KEY" ]; then + # Check if it's JWE compact serialization (dot-separated) or JSON serialization + case "$ENCRYPTED_KEY" in + \{*) + # Already JSON serialization — write as-is + echo "$ENCRYPTED_KEY" > "$STEPCA_KEY_EXTRACTED" + ;; + *) + # JWE compact serialization: header.encrypted_key.iv.ciphertext.tag + # Convert to JSON serialization expected by Go decryptProvisionerKey() + JWE_PROTECTED=$(echo "$ENCRYPTED_KEY" | cut -d. -f1) + JWE_ENCKEY=$(echo "$ENCRYPTED_KEY" | cut -d. -f2) + JWE_IV=$(echo "$ENCRYPTED_KEY" | cut -d. -f3) + JWE_CT=$(echo "$ENCRYPTED_KEY" | cut -d. -f4) + JWE_TAG=$(echo "$ENCRYPTED_KEY" | cut -d. -f5) + printf '{"protected":"%s","encrypted_key":"%s","iv":"%s","ciphertext":"%s","tag":"%s"}' \ + "$JWE_PROTECTED" "$JWE_ENCKEY" "$JWE_IV" "$JWE_CT" "$JWE_TAG" > "$STEPCA_KEY_EXTRACTED" + ;; + esac + echo " Extracted provisioner key to $STEPCA_KEY_EXTRACTED" + echo " Key file size: $(wc -c < "$STEPCA_KEY_EXTRACTED") bytes" + echo " Key starts with: $(head -c 40 "$STEPCA_KEY_EXTRACTED")..." 
+ # Override the env var so the server reads from the extracted file + export CERTCTL_STEPCA_KEY_PATH="$STEPCA_KEY_EXTRACTED" + break + else + echo " ca.json found but encryptedKey not found in it (attempt $i/10)" + fi + else + echo " Waiting for step-ca ca.json (attempt $i/10)..." + fi + sleep 2 +done + +if [ ! -f "$STEPCA_KEY_EXTRACTED" ]; then + echo " WARNING: Could not extract step-ca provisioner key" + echo " Listing /stepca-data/config/ for debugging:" + ls -la /stepca-data/config/ 2>/dev/null || echo " /stepca-data/config/ does not exist" + echo " step-ca issuance will fail." +fi + +# --- Update system trust store --- +echo "Updating system CA trust store..." +update-ca-certificates 2>/dev/null || true + +echo "Trust store updated." + +# --- Debug: verify configuration before starting server --- +echo "=== Pre-launch verification ===" +echo " CERTCTL_STEPCA_KEY_PATH=$CERTCTL_STEPCA_KEY_PATH" +if [ -f "$CERTCTL_STEPCA_KEY_PATH" ]; then + echo " step-ca key file exists ($(wc -c < "$CERTCTL_STEPCA_KEY_PATH") bytes)" + echo " step-ca key preview: $(head -c 60 "$CERTCTL_STEPCA_KEY_PATH")..." 
+else + echo " WARNING: step-ca key file NOT FOUND at $CERTCTL_STEPCA_KEY_PATH" +fi +echo " CERTCTL_ACME_DIRECTORY_URL=$CERTCTL_ACME_DIRECTORY_URL" +echo " CERTCTL_ACME_INSECURE=$CERTCTL_ACME_INSECURE" +echo " Pebble CA cert: $(ls -la /usr/local/share/ca-certificates/pebble-ca.crt 2>/dev/null || echo 'NOT FOUND')" +echo " step-ca root cert: $(ls -la /usr/local/share/ca-certificates/step-ca-root.crt 2>/dev/null || echo 'NOT FOUND')" +echo " System CA count: $(ls /etc/ssl/certs/*.pem 2>/dev/null | wc -l) PEM files" +echo "=== Starting certctl server ===" +exec /app/server diff --git a/go.mod b/go.mod index 61dfb8b..3a98100 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,10 @@ require ( github.com/testcontainers/testcontainers-go v0.35.0 ) -require golang.org/x/crypto v0.31.0 +require ( + golang.org/x/crypto v0.31.0 + software.sslmate.com/src/go-pkcs12 v0.7.0 +) require ( dario.cat/mergo v1.0.0 // indirect @@ -63,5 +66,4 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sys v0.40.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - software.sslmate.com/src/go-pkcs12 v0.7.0 // indirect ) diff --git a/internal/api/handler/agents.go b/internal/api/handler/agents.go index 3c7773a..d92e005 100644 --- a/internal/api/handler/agents.go +++ b/internal/api/handler/agents.go @@ -252,6 +252,7 @@ func (h AgentHandler) AgentCSRSubmit(w http.ResponseWriter, r *http.Request) { } if err != nil { + slog.Error("CSR submission failed", "agent_id", agentID, "certificate_id", req.CertificateID, "error", err.Error()) ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to submit CSR", requestID) return } @@ -274,9 +275,10 @@ func (h AgentHandler) AgentCertificatePickup(w http.ResponseWriter, r *http.Requ requestID := middleware.GetRequestID(r.Context()) // Extract agent ID and certificate ID from path /api/v1/agents/{id}/certificates/{cert_id} + // After TrimPrefix, path is "{id}/certificates/{cert_id}" → split gives [id, "certificates", cert_id] path := 
strings.TrimPrefix(r.URL.Path, "/api/v1/agents/") parts := strings.Split(path, "/") - if len(parts) < 4 || parts[0] == "" || parts[2] == "" { + if len(parts) < 3 || parts[0] == "" || parts[2] == "" { ErrorWithRequestID(w, http.StatusBadRequest, "Agent ID and Certificate ID are required", requestID) return } diff --git a/internal/config/config.go b/internal/config/config.go index cfc98db..96352ef 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -256,6 +256,11 @@ type ACMEConfig struct { // Default: false. Requires a CA that supports ARI (e.g., Let's Encrypt). // Setting: CERTCTL_ACME_ARI_ENABLED environment variable. ARIEnabled bool + + // Insecure skips TLS certificate verification when connecting to the ACME directory. + // Only use for testing with self-signed ACME servers like Pebble. Never in production. + // Setting: CERTCTL_ACME_INSECURE environment variable. + Insecure bool } // OpenSSLConfig contains OpenSSL/Custom CA issuer connector configuration. @@ -503,6 +508,7 @@ func Load() (*Config, error) { DNSCleanUpScript: getEnv("CERTCTL_ACME_DNS_CLEANUP_SCRIPT", ""), DNSPersistIssuerDomain: getEnv("CERTCTL_ACME_DNS_PERSIST_ISSUER_DOMAIN", ""), ARIEnabled: getEnvBool("CERTCTL_ACME_ARI_ENABLED", false), + Insecure: getEnvBool("CERTCTL_ACME_INSECURE", false), }, Digest: DigestConfig{ Enabled: getEnvBool("CERTCTL_DIGEST_ENABLED", false), diff --git a/internal/connector/issuer/acme/acme.go b/internal/connector/issuer/acme/acme.go index 5a53011..74a52c2 100644 --- a/internal/connector/issuer/acme/acme.go +++ b/internal/connector/issuer/acme/acme.go @@ -5,6 +5,7 @@ import ( "crypto/ecdsa" "crypto/elliptic" "crypto/rand" + "crypto/tls" "crypto/x509" "encoding/base64" "encoding/json" @@ -58,6 +59,10 @@ type Config struct { // ARIEnabled enables ACME Renewal Information (RFC 9702) support per CERTCTL_ACME_ARI_ENABLED. // When enabled, the connector queries the CA's ARI endpoint to get CA-directed renewal timing. 
ARIEnabled bool `json:"ari_enabled,omitempty"` + + // Insecure skips TLS certificate verification when connecting to the ACME directory. + // Only use for testing with self-signed ACME servers like Pebble. + Insecure bool `json:"insecure,omitempty"` } // Connector implements the issuer.Connector interface for ACME-compatible CAs @@ -114,6 +119,18 @@ func New(config *Config, logger *slog.Logger) *Connector { return c } +// httpClient returns an HTTP client configured for the ACME connector. +// When Insecure is true (e.g., for Pebble test servers), TLS verification is skipped. +func (c *Connector) httpClient() *http.Client { + client := &http.Client{Timeout: 30 * time.Second} + if c.config != nil && c.config.Insecure { + client.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // Intentional for test ACME servers (Pebble) + } + } + return client +} + // ValidateConfig checks that the ACME directory URL is reachable and valid. func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessage) error { var cfg Config @@ -129,10 +146,16 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag return fmt.Errorf("ACME email is required") } - c.logger.Info("validating ACME configuration", "directory_url", cfg.DirectoryURL) + c.logger.Info("validating ACME configuration", "directory_url", cfg.DirectoryURL, "insecure", cfg.Insecure) + + // Apply config so httpClient() can use it for the directory probe. + // This persists across the function — if validation fails early, the config + // will still be set, but that's fine since a failed ValidateConfig means + // the connector won't be used. 
+ c.config = &cfg // Verify that the directory URL is reachable - httpClient := &http.Client{Timeout: 10 * time.Second} + httpClient := c.httpClient() req, err := http.NewRequestWithContext(ctx, http.MethodGet, cfg.DirectoryURL, nil) if err != nil { return fmt.Errorf("failed to create request: %w", err) @@ -203,6 +226,7 @@ func (c *Connector) ensureClient(ctx context.Context) error { c.client = &acme.Client{ Key: key, DirectoryURL: c.config.DirectoryURL, + HTTPClient: c.httpClient(), } // Register or retrieve the ACME account @@ -338,6 +362,12 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc } c.logger.Info("ACME order created", "order_url", order.URI, "status", order.Status) + // Save FinalizeURL and URI before WaitOrder — WaitOrder returns a new Order + // object that may have empty FinalizeURL and URI fields (Go's crypto/acme + // WaitOrder doesn't populate Order.URI on the returned struct). + finalizeURL := order.FinalizeURL + orderURI := order.URI + // Step 2: Solve authorizations (HTTP-01 challenges) if order.Status == acme.StatusPending { if err := c.solveAuthorizations(ctx, order.AuthzURLs); err != nil { @@ -345,10 +375,18 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc } // Wait for the order to be ready - order, err = c.client.WaitOrder(ctx, order.URI) + order, err = c.client.WaitOrder(ctx, orderURI) if err != nil { return nil, fmt.Errorf("order failed after challenge: %w", err) } + // Update finalizeURL from the waited order if it has one + if order.FinalizeURL != "" { + finalizeURL = order.FinalizeURL + } + // Preserve orderURI — WaitOrder doesn't populate Order.URI + if order.URI != "" { + orderURI = order.URI + } } if order.Status != acme.StatusReady { @@ -361,9 +399,39 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc return nil, fmt.Errorf("failed to parse CSR: %w", err) } - derChain, _, err := c.client.CreateOrderCert(ctx, order.FinalizeURL, 
csrDER, true) + if finalizeURL == "" { + return nil, fmt.Errorf("ACME order has no finalize URL (order URI: %s, status: %s)", order.URI, order.Status) + } + + // Step 3b: Finalize the order and fetch the certificate. + // CreateOrderCert POSTs the CSR to the finalize URL and attempts to retrieve + // the certificate. Some ACME servers (notably Pebble) return the order object + // per RFC 8555 rather than redirecting to the cert, which can cause + // CreateOrderCert's internal cert URL resolution to fail. In that case, we + // fall back to WaitOrder (to get the CertURL) + FetchCert. + derChain, _, err := c.client.CreateOrderCert(ctx, finalizeURL, csrDER, true) if err != nil { - return nil, fmt.Errorf("failed to finalize order: %w", err) + c.logger.Warn("CreateOrderCert failed, attempting manual certificate fetch", + "error", err, "order_uri", orderURI) + + // The finalize POST likely succeeded (the CA issued the cert) but cert + // retrieval failed. WaitOrder returns the order in "valid" state with + // CertURL populated. 
+ validOrder, waitErr := c.client.WaitOrder(ctx, orderURI) + if waitErr != nil { + return nil, fmt.Errorf("failed to finalize order: %w (wait fallback: %v)", err, waitErr) + } + + if validOrder.CertURL == "" { + return nil, fmt.Errorf("order finalized but no certificate URL returned (original error: %w)", err) + } + + c.logger.Info("fetching certificate via fallback", "cert_url", validOrder.CertURL) + fetchedChain, fetchErr := c.client.FetchCert(ctx, validOrder.CertURL, true) + if fetchErr != nil { + return nil, fmt.Errorf("failed to fetch certificate: %w (original finalize error: %v)", fetchErr, err) + } + derChain = fetchedChain } if len(derChain) == 0 { @@ -387,7 +455,7 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc Serial: serial, NotBefore: notBefore, NotAfter: notAfter, - OrderID: order.URI, + OrderID: orderURI, }, nil } diff --git a/internal/connector/issuer/stepca/jwe.go b/internal/connector/issuer/stepca/jwe.go new file mode 100644 index 0000000..45c09ea --- /dev/null +++ b/internal/connector/issuer/stepca/jwe.go @@ -0,0 +1,300 @@ +// Package stepca — JWE decryption for step-ca provisioner keys. +// +// step-ca stores provisioner private keys as JWE-encrypted JSON files using: +// - Algorithm: PBES2-HS256+A128KW (PBKDF2 key derivation + AES-128 Key Wrap) +// - Encryption: A128GCM (AES-128 in GCM mode) +// +// This file implements just enough JWE to decrypt these files without requiring +// an external JOSE library. Uses only stdlib + golang.org/x/crypto/pbkdf2. +package stepca + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/sha256" + "encoding/base64" + "encoding/binary" + "encoding/json" + "fmt" + "math/big" + + "golang.org/x/crypto/pbkdf2" +) + +// jweJSON is the JWE JSON Serialization format used by step-ca provisioner keys. 
+type jweJSON struct {
+	Protected    string `json:"protected"`     // base64url-encoded protected header (also used as the GCM AAD)
+	EncryptedKey string `json:"encrypted_key"` // base64url-encoded wrapped content-encryption key
+	IV           string `json:"iv"`            // base64url-encoded GCM nonce
+	Ciphertext   string `json:"ciphertext"`    // base64url-encoded encrypted payload
+	Tag          string `json:"tag"`           // base64url-encoded GCM authentication tag
+}
+
+// jweHeader is the protected header inside a step-ca provisioner key JWE.
+type jweHeader struct {
+	Alg string `json:"alg"` // "PBES2-HS256+A128KW"
+	Enc string `json:"enc"` // "A128GCM" or "A256GCM"
+	Cty string `json:"cty"` // "jwk+json"
+	P2s string `json:"p2s"` // PBKDF2 salt (base64url)
+	P2c int    `json:"p2c"` // PBKDF2 iteration count
+}
+
+// jwkEC is a minimal JWK representation for EC private keys.
+type jwkEC struct {
+	Kty string `json:"kty"`
+	Crv string `json:"crv"`
+	X   string `json:"x"`
+	Y   string `json:"y"`
+	D   string `json:"d"`
+	Kid string `json:"kid"`
+}
+
+// decryptProvisionerKey decrypts a step-ca JWE-encrypted provisioner key file.
+//
+// jweData must be a JWE in JSON serialization (protected, encrypted_key, iv,
+// ciphertext, tag) whose protected header uses alg PBES2-HS256+A128KW and enc
+// A128GCM or A256GCM. password is the secret the key-encryption key is derived
+// from via PBKDF2. The decrypted payload must be an EC JWK.
+//
+// Returns the parsed ECDSA private key and the key ID (kid). Malformed input is
+// always reported as an error: the iteration count, CEK, IV and tag sizes are
+// validated before they reach crypto/cipher, because AEAD.Open panics when it
+// is handed a nonce of the wrong length.
+func decryptProvisionerKey(jweData []byte, password string) (*ecdsa.PrivateKey, string, error) {
+	// Parse JWE JSON
+	var jwe jweJSON
+	if err := json.Unmarshal(jweData, &jwe); err != nil {
+		return nil, "", fmt.Errorf("failed to parse JWE JSON: %w", err)
+	}
+
+	// Decode protected header
+	headerBytes, err := base64.RawURLEncoding.DecodeString(jwe.Protected)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode JWE protected header: %w", err)
+	}
+
+	var header jweHeader
+	if err := json.Unmarshal(headerBytes, &header); err != nil {
+		return nil, "", fmt.Errorf("failed to parse JWE header: %w", err)
+	}
+
+	if header.Alg != "PBES2-HS256+A128KW" {
+		return nil, "", fmt.Errorf("unsupported JWE algorithm: %s (expected PBES2-HS256+A128KW)", header.Alg)
+	}
+
+	// The content-encryption algorithm fixes the size of the CEK we expect to
+	// get back from the key unwrap.
+	var cekSize int
+	switch header.Enc {
+	case "A128GCM":
+		cekSize = 16
+	case "A256GCM":
+		cekSize = 32
+	default:
+		return nil, "", fmt.Errorf("unsupported JWE encryption: %s (expected A128GCM or A256GCM)", header.Enc)
+	}
+
+	// A missing or non-positive p2c means the header is malformed; refuse it
+	// rather than silently deriving a key with a meaningless iteration count.
+	if header.P2c <= 0 {
+		return nil, "", fmt.Errorf("invalid PBKDF2 iteration count in JWE header: %d", header.P2c)
+	}
+
+	// Decode PBKDF2 salt
+	p2sSalt, err := base64.RawURLEncoding.DecodeString(header.P2s)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode PBKDF2 salt: %w", err)
+	}
+
+	// Decode encrypted key, IV, ciphertext, tag
+	encryptedKey, err := base64.RawURLEncoding.DecodeString(jwe.EncryptedKey)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode encrypted key: %w", err)
+	}
+
+	iv, err := base64.RawURLEncoding.DecodeString(jwe.IV)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode IV: %w", err)
+	}
+
+	ciphertext, err := base64.RawURLEncoding.DecodeString(jwe.Ciphertext)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode ciphertext: %w", err)
+	}
+
+	tag, err := base64.RawURLEncoding.DecodeString(jwe.Tag)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode tag: %w", err)
+	}
+
+	// Step 1: Derive Key Encryption Key (KEK) using PBKDF2
+	// PBES2-HS256+A128KW: PBKDF2-SHA256, 16-byte derived key for AES-128 Key Wrap
+	// The salt for PBKDF2 is: UTF8(alg) || 0x00 || p2s
+	algBytes := []byte(header.Alg)
+	salt := make([]byte, 0, len(algBytes)+1+len(p2sSalt))
+	salt = append(salt, algBytes...)
+	salt = append(salt, 0x00)
+	salt = append(salt, p2sSalt...)
+
+	const kekSize = 16 // AES-128 for A128KW
+	kek := pbkdf2.Key([]byte(password), salt, header.P2c, kekSize, sha256.New)
+
+	// Step 2: AES Key Unwrap (RFC 3394) to get the Content Encryption Key (CEK)
+	cek, err := aesKeyUnwrap(kek, encryptedKey)
+	if err != nil {
+		return nil, "", fmt.Errorf("AES key unwrap failed (wrong password?): %w", err)
+	}
+	if len(cek) != cekSize {
+		return nil, "", fmt.Errorf("unwrapped CEK is %d bytes, but %s requires %d", len(cek), header.Enc, cekSize)
+	}
+
+	// Step 3: AES-GCM decrypt the payload
+	// AAD = ASCII(BASE64URL(protected header))
+	aad := []byte(jwe.Protected)
+
+	block, err := aes.NewCipher(cek)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to create AES cipher: %w", err)
+	}
+
+	gcm, err := cipher.NewGCM(block)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to create GCM: %w", err)
+	}
+
+	// gcm.Open panics on a nonce of the wrong length, so reject a malformed IV
+	// (and tag) here and surface it as an ordinary error.
+	if len(iv) != gcm.NonceSize() {
+		return nil, "", fmt.Errorf("invalid JWE IV length: got %d bytes, expected %d", len(iv), gcm.NonceSize())
+	}
+	if len(tag) != gcm.Overhead() {
+		return nil, "", fmt.Errorf("invalid JWE tag length: got %d bytes, expected %d", len(tag), gcm.Overhead())
+	}
+
+	// GCM expects ciphertext+tag concatenated. Build it in a fresh buffer so
+	// the append can never write into ciphertext's backing array.
+	sealed := make([]byte, 0, len(ciphertext)+len(tag))
+	sealed = append(sealed, ciphertext...)
+	sealed = append(sealed, tag...)
+	plaintext, err := gcm.Open(nil, iv, sealed, aad)
+	if err != nil {
+		return nil, "", fmt.Errorf("GCM decryption failed: %w", err)
+	}
+
+	// Step 4: Parse the decrypted JWK
+	var jwk jwkEC
+	if err := json.Unmarshal(plaintext, &jwk); err != nil {
+		return nil, "", fmt.Errorf("failed to parse decrypted JWK: %w", err)
+	}
+
+	if jwk.Kty != "EC" {
+		return nil, "", fmt.Errorf("unsupported JWK key type: %s (expected EC)", jwk.Kty)
+	}
+
+	key, err := jwkToECDSA(&jwk)
+	if err != nil {
+		return nil, "", err
+	}
+
+	return key, jwk.Kid, nil
+}
+
+// jwkToECDSA converts a JWK EC key to an *ecdsa.PrivateKey.
+//
+// Supported curves are P-256, P-384 and P-521. The x, y and d members must all
+// be present, and d must lie in [1, N-1] for the curve; otherwise an error is
+// returned instead of a key that would only fail later when used for signing.
+func jwkToECDSA(jwk *jwkEC) (*ecdsa.PrivateKey, error) {
+	var curve elliptic.Curve
+	switch jwk.Crv {
+	case "P-256":
+		curve = elliptic.P256()
+	case "P-384":
+		curve = elliptic.P384()
+	case "P-521":
+		curve = elliptic.P521()
+	default:
+		return nil, fmt.Errorf("unsupported curve: %s", jwk.Crv)
+	}
+
+	xBytes, err := base64.RawURLEncoding.DecodeString(jwk.X)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode JWK x: %w", err)
+	}
+	yBytes, err := base64.RawURLEncoding.DecodeString(jwk.Y)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode JWK y: %w", err)
+	}
+	dBytes, err := base64.RawURLEncoding.DecodeString(jwk.D)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode JWK d: %w", err)
+	}
+
+	// A public-only or truncated JWK decodes to empty members; big.Int would
+	// quietly turn those into zero.
+	if len(xBytes) == 0 || len(yBytes) == 0 || len(dBytes) == 0 {
+		return nil, fmt.Errorf("incomplete JWK EC private key: x, y and d are all required")
+	}
+
+	d := new(big.Int).SetBytes(dBytes)
+	if d.Sign() == 0 || d.Cmp(curve.Params().N) >= 0 {
+		return nil, fmt.Errorf("invalid JWK EC private key: d is out of range for curve %s", jwk.Crv)
+	}
+
+	key := &ecdsa.PrivateKey{
+		PublicKey: ecdsa.PublicKey{
+			Curve: curve,
+			X:     new(big.Int).SetBytes(xBytes),
+			Y:     new(big.Int).SetBytes(yBytes),
+		},
+		D: d,
+	}
+
+	return key, nil
+}
+
+// ecdsaPublicKeyToJWK converts an ECDSA public key to a JWK map for JWT header embedding.
+func ecdsaPublicKeyToJWK(key *ecdsa.PublicKey) map[string]string { + var crv string + var size int + switch key.Curve { + case elliptic.P256(): + crv = "P-256" + size = 32 + case elliptic.P384(): + crv = "P-384" + size = 48 + case elliptic.P521(): + crv = "P-521" + size = 66 + default: + crv = "unknown" + size = 32 + } + + xBytes := key.X.Bytes() + yBytes := key.Y.Bytes() + + // Pad to fixed size + xPadded := make([]byte, size) + yPadded := make([]byte, size) + copy(xPadded[size-len(xBytes):], xBytes) + copy(yPadded[size-len(yBytes):], yBytes) + + return map[string]string{ + "kty": "EC", + "crv": crv, + "x": base64.RawURLEncoding.EncodeToString(xPadded), + "y": base64.RawURLEncoding.EncodeToString(yPadded), + } +} + +// aesKeyUnwrap implements AES Key Unwrap per RFC 3394. +func aesKeyUnwrap(kek, ciphertext []byte) ([]byte, error) { + if len(ciphertext)%8 != 0 || len(ciphertext) < 24 { + return nil, fmt.Errorf("invalid ciphertext length for AES Key Unwrap: %d", len(ciphertext)) + } + + block, err := aes.NewCipher(kek) + if err != nil { + return nil, fmt.Errorf("failed to create AES cipher: %w", err) + } + + n := (len(ciphertext) / 8) - 1 // number of 64-bit key data blocks + + // Initialize + a := make([]byte, 8) + copy(a, ciphertext[:8]) + + r := make([][]byte, n) + for i := 0; i < n; i++ { + r[i] = make([]byte, 8) + copy(r[i], ciphertext[(i+1)*8:(i+2)*8]) + } + + // Unwrap: 6 rounds + buf := make([]byte, 16) + for j := 5; j >= 0; j-- { + for i := n; i >= 1; i-- { + // A ^= (n*j + i) encoded as big-endian uint64 + t := uint64(n*j + i) + tBytes := make([]byte, 8) + binary.BigEndian.PutUint64(tBytes, t) + for k := 0; k < 8; k++ { + a[k] ^= tBytes[k] + } + + // B = AES-1(KEK, A || R[i]) + copy(buf[:8], a) + copy(buf[8:], r[i-1]) + block.Decrypt(buf, buf) + + copy(a, buf[:8]) + copy(r[i-1], buf[8:]) + } + } + + // Check the integrity check value (must be 0xA6A6A6A6A6A6A6A6) + defaultIV := []byte{0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6} + for i := 0; i < 8; i++ 
{ + if a[i] != defaultIV[i] { + return nil, fmt.Errorf("AES Key Unwrap integrity check failed") + } + } + + // Concatenate unwrapped key data + result := make([]byte, 0, n*8) + for i := 0; i < n; i++ { + result = append(result, r[i]...) + } + + return result, nil +} diff --git a/internal/connector/issuer/stepca/stepca.go b/internal/connector/issuer/stepca/stepca.go index a0a3e12..30c91b6 100644 --- a/internal/connector/issuer/stepca/stepca.go +++ b/internal/connector/issuer/stepca/stepca.go @@ -27,6 +27,7 @@ import ( "crypto/elliptic" "crypto/rand" "crypto/sha256" + "crypto/tls" "crypto/x509" "encoding/base64" "encoding/json" @@ -74,17 +75,37 @@ type Connector struct { } // New creates a new step-ca connector with the given configuration and logger. +// If RootCertPath is set, the HTTP client will trust that CA certificate for TLS connections. +// Otherwise, the system trust store is used (which works if setup-trust.sh has run). func New(config *Config, logger *slog.Logger) *Connector { - if config != nil && config.ValidityDays == 0 { - config.ValidityDays = 90 + // Don't default ValidityDays — let step-ca use its own default duration. + // Operators can explicitly set ValidityDays if their step-ca is configured + // with longer max durations. A zero value means "omit from sign request." 
+ + httpClient := &http.Client{Timeout: 30 * time.Second} + + // Load custom root CA cert if provided + if config != nil && config.RootCertPath != "" { + rootPEM, err := os.ReadFile(config.RootCertPath) + if err == nil { + pool := x509.NewCertPool() + if pool.AppendCertsFromPEM(rootPEM) { + httpClient.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{ + RootCAs: pool, + }, + } + logger.Info("step-ca custom root CA loaded", "path", config.RootCertPath) + } + } else { + logger.Warn("failed to read step-ca root cert, using system trust store", "path", config.RootCertPath, "error", err) + } } return &Connector{ - config: config, - logger: logger, - httpClient: &http.Client{ - Timeout: 30 * time.Second, - }, + config: config, + logger: logger, + httpClient: httpClient, } } @@ -103,9 +124,7 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag return fmt.Errorf("step-ca provisioner_name is required") } - if cfg.ValidityDays == 0 { - cfg.ValidityDays = 90 - } + // Don't default ValidityDays — 0 means "let step-ca use its own default duration" // Check CA health healthURL := cfg.CAURL + "/health" @@ -174,15 +193,18 @@ func (c *Connector) IssueCertificate(ctx context.Context, request issuer.Issuanc return nil, fmt.Errorf("failed to generate provisioner token: %w", err) } - // Build the sign request - now := time.Now() - notAfter := now.AddDate(0, 0, c.config.ValidityDays) - + // Build the sign request. + // When ValidityDays is 0 (default), omit NotBefore/NotAfter so step-ca uses its + // own default duration (typically 24h). NOTE(review): this relies on omitempty on the + // signRequest time fields, but encoding/json's omitempty never drops a zero time.Time — verify.
signReq := signRequest{ - CsrPEM: request.CSRPEM, - OTT: ott, - NotBefore: now, - NotAfter: notAfter, + CsrPEM: request.CSRPEM, + OTT: ott, + } + if c.config.ValidityDays > 0 { + now := time.Now() + signReq.NotBefore = now + signReq.NotAfter = now.AddDate(0, 0, c.config.ValidityDays) } body, err := json.Marshal(signReq) @@ -318,39 +340,80 @@ func (c *Connector) GetOrderStatus(ctx context.Context, orderID string) (*issuer } // generateProvisionerToken creates a short-lived JWT (One-Time Token) for step-ca API calls. -// This is a minimal JWT signed with the provisioner's key. +// The JWT is signed with the provisioner's private key (loaded from the encrypted JWE file +// at ProvisionerKeyPath and decrypted with ProvisionerPassword). func (c *Connector) generateProvisionerToken(subject string, sans []string) (string, error) { - // For the initial implementation, we generate a simple self-signed JWT. - // In production, the provisioner key would be loaded from the configured path. - // step-ca expects a JWT with: sub=, iss=, aud=/sign + var key *ecdsa.PrivateKey + var kid string + + if c.config.ProvisionerKeyPath != "" { + // Production: load and decrypt the real provisioner key from disk + var err error + key, kid, err = c.loadProvisionerKey() + if err != nil { + return "", fmt.Errorf("failed to load provisioner key: %w", err) + } + } else { + // Fallback: generate an ephemeral key (for testing or when key path not configured). + // This won't authenticate with a real step-ca server, but allows the connector + // to function against mock servers in tests. 
+ c.logger.Warn("no provisioner key path configured, using ephemeral key (will not work with real step-ca)") + var err error + key, err = ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return "", fmt.Errorf("failed to generate ephemeral key: %w", err) + } + kid = "ephemeral" + } now := time.Now() + // step-ca expects: aud = /1.0/sign (the sign endpoint audience) claims := map[string]interface{}{ "sub": subject, "iss": c.config.ProvisionerName, - "aud": c.config.CAURL + "/sign", + "aud": c.config.CAURL + "/1.0/sign", "nbf": now.Unix(), "iat": now.Unix(), "exp": now.Add(5 * time.Minute).Unix(), "jti": generateJTI(), - "sha": c.config.ProvisionerName, // step-ca uses this for key lookup + "sha": kid, // step-ca uses this to look up the provisioner by key fingerprint } if len(sans) > 0 { claims["sans"] = sans } - // Generate an ephemeral signing key for the token. - // In a full implementation, this would use the provisioner key from disk. - // For now, we use an ephemeral key — step-ca administrators should configure - // the provisioner to accept tokens from this key. - key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - if err != nil { - return "", fmt.Errorf("failed to generate token signing key: %w", err) + return signJWTWithKID(claims, key, kid) +} + +// loadProvisionerKey loads and decrypts the step-ca provisioner key from disk. +// Returns the ECDSA private key and the key ID (JWK thumbprint). 
+func (c *Connector) loadProvisionerKey() (*ecdsa.PrivateKey, string, error) { + if c.config.ProvisionerKeyPath == "" { + return nil, "", fmt.Errorf("provisioner_key_path is required for step-ca JWK authentication") } - return signJWT(claims, key) + jweData, err := os.ReadFile(c.config.ProvisionerKeyPath) + if err != nil { + return nil, "", fmt.Errorf("failed to read provisioner key file %s: %w", c.config.ProvisionerKeyPath, err) + } + + password := c.config.ProvisionerPassword + if password == "" { + return nil, "", fmt.Errorf("provisioner_password is required to decrypt the provisioner key") + } + + key, kid, err := decryptProvisionerKey(jweData, password) + if err != nil { + return nil, "", fmt.Errorf("failed to decrypt provisioner key: %w", err) + } + + c.logger.Info("provisioner key loaded and decrypted", + "key_path", c.config.ProvisionerKeyPath, + "kid", kid) + + return key, kid, nil } // generateJTI creates a unique JWT ID. @@ -360,14 +423,31 @@ func generateJTI() string { return base64.RawURLEncoding.EncodeToString(b) } -// signJWT creates a minimal ES256 JWT from the given claims. +// signJWTWithKID creates an ES256 JWT with a key ID in the header. +func signJWTWithKID(claims map[string]interface{}, key *ecdsa.PrivateKey, kid string) (string, error) { + // Header with kid so step-ca can look up the provisioner + header := map[string]string{ + "alg": "ES256", + "typ": "JWT", + "kid": kid, + } + + return signJWTRaw(claims, key, header) +} + +// signJWT creates a minimal ES256 JWT from the given claims (no kid). func signJWT(claims map[string]interface{}, key *ecdsa.PrivateKey) (string, error) { - // Header header := map[string]string{ "alg": "ES256", "typ": "JWT", } + return signJWTRaw(claims, key, header) +} + +// signJWTRaw creates an ES256 JWT from the given claims and header. 
+func signJWTRaw(claims map[string]interface{}, key *ecdsa.PrivateKey, header map[string]string) (string, error) { + headerJSON, err := json.Marshal(header) if err != nil { return "", err diff --git a/internal/connector/target/nginx/nginx.go b/internal/connector/target/nginx/nginx.go index cce4037..91a3b5f 100644 --- a/internal/connector/target/nginx/nginx.go +++ b/internal/connector/target/nginx/nginx.go @@ -7,6 +7,7 @@ import ( "log/slog" "os" "os/exec" + "path/filepath" "time" "github.com/shankar0123/certctl/internal/connector/target" @@ -67,13 +68,13 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag "chain_path", cfg.ChainPath) // Verify directory exists and is writable - certDir := cfg.CertPath[:len(cfg.CertPath)-len("/cert.pem")] // Simple path extraction + certDir := filepath.Dir(cfg.CertPath) if _, err := os.Stat(certDir); os.IsNotExist(err) { return fmt.Errorf("NGINX cert directory does not exist: %s", certDir) } // Verify validate command works - cmd := exec.CommandContext(ctx, cfg.ValidateCommand) + cmd := exec.CommandContext(ctx, "sh", "-c", cfg.ValidateCommand) if err := cmd.Run(); err != nil { c.logger.Warn("NGINX config validation failed during config check", "error", err, @@ -115,20 +116,37 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy } // Write chain with same permissions - if err := os.WriteFile(c.config.ChainPath, []byte(request.ChainPEM), 0644); err != nil { - errMsg := fmt.Sprintf("failed to write chain: %v", err) - c.logger.Error("chain deployment failed", "error", err) - return &target.DeploymentResult{ - Success: false, - TargetAddress: c.config.ChainPath, - Message: errMsg, - DeployedAt: time.Now(), - }, fmt.Errorf("%s", errMsg) + if c.config.ChainPath != "" { + if err := os.WriteFile(c.config.ChainPath, []byte(request.ChainPEM), 0644); err != nil { + errMsg := fmt.Sprintf("failed to write chain: %v", err) + c.logger.Error("chain deployment failed", "error", err) + 
return &target.DeploymentResult{ + Success: false, + TargetAddress: c.config.ChainPath, + Message: errMsg, + DeployedAt: time.Now(), + }, fmt.Errorf("%s", errMsg) + } + } + + // Write private key if provided and key_path is configured + if c.config.KeyPath != "" && request.KeyPEM != "" { + if err := os.WriteFile(c.config.KeyPath, []byte(request.KeyPEM), 0600); err != nil { + errMsg := fmt.Sprintf("failed to write private key: %v", err) + c.logger.Error("key deployment failed", "error", err) + return &target.DeploymentResult{ + Success: false, + TargetAddress: c.config.KeyPath, + Message: errMsg, + DeployedAt: time.Now(), + }, fmt.Errorf("%s", errMsg) + } + c.logger.Info("private key written", "key_path", c.config.KeyPath) } // Validate NGINX configuration before reload c.logger.Debug("validating NGINX configuration", "validate_command", c.config.ValidateCommand) - validateCmd := exec.CommandContext(ctx, c.config.ValidateCommand) + validateCmd := exec.CommandContext(ctx, "sh", "-c", c.config.ValidateCommand) if output, err := validateCmd.CombinedOutput(); err != nil { errMsg := fmt.Sprintf("NGINX config validation failed: %v (output: %s)", err, string(output)) c.logger.Error("NGINX validation failed", "error", err, "output", string(output)) @@ -142,7 +160,7 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy // Reload NGINX c.logger.Debug("reloading NGINX", "reload_command", c.config.ReloadCommand) - reloadCmd := exec.CommandContext(ctx, c.config.ReloadCommand) + reloadCmd := exec.CommandContext(ctx, "sh", "-c", c.config.ReloadCommand) if output, err := reloadCmd.CombinedOutput(); err != nil { errMsg := fmt.Sprintf("NGINX reload failed: %v (output: %s)", err, string(output)) c.logger.Error("NGINX reload failed", "error", err, "output", string(output)) @@ -187,7 +205,7 @@ func (c *Connector) ValidateDeployment(ctx context.Context, request target.Valid startTime := time.Now() // Validate NGINX configuration - validateCmd := 
exec.CommandContext(ctx, c.config.ValidateCommand) + validateCmd := exec.CommandContext(ctx, "sh", "-c", c.config.ValidateCommand) if err := validateCmd.Run(); err != nil { errMsg := fmt.Sprintf("NGINX config validation failed: %v", err) c.logger.Error("validation failed", "error", err) diff --git a/internal/service/agent.go b/internal/service/agent.go index e18c60a..8ab968d 100644 --- a/internal/service/agent.go +++ b/internal/service/agent.go @@ -178,14 +178,15 @@ func (s *AgentService) SubmitCSR(ctx context.Context, agentID string, certID str } version := &domain.CertificateVersion{ - ID: generateID("certver"), - CertificateID: certID, - SerialNumber: result.Serial, - NotBefore: result.NotBefore, - NotAfter: result.NotAfter, - PEMChain: result.CertPEM + "\n" + result.ChainPEM, - CSRPEM: string(csrPEM), - CreatedAt: time.Now(), + ID: generateID("certver"), + CertificateID: certID, + SerialNumber: result.Serial, + NotBefore: result.NotBefore, + NotAfter: result.NotAfter, + FingerprintSHA256: computeCertFingerprint(result.CertPEM), + PEMChain: result.CertPEM + "\n" + result.ChainPEM, + CSRPEM: string(csrPEM), + CreatedAt: time.Now(), } if err := s.certRepo.CreateVersion(ctx, version); err != nil { diff --git a/internal/service/certificate.go b/internal/service/certificate.go index 317554a..9744ce5 100644 --- a/internal/service/certificate.go +++ b/internal/service/certificate.go @@ -14,10 +14,12 @@ import ( type CertificateService struct { certRepo repository.CertificateRepository targetRepo repository.TargetRepository + jobRepo repository.JobRepository policyService *PolicyService auditService *AuditService revSvc *RevocationSvc caSvc *CAOperationsSvc + keygenMode string } // NewCertificateService creates a new certificate service. @@ -48,6 +50,16 @@ func (s *CertificateService) SetTargetRepo(repo repository.TargetRepository) { s.targetRepo = repo } +// SetJobRepo sets the job repository for creating renewal/issuance jobs. 
+func (s *CertificateService) SetJobRepo(repo repository.JobRepository) { + s.jobRepo = repo +} + +// SetKeygenMode sets the key generation mode (agent or server). +func (s *CertificateService) SetKeygenMode(mode string) { + s.keygenMode = mode +} + // List returns a paginated list of certificates matching the filter. func (s *CertificateService) List(ctx context.Context, filter *repository.CertificateFilter) ([]*domain.ManagedCertificate, int, error) { certs, total, err := s.certRepo.List(ctx, filter) @@ -195,6 +207,8 @@ func (s *CertificateService) GetVersions(ctx context.Context, certID string) ([] } // TriggerRenewalWithActor initiates a renewal job if the certificate is eligible. +// Creates a Renewal job (or Issuance for new certs) so the scheduler's job processor +// can pick it up and route it through the issuer connector. func (s *CertificateService) TriggerRenewalWithActor(ctx context.Context, certID string, actor string) error { cert, err := s.certRepo.Get(ctx, certID) if err != nil { @@ -220,6 +234,45 @@ func (s *CertificateService) TriggerRenewalWithActor(ctx context.Context, certID return fmt.Errorf("failed to update certificate status: %w", err) } + // Create a renewal job so the job processor can pick it up. + // In agent keygen mode, the job starts as AwaitingCSR so the agent + // generates the key pair and submits a CSR. In server mode, it starts as Pending. + if s.jobRepo != nil { + jobStatus := domain.JobStatusPending + if s.keygenMode == "agent" { + jobStatus = domain.JobStatusAwaitingCSR + } + + // Determine job type: Issuance for certs that have never been issued, + // Renewal for certs that already have a version. 
+ jobType := domain.JobTypeRenewal + if cert.ExpiresAt.IsZero() || cert.ExpiresAt.Year() < 2000 { + jobType = domain.JobTypeIssuance + } + + job := &domain.Job{ + ID: generateID("job"), + CertificateID: cert.ID, + Type: jobType, + Status: jobStatus, + MaxAttempts: 3, + ScheduledAt: time.Now(), + CreatedAt: time.Now(), + } + + if err := s.jobRepo.Create(ctx, job); err != nil { + slog.Error("failed to create renewal job", "cert_id", cert.ID, "error", err) + return fmt.Errorf("failed to create renewal job: %w", err) + } + + slog.Info("created renewal job via API trigger", + "job_id", job.ID, + "cert_id", cert.ID, + "job_type", string(jobType), + "job_status", string(jobStatus), + "keygen_mode", s.keygenMode) + } + // Record audit event if err := s.auditService.RecordEvent(ctx, actor, domain.ActorTypeUser, "renewal_triggered", "certificate", certID, diff --git a/internal/service/job.go b/internal/service/job.go index c6a6d8f..b4b7650 100644 --- a/internal/service/job.go +++ b/internal/service/job.go @@ -54,6 +54,16 @@ func (s *JobService) ProcessPendingJobs(ctx context.Context) error { // Process each job for _, job := range pendingJobs { + // Skip deployment jobs that have an agent_id — those are meant for agent + // pickup via GetPendingWork(), not server-side processing. The server should + // only process deployment jobs without an agent (legacy/serverless targets). 
+ if job.Type == domain.JobTypeDeployment && job.AgentID != nil && *job.AgentID != "" { + s.logger.Debug("skipping agent-routed deployment job", + "job_id", job.ID, + "agent_id", *job.AgentID) + continue + } + if err := s.processJob(ctx, job); err != nil { s.logger.Error("failed to process job", "job_id", job.ID, diff --git a/internal/service/renewal.go b/internal/service/renewal.go index 7112e0f..c7b3bbf 100644 --- a/internal/service/renewal.go +++ b/internal/service/renewal.go @@ -636,23 +636,50 @@ func (s *RenewalService) CompleteAgentCSRRenewal(ctx context.Context, job *domai } // createDeploymentJobs creates pending deployment jobs for each target associated with a cert. +// If cert.TargetIDs is empty (common — the repository doesn't populate this field), +// falls back to querying certificate_target_mappings via targetRepo.ListByCertificate. func (s *RenewalService) createDeploymentJobs(ctx context.Context, cert *domain.ManagedCertificate) { - if len(cert.TargetIDs) == 0 { + // Resolve targets: prefer in-memory TargetIDs, fall back to DB query + type targetInfo struct { + id string + agentID string + } + var targets []targetInfo + + if len(cert.TargetIDs) > 0 { + // TargetIDs populated (e.g. 
from test or manual wiring) + for _, tid := range cert.TargetIDs { + ti := targetInfo{id: tid} + if s.targetRepo != nil { + if target, err := s.targetRepo.Get(ctx, tid); err == nil && target.AgentID != "" { + ti.agentID = target.AgentID + } + } + targets = append(targets, ti) + } + } else if s.targetRepo != nil { + // TargetIDs empty — query certificate_target_mappings via repository + dbTargets, err := s.targetRepo.ListByCertificate(ctx, cert.ID) + if err != nil { + slog.Error("failed to query targets for certificate", "cert_id", cert.ID, "error", err) + return + } + for _, t := range dbTargets { + targets = append(targets, targetInfo{id: t.ID, agentID: t.AgentID}) + } + } + + if len(targets) == 0 { + slog.Debug("no targets found for certificate, skipping deployment", "cert_id", cert.ID) return } - for _, targetID := range cert.TargetIDs { - tid := targetID - // Resolve agent_id from target for job routing + for _, t := range targets { + tid := t.id var agentIDPtr *string - if s.targetRepo != nil { - target, err := s.targetRepo.Get(ctx, tid) - if err != nil { - slog.Warn("failed to resolve agent for deployment job", "target_id", tid, "error", err) - } else if target.AgentID != "" { - agentID := target.AgentID - agentIDPtr = &agentID - } + if t.agentID != "" { + aid := t.agentID + agentIDPtr = &aid } deployJob := &domain.Job{ @@ -667,7 +694,9 @@ func (s *RenewalService) createDeploymentJobs(ctx context.Context, cert *domain. 
CreatedAt: time.Now(), } if err := s.jobRepo.Create(ctx, deployJob); err != nil { - slog.Error("failed to create deployment job for target", "target_id", targetID, "error", err) + slog.Error("failed to create deployment job for target", "target_id", tid, "cert_id", cert.ID, "error", err) + } else { + slog.Info("created deployment job", "job_id", deployJob.ID, "cert_id", cert.ID, "target_id", tid, "agent_id", t.agentID) } } } diff --git a/migrations/seed_test.sql b/migrations/seed_test.sql new file mode 100644 index 0000000..a6dac4b --- /dev/null +++ b/migrations/seed_test.sql @@ -0,0 +1,130 @@ +-- ============================================================================= +-- certctl Test Environment — Seed Data +-- ============================================================================= +-- +-- Pre-populates the database with the minimum objects needed to test the full +-- certificate lifecycle against real CA backends (Pebble, step-ca, Local CA). +-- +-- Load order (handled by Docker entrypoint filename sorting): +-- 001_schema.sql → ... → 008_verification.sql → 010_seed.sql → 015_seed_test.sql +-- +-- All IDs use a "test-" prefix so they're easy to spot in the dashboard. 
+-- ============================================================================= + +-- --------------------------------------------------------------------------- +-- Team +-- --------------------------------------------------------------------------- +INSERT INTO teams (id, name, description) +VALUES ( + 'team-test-ops', + 'Test Operations', + 'Operations team for certctl testing environment' +) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- Owner (references team) +-- --------------------------------------------------------------------------- +INSERT INTO owners (id, name, email, team_id) +VALUES ( + 'owner-test-admin', + 'Test Admin', + 'admin@certctl-test.local', + 'team-test-ops' +) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- Agent — must exist before the agent binary sends its first heartbeat +-- --------------------------------------------------------------------------- +-- The agent binary (certctl-agent container) connects with: +-- CERTCTL_AGENT_ID=agent-test-01 +-- CERTCTL_AGENT_NAME=test-agent-01 +-- The heartbeat handler does a GET by ID — if the agent doesn't exist, it 404s. +-- api_key_hash is SHA-256 of "test-agent-key-2026" (not used for auth, just stored). +INSERT INTO agents (id, name, hostname, status, registered_at, api_key_hash, os, architecture, ip_address, version) +VALUES ( + 'agent-test-01', + 'test-agent-01', + 'certctl-test-agent', + 'online', + NOW(), + 'cad819dee454889f686d678f691e5084e58ba149762eae2fda4d0bd2abaceefa', + 'linux', + 'amd64', + '10.30.50.8', + 'test' +) ON CONFLICT (id) DO NOTHING; + +-- The network scanner uses "server-scanner" as a virtual agent. +-- It gets auto-created by the server code, but seed it here to avoid races. 
+INSERT INTO agents (id, name, hostname, status, registered_at, api_key_hash) +VALUES ( + 'server-scanner', + 'server-scanner', + 'certctl-server', + 'online', + NOW(), + 'no-key' +) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- Issuers — one row per CA backend in the test environment +-- --------------------------------------------------------------------------- +-- These are metadata records the dashboard reads. The actual CA connections +-- are configured via env vars on the server container. + +-- Local CA (self-signed, always available) +INSERT INTO issuers (id, name, type, config, enabled) +VALUES ( + 'iss-local', + 'Local CA (Self-Signed)', + 'local', + '{"mode": "self-signed", "description": "Built-in self-signed CA for testing"}'::jsonb, + true +) ON CONFLICT (id) DO NOTHING; + +-- ACME via Pebble (simulates Let''s Encrypt) +INSERT INTO issuers (id, name, type, config, enabled) +VALUES ( + 'iss-acme-staging', + 'ACME (Pebble Test CA)', + 'acme', + '{"directory_url": "https://pebble:14000/dir", "email": "test@certctl.dev", "challenge_type": "http-01", "description": "Pebble ACME test server simulating Lets Encrypt"}'::jsonb, + true +) ON CONFLICT (id) DO NOTHING; + +-- step-ca (Smallstep private CA) +INSERT INTO issuers (id, name, type, config, enabled) +VALUES ( + 'iss-stepca', + 'step-ca (Private CA)', + 'stepca', + '{"url": "https://step-ca:9000", "provisioner": "admin", "description": "Smallstep private CA with JWK provisioner"}'::jsonb, + true +) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- Certificate Profile — TLS server certs, 90-day max +-- --------------------------------------------------------------------------- +INSERT INTO certificate_profiles (id, name, description, max_ttl_seconds, allowed_ekus, allowed_key_algorithms) +VALUES ( + 'prof-test-tls', + 'Test TLS Server', + 'Standard TLS server certificate 
profile for testing', + 7776000, -- 90 days + '["serverAuth"]'::jsonb, + '[{"algorithm": "ECDSA", "min_size": 256}, {"algorithm": "RSA", "min_size": 2048}]'::jsonb +) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- Deployment Target — NGINX (references agent-test-01) +-- --------------------------------------------------------------------------- +-- The agent deploys certs to NGINX via the shared nginx_certs volume. +INSERT INTO deployment_targets (id, name, type, agent_id, config, enabled) +VALUES ( + 'target-test-nginx', + 'Test NGINX', + 'NGINX', + 'agent-test-01', + '{"cert_path": "/nginx-certs/cert.pem", "key_path": "/nginx-certs/key.pem", "chain_path": "/nginx-certs/chain.pem", "reload_command": "true", "validate_command": "true"}'::jsonb, + true +) ON CONFLICT (id) DO NOTHING;