From ba1f7ff866295b257e1f9748af3a44bd5e1cf474 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Sun, 10 May 2026 00:51:25 +0000 Subject: [PATCH] auth-bundle-1 fix: bundled certctl-agent restart loop (latent since 2026-03-14) The bundled `docker-compose.yml` started the `certctl-agent` service without setting `CERTCTL_AGENT_ID`. `cmd/agent/main.go:1297-1300` fails fast on missing AGENT_ID with "Error: -agent-id flag or CERTCTL_AGENT_ID env var is required", which sends the container into a silent restart loop on every fresh `docker compose up`. Latent since commit d395776 (2026-03-14), which added the env-var contract on the agent side but never wired a pre-seeded matching row + env injection on the compose side. The integration test compose (`docker-compose.test.yml`) does set CERTCTL_AGENT_ID + seed agent-test-01 via seed_test.sql, which is why CI didn't surface the bug. Caught when an external operator first cloned dev/auth-bundle-1 to test Bundle 1. Closure mirrors the integration-test pattern: * migrations/seed_demo.sql pre-seeds an `agent-demo-1` row alongside the existing server-scanner sentinel. ON CONFLICT (id) DO NOTHING preserves idempotency. api_key_hash is a no-auth placeholder since demo runs with CERTCTL_AUTH_TYPE=none (synthetic actor-demo-anon covers every request). * deploy/docker-compose.yml certctl-server: add CERTCTL_DEMO_SEED=true so the demo seed (which holds the agent-demo-1 row + the rest of the demo fixtures) actually runs in the bundled compose. The compose is already a demo posture (CERTCTL_AUTH_TYPE=none + CERTCTL_KEYGEN_MODE=server), so this is consistent. docker-compose.demo.yml still works (it sets the same flag) and stays for backward compat. * deploy/docker-compose.yml certctl-agent: set CERTCTL_AGENT_ID=agent-demo-1 (overridable via env) so the agent finds its row on first heartbeat. * Makefile qa-stats: agents-table count bumped 12 -> 13. 
Production deploys are unaffected: they override CERTCTL_AUTH_TYPE, CERTCTL_KEYGEN_MODE, CERTCTL_DEMO_SEED, and CERTCTL_AGENT_ID with their own compose. The agent is registered via POST /api/v1/agents and the returned ID is plugged into CERTCTL_AGENT_ID per docs/operator/installation.md. Verified path: `docker compose -f deploy/docker-compose.yml up --build` boots green; certctl-agent reaches Online state on the first heartbeat; `curl --cacert ... https://localhost:8443/api/v1/agents` returns agent-demo-1 with status Online instead of an empty list. --- Makefile | 2 +- deploy/docker-compose.yml | 20 ++++++++++++++++++++ migrations/seed_demo.sql | 16 ++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 76d9ac0..cf61d5d 100644 --- a/Makefile +++ b/Makefile @@ -285,7 +285,7 @@ qa-stats: @echo "t.Skip sites: $$(grep -rE 't\.Skip(Now|f)?\(' --include='*_test.go' . 2>/dev/null | wc -l | tr -d ' ')" @echo "qa_test.go Part_ subtests: $$(grep -cE 't\.Run\(\"Part[0-9]+_' deploy/test/qa_test.go 2>/dev/null || echo 0)" @echo "Seed unique mc-* IDs: $$(grep -oE "mc-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ')" - @echo "Seed unique ag-* IDs: $$(grep -oE "ag-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ') (incl. agent_groups; agents-table count is 12)" + @echo "Seed unique ag-* IDs: $$(grep -oE "ag-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ') (incl. agent_groups; agents-table count is 13 incl. 
agent-demo-1 + 3 cloud sentinels + server-scanner)" @echo "Seed unique iss-* IDs: $$(grep -oE "iss-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ') (issuers table count is 13)" @echo "Seed unique tgt-* IDs: $$(grep -oE "tgt-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ')" @echo "Seed unique nst-* IDs: $$(grep -oE "nst-[a-z0-9_-]+" migrations/seed_demo.sql 2>/dev/null | sort -u | wc -l | tr -d ' ')" diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 9f007bb..61771cc 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -133,6 +133,15 @@ services: CERTCTL_KEYGEN_MODE: server # Demo uses server-side keygen; production should use "agent" CERTCTL_NETWORK_SCAN_ENABLED: "true" # Enable network scan GUI with seeded demo targets CERTCTL_CONFIG_ENCRYPTION_KEY: ${CERTCTL_CONFIG_ENCRYPTION_KEY:-change-me-32-char-encryption-key} # AES-256-GCM for dynamic issuer/target config + # Bundle 1 follow-on: this compose IS the bundled demo path + # (CERTCTL_AUTH_TYPE=none + KEYGEN_MODE=server above), so the + # demo seed runs by default. seed_demo.sql pre-seeds the + # agent-demo-1 row that the bundled certctl-agent below needs + # to find on its first heartbeat. The docker-compose.demo.yml overlay still + # works (it sets the same flag) and remains for backward + # compat. Production deploys override CERTCTL_AUTH_TYPE + + # KEYGEN_MODE + DEMO_SEED via their own compose. + CERTCTL_DEMO_SEED: "true" ports: - "8443:8443" volumes: @@ -183,6 +192,17 @@ services: CERTCTL_SERVER_URL: https://certctl-server:8443 CERTCTL_SERVER_CA_BUNDLE_PATH: /etc/certctl/tls/ca.crt CERTCTL_API_KEY: ${CERTCTL_API_KEY:-change-me-in-production} + # Bundle 1 follow-on: pre-Bundle-1 the bundled agent had no + # CERTCTL_AGENT_ID set, hit cmd/agent/main.go's fail-fast guard + # ("agent-id flag or CERTCTL_AGENT_ID env var is required"), and + # restart-looped silently on every fresh `docker compose up`. 
+ # Latent since 2026-03-14 (commit d395776). seed_demo.sql now + # pre-seeds the matching agents row; the demo runs with + # CERTCTL_AUTH_TYPE=none on the server so the api_key Bearer + # token is irrelevant here. Production deploys override + # CERTCTL_AGENT_ID with the value returned from + # POST /api/v1/agents during registration. + CERTCTL_AGENT_ID: ${CERTCTL_AGENT_ID:-agent-demo-1} CERTCTL_AGENT_NAME: docker-agent CERTCTL_LOG_LEVEL: info CERTCTL_DISCOVERY_DIRS: /var/lib/certctl/keys # Agent scans this directory for existing certificates diff --git a/migrations/seed_demo.sql b/migrations/seed_demo.sql index d137f0b..5aca27d 100644 --- a/migrations/seed_demo.sql +++ b/migrations/seed_demo.sql @@ -73,6 +73,22 @@ INSERT INTO agents (id, name, hostname, status, last_heartbeat_at, registered_at ('server-scanner', 'Network Scanner (Server-Side)', 'certctl-server', 'Online', NOW(), NOW() - INTERVAL '90 days', 'sentinel_no_auth', 'linux', 'amd64', '127.0.0.1', '2.0.14') ON CONFLICT (id) DO NOTHING; +-- Bundled docker-compose agent. Pre-Bundle-1 the bundled `certctl-agent` +-- service hit a fail-fast path on startup ("agent-id flag or +-- CERTCTL_AGENT_ID env var is required") because no row was pre-seeded +-- and no auto-register was wired; the container restart-looped silently +-- on every fresh `docker compose up`. Latent since 2026-03-14 +-- (commit d395776 added the env var but no seed). Bundle 1 closes the +-- loop: seed_demo.sql pre-seeds this row, and docker-compose.yml sets +-- CERTCTL_AGENT_ID=agent-demo-1 on the agent and CERTCTL_DEMO_SEED=true +-- on the server. api_key_hash is a placeholder since the demo runs with +-- CERTCTL_AUTH_TYPE=none (synthetic actor-demo-anon covers every +-- request); production deploys override both env vars + use the +-- regular registration flow. 
+INSERT INTO agents (id, name, hostname, status, last_heartbeat_at, registered_at, api_key_hash, os, architecture, ip_address, version) VALUES + ('agent-demo-1', 'docker-agent', 'certctl-agent', 'Online', NOW(), NOW(), 'demo_no_auth', 'linux', 'amd64', '127.0.0.1', '2.1.0') /* id must match the CERTCTL_AGENT_ID default in deploy/docker-compose.yml; name mirrors CERTCTL_AGENT_NAME there */ +ON CONFLICT (id) DO NOTHING; /* an existing agent-demo-1 row is left untouched, so re-running this statement is a no-op */ + -- Sentinel agents for cloud discovery sources (M50) INSERT INTO agents (id, name, hostname, status, last_heartbeat_at, registered_at, api_key_hash, os, architecture, ip_address, version) VALUES ('cloud-aws-sm', 'AWS Secrets Manager Discovery', 'certctl-server', 'Online', NOW(), NOW() - INTERVAL '90 days', 'sentinel_no_auth', 'linux', 'amd64', '127.0.0.1', '2.1.0'),