From aedf19d12890346c8ca244e1d2bef42b51c21543 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:22:19 +0000 Subject: [PATCH] ci(cold-db-smoke): inline into workflow; remove the script (operator: not a per-commit gate) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operator pushback: 'I don't want a smoke test I have to manually run every time I commit.' Correct read — the script existed for local debugging but its presence in scripts/ci-guards/ implied 'operator runs this regularly,' which is the opposite of the design intent. Changes: - Removed scripts/ci-guards/cold-db-compose-smoke.sh. - Inlined the smoke logic directly into the cold-db-compose-smoke job in .github/workflows/ci.yml. Same semantics: docker compose down -v -> up -d -> wait-healthy -> bootstrap admin -> issue/renew/revoke -> assert audit rows -> teardown. 15-min wall-clock cap. Logs dump on failure. - Removed the cold-db-compose-smoke.sh skip case from the generic regression-guards loop (no longer needed). - Updated scripts/ci-guards/README.md and docs/contributor/ci-guards.md to reflect the new shape: 'lives in the workflow, not as a script.' Workspace docs updated (cowork/WORKSPACE-CHANGELOG.md, cowork/CLAUDE.md, cowork/auditable-codebase-bundle/RESULTS.md). The gate is unchanged: CI runs the smoke on every push, master branch-protection enforces it as a required check. Operator's manual action is once — adding the check to branch-protection. 
Audit-Closes: post-v2.1.0-anti-rot/item-6 --- .github/workflows/ci.yml | 111 +++++++++++-- docs/contributor/ci-guards.md | 4 +- scripts/ci-guards/README.md | 3 +- scripts/ci-guards/cold-db-compose-smoke.sh | 180 --------------------- 4 files changed, 102 insertions(+), 196 deletions(-) delete mode 100755 scripts/ci-guards/cold-db-compose-smoke.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff7ebc1..c6178d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -210,22 +210,10 @@ jobs: # Contract: each guard MUST exit 0 on clean repo, non-zero with # ::error:: prefix on regression. See scripts/ci-guards/README.md. # - # SKIP cold-db-compose-smoke.sh — it needs Docker + a fresh - # postgres volume, which only exists in the dedicated - # `cold-db-compose-smoke` job below. Including it in this loop - # would always fail (no Docker on the runners that don't bring - # up compose). run: | set -e fail=0 for g in scripts/ci-guards/*.sh; do - case "$(basename "$g")" in - cold-db-compose-smoke.sh) - echo "::group::$(basename "$g") (skipped — runs in dedicated job)" - echo "::endgroup::" - continue - ;; - esac echo "::group::$(basename "$g")" if ! bash "$g"; then fail=1 @@ -258,13 +246,108 @@ jobs: # 15-min wall-clock cap covers cold image pull + compose-up + # full issue/renew/revoke probe + teardown. Increase only if # the underlying steps legitimately grow. + # + # The smoke is inlined here on purpose — it is NOT a script in + # scripts/ci-guards/, because there is no value in a developer + # running this locally. The whole point of the gate is that CI + # owns the cold-DB state; the operator never has to remember to + # run it. Master branch-protection enforces this job as a + # required check; that is the manual action, and it happens + # once. 
timeout-minutes: 15 - run: bash scripts/ci-guards/cold-db-compose-smoke.sh + working-directory: deploy + env: + STARTUP_TIMEOUT_SECONDS: 300 + run: | + set -e + set -o pipefail + + SERVER_URL="https://localhost:8443" + CACERT_PATH="${GITHUB_WORKSPACE}/deploy/test/certs/ca.crt" + + log() { echo "[cold-db-smoke] $*"; } + + wait_for_service_healthy() { + local svc="$1" deadline=$(( $(date +%s) + STARTUP_TIMEOUT_SECONDS )) + while [ "$(date +%s)" -lt "$deadline" ]; do + local state + state="$(docker compose ps --format json "$svc" 2>/dev/null | python3 -c ' + import json, sys + try: + line = sys.stdin.read().strip() + if not line: + print("not-up"); sys.exit(0) + rows = json.loads(line) if line.startswith("[") else [json.loads(l) for l in line.splitlines() if l.strip()] + if not rows: + print("not-up") + else: + print(rows[0].get("Health") or rows[0].get("State", "?"))  # Health is "" (present, not absent) when the service has no healthcheck; fall back to State + except Exception as e: + print(f"err: {e}") + ')" + if [ "$state" = "healthy" ] || [ "$state" = "running" ]; then + log " $svc → $state"; return 0 + fi + sleep 2 + done + log " $svc did NOT reach healthy within ${STARTUP_TIMEOUT_SECONDS}s (last: $state)" + return 1 + } + + http_call() { + local method="$1" path="$2" data="${3:-}" + local args=(--silent --show-error --max-time 30 -X "$method" "$SERVER_URL$path") + [ -f "$CACERT_PATH" ] && args+=(--cacert "$CACERT_PATH") || args+=(--insecure) + [ -n "${KEY:-}" ] && args+=(-H "Authorization: Bearer $KEY") + [ -n "$data" ] && args+=(-H "Content-Type: application/json" -d "$data") + curl "${args[@]}" + } + + log "1/7 down -v --remove-orphans" + docker compose down -v --remove-orphans 2>&1 | tail -3 || true + + log "2/7 up -d (cold boot)" + docker compose up -d 2>&1 | tail -3 + + log "3/7 wait for healthchecks" + wait_for_service_healthy postgres + wait_for_service_healthy certctl-server + wait_for_service_healthy certctl-agent || log " (agent skipped — non-demo compose)" + + log "4/7 minting day-0 admin" + TOKEN="$(openssl rand -base64 32 | tr -d '\n')" + 
trap 'rm -f /tmp/_smoke.env' EXIT; echo "CERTCTL_BOOTSTRAP_TOKEN=$TOKEN" > /tmp/_smoke.env  # EXIT trap removes the token file on success and failure alike + docker compose --env-file /tmp/_smoke.env up -d --force-recreate certctl-server 2>&1 | tail -2 + sleep 5 + wait_for_service_healthy certctl-server + BODY="$(http_call POST /api/v1/auth/bootstrap "{\"token\":\"$TOKEN\",\"actor_name\":\"smoke-admin\"}")" + KEY="$(echo "$BODY" | python3 -c 'import json,sys; print(json.load(sys.stdin)["key_value"])')" || KEY=""  # a parse failure must reach the diagnostic on the next line instead of aborting under set -e + [ -n "$KEY" ] || { log "bootstrap failed: $BODY"; exit 1; } + + log "5/7 issuing test cert" + ISSUE='{"common_name":"smoke-test.local","profile_id":"profile-default","environment":"test","owner_id":"o-platform"}' + R="$(http_call POST /api/v1/certificates "$ISSUE")" + CID="$(echo "$R" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("id") or d.get("certificate",{}).get("id",""))')" || CID=""  # same: let a non-JSON response fall through to the "issue failed" log + [ -n "$CID" ] || { log "issue failed: $R"; exit 1; } + + log "6/7 renewing $CID" + http_call POST "/api/v1/certificates/$CID/renew" >/dev/null + + log "7/7 revoking + asserting audit rows" + http_call POST "/api/v1/certificates/$CID/revoke" '{"reason":"smoke-test"}' >/dev/null + AUD="$(http_call GET '/api/v1/audit?limit=50')" + for action in cert.issued cert.renewed cert.revoked; do + if ! 
echo "$AUD" | python3 -c "import json,sys; d=json.load(sys.stdin); evs=d.get('events') or d.get('audit',{}).get('events') or []; sys.exit(0 if any(e.get('action')=='$action' for e in evs) else 1)"; then + log "MISSING audit row: $action"; echo "$AUD" | head -200; exit 1 + fi + done + log "PASS — tearing down" + docker compose down -v 2>&1 | tail -2 - name: Dump compose logs on failure if: failure() + working-directory: deploy run: | - cd deploy for svc in postgres certctl-server certctl-agent certctl-tls-init; do echo "==== $svc ====" docker compose logs --no-color --tail 200 "$svc" || true diff --git a/docs/contributor/ci-guards.md b/docs/contributor/ci-guards.md index 92c3dcc..99aabd2 100644 --- a/docs/contributor/ci-guards.md +++ b/docs/contributor/ci-guards.md @@ -32,7 +32,9 @@ Catch drift across the four product surfaces — OpenAPI spec, HTTP router, MCP ### Operational guards -`cold-db-compose-smoke.sh` (wipe postgres volume, bring stack up cold, issue/renew/revoke, audit-row check). `doc-rot-detector.sh` (every doc reviewed within 120 days). These pin the operational reality, not the source shape. +`doc-rot-detector.sh` (every doc reviewed within 120 days) pins the operational reality, not the source shape. + +The cold-DB compose smoke (wipe postgres volume, bring stack up cold, issue/renew/revoke, audit-row check) lives directly in `.github/workflows/ci.yml::cold-db-compose-smoke` — not as a script. It is intentionally not operator-runnable: the gate's value is that CI owns the cold-DB state, the operator never has to remember to run it. Master branch-protection enforces the job as a required check; that is the manual action, and it happens once. 
## When the build is red diff --git a/scripts/ci-guards/README.md b/scripts/ci-guards/README.md index ed22e28..0494a79 100644 --- a/scripts/ci-guards/README.md +++ b/scripts/ci-guards/README.md @@ -92,7 +92,8 @@ These guards catch defect classes BEFORE they get audit findings — they pin in |---|---|---| | `complete-path-config-coverage` | post-v2.1.0 / item-1 | "Lying field" — `CERTCTL_*` env var defined in `internal/config/config.go` that no consumer outside `internal/config/` actually reads. Operator-facing config that the docs claim works but the code never honors. Companion Go test at `internal/config/coverage_test.go`. | | `doc-rot-detector` | post-v2.1.0 / item-5 | Docs older than 90 days warn (yellow), older than 120 days fail (red). Uses HEAD commit timestamp for reproducibility. `docs/archive/` allowlisted in bulk. | -| `cold-db-compose-smoke` | post-v2.1.0 / item-6 | Migration-on-cold-DB regression (canonical: 2026-05-09 migration 000045 broken INSERT, commit `6444e13`). Wipes the postgres volume, brings the stack up cold, issue/renew/revoke + 3 audit rows. **Runs in its own GitHub Actions job** (`cold-db-compose-smoke`), NOT the generic regression-guards loop — needs Docker. | + +The cold-DB compose smoke (post-v2.1.0 / item-6) is NOT a script in this directory — it is inlined directly into `.github/workflows/ci.yml::cold-db-compose-smoke` because there is no value in a developer running it locally (the whole point of the gate is that CI owns the cold-DB state). To inspect or modify the smoke logic, read that workflow job; there is intentionally no `scripts/ci-guards/cold-db-compose-smoke.sh`. The fourth Bundle artifact (`internal/ciparity/`) is Go tests, not shell guards — runs under the standard Go test step. Pins the MCP tool catalogue floor + naming convention; reports CLI/MCP/OpenAPI surface counts as a trend metric. 
diff --git a/scripts/ci-guards/cold-db-compose-smoke.sh b/scripts/ci-guards/cold-db-compose-smoke.sh deleted file mode 100755 index 7efa89c..0000000 --- a/scripts/ci-guards/cold-db-compose-smoke.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env bash -# scripts/ci-guards/cold-db-compose-smoke.sh -# -# Per post-v2.1.0 anti-rot item 6 (Auditable Codebase Bundle). -# -# The bug class this catches: a migration whose .up.sql is broken in a -# way the unit tests / integration suite misses because they reuse a -# warm DB across runs. The canonical case: 2026-05-09 migration -# 000045's broken INSERT, surfaced only by a cold `docker compose up` -# and fixed in commit 6444e13. This guard runs that very check on -# every push. -# -# Workflow: -# 1. docker compose down -v --remove-orphans (wipe volumes) -# 2. docker compose up -d (cold boot) -# 3. wait up to 5 min for healthchecks (postgres, -# certctl-server, -# certctl-agent) -# 4. mint a day-0 admin via /api/v1/auth/bootstrap (Bundle 1 path) -# 5. issue + renew + revoke a certificate (HTTP API) -# 6. assert audit rows for each step -# 7. docker compose down -v (clean up) -# -# Total runtime: ~3-5 min on warm Docker, ~5-10 min cold. -# -# Failure paths dump `docker compose logs` for every service to make -# CI failures actionable without a re-run. -# -# This script is invoked by .github/workflows/ci.yml::cold-db-compose-smoke. -# Runs locally for developers via `bash scripts/ci-guards/cold-db-compose-smoke.sh`. - -set -e -set -o pipefail - -REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" -cd "$REPO_ROOT/deploy" - -# Tunables (the CI job overrides these as needed). 
-STARTUP_TIMEOUT_SECONDS="${COLD_DB_SMOKE_STARTUP_TIMEOUT:-300}" # 5 min -PROBE_TIMEOUT_SECONDS="${COLD_DB_SMOKE_PROBE_TIMEOUT:-180}" # 3 min -SERVER_URL="${COLD_DB_SMOKE_SERVER_URL:-https://localhost:8443}" -CACERT_PATH="${COLD_DB_SMOKE_CACERT:-${REPO_ROOT}/deploy/test/certs/ca.crt}" - -# --- helpers ---------------------------------------------------------------- - -log() { echo "[cold-db-smoke] $*"; } - -dump_logs_on_failure() { - log "FAILURE — dumping service logs:" - docker compose ps || true - for svc in postgres certctl-server certctl-agent certctl-tls-init; do - echo - echo "==== $svc ====" - docker compose logs --no-color --tail 200 "$svc" 2>&1 || true - done -} - -trap 'dump_logs_on_failure' ERR - -wait_for_service_healthy() { - local svc="$1" deadline=$(( $(date +%s) + STARTUP_TIMEOUT_SECONDS )) - while [ "$(date +%s)" -lt "$deadline" ]; do - local state - state="$(docker compose ps --format json "$svc" 2>/dev/null | python3 -c ' -import json, sys -try: - line = sys.stdin.read().strip() - if not line: - print("not-up") - sys.exit(0) - # docker compose ps emits one JSON object per line (NDJSON) on newer - # versions; older versions emit a JSON array. Handle both. 
- if line.startswith("["): - rows = json.loads(line) - else: - rows = [json.loads(l) for l in line.splitlines() if l.strip()] - if not rows: - print("not-up") - else: - print(rows[0].get("Health", rows[0].get("State", "?"))) -except Exception as e: - print(f"err: {e}") -')" - if [ "$state" = "healthy" ] || [ "$state" = "running" ]; then - log " $svc → $state" - return 0 - fi - sleep 2 - done - log " $svc did NOT reach healthy within $STARTUP_TIMEOUT_SECONDS s (last state: $state)" - return 1 -} - -http_call() { - # http_call [data_json] - local method="$1" path="$2" data="${3:-}" - local args=(--silent --show-error --max-time 30 -X "$method" "$SERVER_URL$path") - if [ -f "$CACERT_PATH" ]; then - args+=(--cacert "$CACERT_PATH") - else - args+=(--insecure) - fi - if [ -n "${KEY:-}" ]; then - args+=(-H "Authorization: Bearer $KEY") - fi - if [ -n "$data" ]; then - args+=(-H "Content-Type: application/json" -d "$data") - fi - curl "${args[@]}" -} - -# --- the smoke --------------------------------------------------------------- - -log "1/7 down -v --remove-orphans (wiping postgres volume)" -docker compose down -v --remove-orphans 2>&1 | tail -3 || true - -log "2/7 up -d (cold boot)" -docker compose up -d 2>&1 | tail -3 - -log "3/7 waiting for healthchecks (timeout ${STARTUP_TIMEOUT_SECONDS}s/svc)" -wait_for_service_healthy postgres -wait_for_service_healthy certctl-server -# certctl-agent depends on the demo seed having run; only wait when -# CERTCTL_DEMO_SEED=true is in effect (the bundled clean compose -# doesn't always seed). Best-effort. -wait_for_service_healthy certctl-agent || log " (agent healthcheck skipped — non-demo compose)" - -log "4/7 minting day-0 admin via /api/v1/auth/bootstrap" -TOKEN="$(openssl rand -base64 32 | tr -d '\n')" -# Restart the server with the bootstrap token so the strategy is -# active. Compose --env-file is the lightest path. 
-echo "CERTCTL_BOOTSTRAP_TOKEN=$TOKEN" > /tmp/_smoke.env -docker compose --env-file /tmp/_smoke.env up -d --force-recreate certctl-server 2>&1 | tail -2 -sleep 5 -wait_for_service_healthy certctl-server - -BOOTSTRAP_BODY="$(http_call POST /api/v1/auth/bootstrap "{\"token\":\"$TOKEN\",\"actor_name\":\"smoke-admin\"}")" -KEY="$(echo "$BOOTSTRAP_BODY" | python3 -c 'import json,sys; print(json.load(sys.stdin)["key_value"])')" -if [ -z "$KEY" ]; then - log " bootstrap did NOT return a key_value — body was:" - echo "$BOOTSTRAP_BODY" - exit 1 -fi -log " admin minted (actor=smoke-admin)" - -log "5/7 issuing a test certificate" -# Use the default profile + demo CA. The exact shape may need a tweak -# depending on the compose's seeded issuers — fail loudly with the -# response body if the API rejects the request. -ISSUE_BODY='{"common_name":"smoke-test.local","profile_id":"profile-default","environment":"test","owner_id":"o-platform"}' -ISSUE_RESP="$(http_call POST /api/v1/certificates "$ISSUE_BODY")" -CERT_ID="$(echo "$ISSUE_RESP" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("id") or d.get("certificate",{}).get("id",""))')" -if [ -z "$CERT_ID" ]; then - log " issue failed; response body:" - echo "$ISSUE_RESP" - exit 1 -fi -log " cert issued: $CERT_ID" - -log "6/7 renewing the certificate" -http_call POST "/api/v1/certificates/$CERT_ID/renew" >/dev/null -log " renewed" - -log "7/7 revoking the certificate + asserting audit rows" -http_call POST "/api/v1/certificates/$CERT_ID/revoke" '{"reason":"smoke-test"}' >/dev/null -AUDIT_BODY="$(http_call GET "/api/v1/audit?limit=50")" -for action in cert.issued cert.renewed cert.revoked; do - if ! 
echo "$AUDIT_BODY" | python3 -c "import json,sys; d=json.load(sys.stdin); evs=d.get('events') or d.get('audit',{}).get('events') or []; sys.exit(0 if any(e.get('action')=='$action' for e in evs) else 1)"; then - log " MISSING audit row: $action" - echo "$AUDIT_BODY" | head -200 - exit 1 - fi -done -log " audit rows present: cert.issued, cert.renewed, cert.revoked" - -log "DONE — tearing down" -trap - ERR -docker compose down -v 2>&1 | tail -2 -rm -f /tmp/_smoke.env -log "PASS"