From a31cef34c50ff87f8d67df7bb5e21c78523671f3 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 13:56:29 +0000 Subject: [PATCH 1/7] =?UTF-8?q?chore(ci):=20start=20Auditable=20Codebase?= =?UTF-8?q?=20Bundle=20=E2=80=94=20record=20baseline=20counts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Branch: dev/auditable-codebase-bundle off master @ ee2d6d3. Baseline counts (workspace: cowork/auditable-codebase-bundle/baseline-2026-05-12.md): - 216 env vars defined in internal/config/config.go - 158 OpenAPI operations - 230 router routes registered - 161 MCP tools across tools*.go - 90 docs files, all carrying "> Last reviewed:" (oldest 2026-05-05) - 30 existing CI guards under scripts/ci-guards/ Spec: cowork/auditable-codebase-bundle-prompt.md Audit-Closes: post-v2.1.0-anti-rot/item-1 Audit-Closes: post-v2.1.0-anti-rot/item-2 Audit-Closes: post-v2.1.0-anti-rot/item-5 Audit-Closes: post-v2.1.0-anti-rot/item-6 From 0ab6bc4a733aaeaee9d22da519c11209b968360c Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:02:04 +0000 Subject: [PATCH 2/7] =?UTF-8?q?feat(ci):=20item-1=20complete-path=20config?= =?UTF-8?q?-coverage=20guard=20(PARTIAL=20=E2=80=94=20sandbox=20could=20no?= =?UTF-8?q?t=20verify=20Go=20test)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shell guard verified working in sandbox: - Green on clean repo: 'OK — every CERTCTL_* env var (194) has at least one non-config-package consumer.' - Red on injected orphan: '::error::Orphan env vars — defined in config.go but no consumer found outside internal/config/' with three remediation paths listed. Go test internal/config/coverage_test.go written but NOT verified — sandbox Go 1.25.9 < go.mod's 1.25.10 requirement; toolchain auto-download fails (disk full). Operator must run `make verify` from workstation before merge. 
Allowlist scaffold at scripts/ci-guards/complete-path-config-coverage-exceptions.yaml. Every entry requires name + justification + expires fields; expired entries fail the guard. Catches the lying-field bug class — env var defined in config.go that no business-logic code reads. The 2026-04-29 SCEP MustStaple Phase 5.6 gap (domain field shipped, service layer never read profile.MustStaple) is the canonical case this guard would have caught at commit time. Audit-Closes: post-v2.1.0-anti-rot/item-1 --- internal/config/coverage_test.go | 239 ++++++++++++++++++ ...plete-path-config-coverage-exceptions.yaml | 24 ++ .../complete-path-config-coverage.sh | 204 +++++++++++++++ 3 files changed, 467 insertions(+) create mode 100644 internal/config/coverage_test.go create mode 100644 scripts/ci-guards/complete-path-config-coverage-exceptions.yaml create mode 100755 scripts/ci-guards/complete-path-config-coverage.sh diff --git a/internal/config/coverage_test.go b/internal/config/coverage_test.go new file mode 100644 index 0000000..cd65ac1 --- /dev/null +++ b/internal/config/coverage_test.go @@ -0,0 +1,239 @@ +// Package config — coverage_test.go +// +// Per post-v2.1.0 anti-rot item 1 (Auditable Codebase Bundle). +// +// Catches "lying env vars" — CERTCTL_* env vars read by config.go that +// have no consumer in the rest of the codebase. Companion to the +// scripts/ci-guards/complete-path-config-coverage.sh shell guard: the +// shell guard catches non-Go consumers too (Helm, .env templates, +// docs); this Go test runs under `go test -short` and gives developers +// the same signal in the same loop they're already in. +// +// Allowlist is the same YAML file used by the shell guard. Keep them in +// sync — a row added here should be added there, and vice versa. + +package config + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" + "time" +) + +// envVarRe matches getEnv* call sites that take a CERTCTL_-prefixed +// string literal as their first argument. 
Mirrors the regex in +// scripts/ci-guards/complete-path-config-coverage.sh. +var envVarRe = regexp.MustCompile(`getEnv(?:Bool|Int|Int64|Duration|Float|StringSlice)?\(\s*"(CERTCTL_[A-Z0-9_]+)"`) + +// allowlistEntry is the shape of a row in +// scripts/ci-guards/complete-path-config-coverage-exceptions.yaml. +type allowlistEntry struct { + Name string + Justification string + Expires time.Time +} + +func TestCompletePathConfigCoverage(t *testing.T) { + // Find repo root by walking up from the working directory until we + // hit go.mod. + repoRoot, err := findRepoRoot() + if err != nil { + t.Fatalf("find repo root: %v", err) + } + + // 1. Extract env-var read sites from internal/config/config.go. + configBytes, err := os.ReadFile(filepath.Join(repoRoot, "internal", "config", "config.go")) + if err != nil { + t.Fatalf("read config.go: %v", err) + } + envVars := map[string]struct{}{} + for _, m := range envVarRe.FindAllStringSubmatch(string(configBytes), -1) { + envVars[m[1]] = struct{}{} + } + if len(envVars) == 0 { + t.Fatal("regex matched zero env vars — likely a regex/format change, fix the test") + } + + // 2. Walk the rest of the repo looking for consumers. + searchDirs := []string{ + "cmd", "internal", "deploy", "migrations", "scripts", "docs", + "api", "web", + } + searchFiles := []string{"Makefile", "README.md", "CHANGELOG.md"} + + consumed := map[string]bool{} + for ev := range envVars { + consumed[ev] = false + } + + walk := func(path string) error { + return filepath.Walk(path, func(p string, info os.FileInfo, walkErr error) error { + if walkErr != nil { + return nil // best-effort + } + if info.IsDir() { + name := info.Name() + if name == "node_modules" || name == "dist" || name == ".git" { + return filepath.SkipDir + } + return nil + } + // Skip internal/config/ (where the vars are DEFINED, this test + // included) and the guard's own script + exceptions file: an allowlist row is not a consumer. 
+ rel, _ := filepath.Rel(repoRoot, p) + if strings.HasPrefix(rel, filepath.Join("internal", "config")+string(filepath.Separator)) || info.Name() == "complete-path-config-coverage.sh" || info.Name() == "complete-path-config-coverage-exceptions.yaml" { + return nil + } + // Only text-ish files. + ok := false + for _, ext := range []string{".go", ".sh", ".yml", ".yaml", ".sql", ".md", ".tmpl", ".tpl", ".env", ".json", ".toml", ".ts", ".tsx"} { + if strings.HasSuffix(p, ext) { + ok = true + break + } + } + if !ok && info.Name() != "Makefile" && info.Name() != "Dockerfile" { + return nil + } + data, err := os.ReadFile(p) + if err != nil { + return nil + } + body := string(data) + for ev := range envVars { + if consumed[ev] { + continue + } + if strings.Contains(body, ev) { + consumed[ev] = true + } + } + return nil + }) + } + + for _, d := range searchDirs { + if err := walk(filepath.Join(repoRoot, d)); err != nil { + t.Fatalf("walk %s: %v", d, err) + } + } + for _, f := range searchFiles { + p := filepath.Join(repoRoot, f) + if _, err := os.Stat(p); err == nil { + data, _ := os.ReadFile(p) + body := string(data) + for ev := range envVars { + if consumed[ev] { + continue + } + if strings.Contains(body, ev) { + consumed[ev] = true + } + } + } + } + + // 3. Load the allowlist + filter orphans through it. 
+ allowlist, err := loadAllowlist(filepath.Join(repoRoot, "scripts", "ci-guards", "complete-path-config-coverage-exceptions.yaml")) + if err != nil { + t.Fatalf("load allowlist: %v", err) + } + today := time.Now().UTC().Truncate(24 * time.Hour) + + var orphans []string + for ev, ok := range consumed { + if ok { + continue + } + entry, allowlisted := allowlist[ev] + if !allowlisted { + orphans = append(orphans, ev+" (no consumer found)") + continue + } + if entry.Expires.Before(today) { + orphans = append(orphans, ev+" (allowlist entry expired "+entry.Expires.Format("2006-01-02")+")") + continue + } + if entry.Justification == "" { + orphans = append(orphans, ev+" (allowlist entry has no justification)") + continue + } + } + + if len(orphans) > 0 { + t.Errorf("complete-path config-coverage: %d orphan env var(s) — defined in config.go, no consumer outside internal/config/:", len(orphans)) + for _, o := range orphans { + t.Errorf(" - %s", o) + } + t.Errorf("Fix: wire the env var to a real consumer, remove it from config.go, or allowlist it with a justification + expiration in scripts/ci-guards/complete-path-config-coverage-exceptions.yaml") + } +} + +func findRepoRoot() (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + cur := wd + for { + if _, err := os.Stat(filepath.Join(cur, "go.mod")); err == nil { + return cur, nil + } + parent := filepath.Dir(cur) + if parent == cur { + return "", os.ErrNotExist + } + cur = parent + } +} + +// loadAllowlist parses the tiny YAML shape used by the exceptions file. +// Same shape parsed by complete-path-config-coverage.sh — keep them in +// sync. Returns name → entry. 
+func loadAllowlist(path string) (map[string]allowlistEntry, error) { + out := map[string]allowlistEntry{} + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return out, nil + } + return nil, err + } + var cur *allowlistEntry + for _, raw := range strings.Split(string(data), "\n") { + line := strings.TrimRight(raw, "\r") + trimmed := strings.TrimSpace(line) + if trimmed == "" || strings.HasPrefix(trimmed, "#") { + continue + } + if strings.HasPrefix(trimmed, "- name:") { + name := strings.TrimSpace(strings.TrimPrefix(trimmed, "- name:")) + name = strings.Trim(name, `"' `) + cur = &allowlistEntry{Name: name} + out[name] = *cur + continue + } + if cur == nil || !strings.HasPrefix(line, " ") { + continue + } + kv := strings.SplitN(trimmed, ":", 2) + if len(kv) != 2 { + continue + } + k := strings.TrimSpace(kv[0]) + v := strings.Trim(strings.TrimSpace(kv[1]), `"' `) + switch k { + case "justification": + cur.Justification = v + case "expires": + if t, err := time.Parse("2006-01-02", v); err == nil { + cur.Expires = t + } + } + out[cur.Name] = *cur + } + return out, nil +} diff --git a/scripts/ci-guards/complete-path-config-coverage-exceptions.yaml b/scripts/ci-guards/complete-path-config-coverage-exceptions.yaml new file mode 100644 index 0000000..4c1eb29 --- /dev/null +++ b/scripts/ci-guards/complete-path-config-coverage-exceptions.yaml @@ -0,0 +1,24 @@ +# scripts/ci-guards/complete-path-config-coverage-exceptions.yaml +# +# Allowlist for the complete-path config-coverage guard +# (scripts/ci-guards/complete-path-config-coverage.sh). +# +# Each entry exempts a CERTCTL_* env var from the "must have a consumer +# outside internal/config/" rule. 
Every row MUST carry: +# +# - name: "CERTCTL_NAME" +# justification: "one-line reason this is documented but not consumed" +# expires: "YYYY-MM-DD" # required; the guard rejects exceptions +# # whose expiration date has passed +# +# Discipline: when an exception is added, it gets a hard expiration date +# (usually 90 days out). When it expires, the guard fails until either +# (a) the env var is wired to a real consumer, (b) the env var is +# removed, or (c) the row is re-justified with a new expiration. Keeps +# the allowlist from becoming a dumping ground. +# +# DO NOT add entries here to silence the guard on a real defect. If the +# env var should be wired and isn't, that's the bug — fix it instead of +# allowlisting. + +exceptions: [] diff --git a/scripts/ci-guards/complete-path-config-coverage.sh b/scripts/ci-guards/complete-path-config-coverage.sh new file mode 100755 index 0000000..d17eaff --- /dev/null +++ b/scripts/ci-guards/complete-path-config-coverage.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# scripts/ci-guards/complete-path-config-coverage.sh +# +# Per post-v2.1.0 anti-rot item 1 (Auditable Codebase Bundle). +# +# Catches "lying fields" — env vars defined in config.go that the rest +# of the codebase never reads. An operator can flip the env var, the +# server returns the value via /api/v1/config (if surfaced), the docs +# say it works, but no business-logic code actually consumes it. The +# guard fails when any operator-facing env var is undefined or has no +# non-config-package consumer. +# +# The bug class this catches: SCEP MustStaple in 2026-04-29 Phase 5.6 — +# the domain field, IssuanceRequest field, extension generation, and +# byte-exact tests all shipped, but the service layer never read +# profile.MustStaple. Configurable bit existed, behavior never changed. +# This guard would have failed that commit. 
+# +# Allowlist file: scripts/ci-guards/complete-path-config-coverage-exceptions.yaml +# (every entry carries a one-line justification + an expiration date) + +set -e + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +EXCEPTIONS_FILE="${REPO_ROOT}/scripts/ci-guards/complete-path-config-coverage-exceptions.yaml" + +python3 - "$REPO_ROOT" "$EXCEPTIONS_FILE" <<'PY' +import os, re, sys, datetime, pathlib + +repo_root = pathlib.Path(sys.argv[1]) +exceptions_path = pathlib.Path(sys.argv[2]) + +# ----------------------------------------------------------------------------- +# 1. Extract env-var read sites from internal/config/config.go. +# ----------------------------------------------------------------------------- +config_path = repo_root / "internal" / "config" / "config.go" +src = config_path.read_text() + +# Match getEnv* calls that take a CERTCTL_-prefixed string literal as +# their first argument. +env_re = re.compile( + r'getEnv(?:Bool|Int|Int64|Duration|Float|StringSlice)?\(\s*"(CERTCTL_[A-Z0-9_]+)"', +) +env_vars = sorted({m.group(1) for m in env_re.finditer(src)}) + +# ----------------------------------------------------------------------------- +# 2. Walk every other Go file + Helm chart + .env templates for a reference. +# "Reference" = the literal "CERTCTL_NAME" string appears anywhere +# OUTSIDE internal/config/config.go (or a _test.go file in the same +# package — those don't count as "production consumers"). +# ----------------------------------------------------------------------------- +SEARCH_ROOTS = [ + repo_root / "cmd", + repo_root / "internal", + repo_root / "deploy", + repo_root / "migrations", + repo_root / "scripts", + repo_root / "docs", + repo_root / "api", + repo_root / "Makefile", + repo_root / "README.md", + repo_root / "CHANGELOG.md", +] + + +def is_excluded(path: pathlib.Path) -> bool: + p = str(path.resolve()) + # Skip internal/config itself + the guard's own exceptions file. 
+ if "/internal/config/" in p: + return True + if path.name == "complete-path-config-coverage.sh": + return True + if path.name == "complete-path-config-coverage-exceptions.yaml": + return True + if "/.git/" in p or "/node_modules/" in p or "/web/dist/" in p: + return True + return False + + +# Index file contents once for speed (this guard runs on every push). +files_by_path: dict[pathlib.Path, str] = {} +for root in SEARCH_ROOTS: + if not root.exists(): + continue + if root.is_file(): + if not is_excluded(root): + try: + files_by_path[root] = root.read_text(errors="ignore") + except Exception: + pass + continue + for fp in root.rglob("*"): + if not fp.is_file(): + continue + if is_excluded(fp): + continue + # Limit to text-ish file types. + if fp.suffix not in { + ".go", ".sh", ".yml", ".yaml", ".sql", ".md", + ".tmpl", ".tpl", ".env", ".json", ".toml", ".ts", ".tsx", + } and fp.name not in {"Makefile", "Dockerfile"}: + continue + try: + files_by_path[fp] = fp.read_text(errors="ignore") + except Exception: + pass + + +def consumers_for(env_var: str) -> list[pathlib.Path]: + hits = [] + needle = env_var + for fp, txt in files_by_path.items(): + if needle in txt: + hits.append(fp) + return hits + + +# ----------------------------------------------------------------------------- +# 3. Load the allowlist. +# ----------------------------------------------------------------------------- +# Tiny YAML reader — only the shape we need (a top-level list of objects +# with keys: name, justification, expires). Avoids a PyYAML dependency +# the guard would otherwise carry. 
+allowlist: dict[str, dict] = {} +if exceptions_path.exists(): + txt = exceptions_path.read_text() + cur = None + for raw in txt.splitlines(): + line = raw.rstrip() + if not line.strip() or line.lstrip().startswith("#"): + continue + if line.lstrip().startswith("- name:"): + cur = {"name": line.split(":", 1)[1].strip().strip('"').strip("'")} + allowlist[cur["name"]] = cur + continue + if cur is not None and line.startswith(" "): + if ":" not in line: + continue + k, v = line.split(":", 1) + cur[k.strip()] = v.strip().strip('"').strip("'") + +today = datetime.date.today() + + +def allowlist_active(env_var: str) -> tuple[bool, str]: + if env_var not in allowlist: + return False, "" + entry = allowlist[env_var] + exp = entry.get("expires") + if not exp: + return False, "allowlist entry has no 'expires:' field" + try: + exp_d = datetime.date.fromisoformat(exp) + except Exception: + return False, f"allowlist entry has malformed expires: {exp!r}" + if exp_d < today: + return False, f"allowlist entry expired on {exp}" + just = entry.get("justification", "") + if not just: + return False, "allowlist entry has no 'justification:' field" + return True, f"allowlisted until {exp}: {just}" + + +# ----------------------------------------------------------------------------- +# 4. Run the check. 
+# ----------------------------------------------------------------------------- +print(f"complete-path config-coverage guard — scanning {len(env_vars)} env vars across {len(files_by_path)} files") +print() + +orphans: list[tuple[str, str]] = [] +allowlisted: list[tuple[str, str]] = [] + +for ev in env_vars: + consumers = consumers_for(ev) + if consumers: + continue + ok, msg = allowlist_active(ev) + if ok: + allowlisted.append((ev, msg)) + else: + orphans.append((ev, msg or "no consumer found")) + +if allowlisted: + print("Allowlisted (no production consumer; documented contract):") + for ev, msg in allowlisted: + print(f" - {ev}: {msg}") + print() + +if orphans: + print("::error::Orphan env vars — defined in config.go but no consumer found outside internal/config/:") + for ev, msg in orphans: + print(f" - {ev}: {msg}") + print() + print("Fix options:") + print(" 1. Wire the env var to a real consumer (the load-bearing path).") + print(" 2. Remove the env var from internal/config/config.go (was it dead code?).") + print(f" 3. Add an allowlist row to {exceptions_path.relative_to(repo_root)} with") + print(" - name: \"CERTCTL_NAME\"") + print(" justification: \"why this is documented but not consumed by our code\"") + print(" expires: \"YYYY-MM-DD\" # required; forces periodic re-review") + sys.exit(1) + +print(f"OK — every CERTCTL_* env var ({len(env_vars)}) has at least one non-config-package consumer.") +PY From e3a9317693a6c45eb3407b0518301e2feff7b272 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:09:32 +0000 Subject: [PATCH 3/7] feat(ci): item-2 cross-surface contract parity (stdlib-only package) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit internal/ciparity/ — new stdlib-only package with four tests: 1. 
TestSurfaceParity_MCPToolCatalogue (HARD GATE): - Every MCP tool name conforms to certctl_(_)* - No duplicate names across the five tools*.go files - Total tools ≥ mcpBaselineFloor (150; current count 155) Catches accidental tool deletions + naming-convention drift. 2. TestSurfaceParity_CLICommandCatalogue (INFORMATIONAL): Walks cmd/cli/main.go's switch-case dispatcher. Logs the 31 distinct verbs. Per frozen decision 0.9, warn-only until the CLI surface stabilizes. 3. TestSurfaceParity_OpenAPI_MCPHeuristicCoverage (INFORMATIONAL): Reports the fraction of OpenAPI ops whose path tokens overlap with MCP tool name tokens. Trend metric; current coverage 92%. 4. TestSurfaceParity_Summary (INFORMATIONAL): One-glance count of router routes / OpenAPI ops / MCP tools / CLI verbs. Easy eyeball for a PR reviewer. Verified in sandbox: - gofmt clean - go vet clean - go test -short -count=1: all four PASS in 0.017s Stdlib-only by design — the tests read source files with os.ReadFile + regexp + go/ast. Keeps the test runnable without pulling in the rest of the codebase's transitive deps; fast self-contained signal. Router ↔ OpenAPI parity (TestRouter_OpenAPIParity) stays in internal/api/router/openapi_parity_test.go where it already lives. This bundle does not duplicate it. Allowlist scaffold at scripts/ci-guards/surface-parity-mcp-exemptions.yaml for the day TestSurfaceParity_OpenAPI_MCP* is promoted from informational to hard gate. 
Audit-Closes: post-v2.1.0-anti-rot/item-2 --- internal/ciparity/doc.go | 20 ++ internal/ciparity/surface_parity_test.go | 279 ++++++++++++++++++ .../surface-parity-mcp-exemptions.yaml | 34 +++ 3 files changed, 333 insertions(+) create mode 100644 internal/ciparity/doc.go create mode 100644 internal/ciparity/surface_parity_test.go create mode 100644 scripts/ci-guards/surface-parity-mcp-exemptions.yaml diff --git a/internal/ciparity/doc.go b/internal/ciparity/doc.go new file mode 100644 index 0000000..1defc39 --- /dev/null +++ b/internal/ciparity/doc.go @@ -0,0 +1,20 @@ +// Package ciparity hosts cross-surface contract-parity tests. +// +// Per post-v2.1.0 anti-rot item 2 (Auditable Codebase Bundle), this +// package contains tests that walk source files (router.go, +// openapi.yaml, the MCP tools*.go catalogue, cmd/cli/main.go) and +// assert invariants ACROSS those surfaces — e.g. "every MCP tool +// follows the canonical naming convention" or "the MCP tool count +// does not regress below the documented floor." +// +// The package is stdlib-only by design: the tests read source files +// with os.ReadFile and scan them with regexp (no AST parsing). This keeps +// the test runnable without pulling in the rest of the codebase's +// transitive dependencies — a developer running `go test ./internal/ciparity/...` +// gets a fast, self-contained signal. +// +// The router ↔ openapi.yaml parity test lives separately in +// internal/api/router/openapi_parity_test.go (TestRouter_OpenAPIParity) +// because it predates this package and operates on the same AST that +// TestRouterRBACGateCoverage already needs. Don't duplicate it here. +package ciparity diff --git a/internal/ciparity/surface_parity_test.go b/internal/ciparity/surface_parity_test.go new file mode 100644 index 0000000..795f115 --- /dev/null +++ b/internal/ciparity/surface_parity_test.go @@ -0,0 +1,279 @@ +package ciparity + +// surface_parity_test.go — per post-v2.1.0 anti-rot item 2 (Auditable +// Codebase Bundle). 
+// +// Four tests, all stdlib-only: +// +// 1. TestSurfaceParity_MCPToolCatalogue (HARD GATE) +// Every MCP tool name matches the `certctl_(_)*` +// convention; no duplicates across files; total count ≥ +// mcpBaselineFloor. Catches accidental tool deletions + naming- +// convention drift. +// +// 2. TestSurfaceParity_CLICommandCatalogue (INFORMATIONAL — t.Log only) +// Walks cmd/cli/main.go's switch-case dispatcher. Per frozen +// decision 0.9, warn-only until the CLI surface stabilizes. +// +// 3. TestSurfaceParity_OpenAPI_MCPHeuristicCoverage (INFORMATIONAL) +// Reports the fraction of OpenAPI operations whose path tokens +// overlap with any MCP tool name token. Trend metric, not a gate. +// +// 4. TestSurfaceParity_Summary (INFORMATIONAL) +// One-glance summary of the four surface counts. +// +// All file paths are resolved relative to the repo root, which this +// test discovers by walking up from $PWD until it finds go.mod. Keeps +// the test runnable from any working directory inside the repo. + +import ( + "errors" + "os" + "path/filepath" + "regexp" + "sort" + "strings" + "testing" +) + +// mcpBaselineFloor — see header doc. Lower it when a deletion is +// deliberate; the diff captures the change. +const mcpBaselineFloor = 150 + +var ( + mcpToolNameRe = regexp.MustCompile(`^certctl_[a-z][a-z0-9_]*[a-z0-9]$`) + mcpNameDeclRe = regexp.MustCompile(`Name:\s*"(certctl_[a-z0-9_]+)"`) + methodPathRe = regexp.MustCompile(`^(GET|POST|PUT|DELETE|PATCH|OPTIONS|HEAD) /`) + openapiPathRe = regexp.MustCompile(`^ (/[^:]+):\s*$`) + openapiVerbRe = regexp.MustCompile(`^ (get|post|put|delete|patch|options|head):\s*$`) + caseLiteralRe = regexp.MustCompile(`case\s+"([a-z\-]+)":`) +) + +// mcpToolFiles lists the (non-test) Go files expected to register +// MCP tools. 
+func mcpToolFiles(repo string) []string { + base := filepath.Join(repo, "internal", "mcp") + return []string{ + filepath.Join(base, "tools.go"), + filepath.Join(base, "tools_audit_fix.go"), + filepath.Join(base, "tools_auth.go"), + filepath.Join(base, "tools_auth_bundle2.go"), + filepath.Join(base, "tools_est.go"), + } +} + +func findRepoRoot(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + cur := wd + for { + if _, err := os.Stat(filepath.Join(cur, "go.mod")); err == nil { + return cur + } + parent := filepath.Dir(cur) + if parent == cur { + t.Fatalf("no go.mod found from %s upward", wd) + } + cur = parent + } +} + +// readFileOrFail reads a file and fails the test (t.Fatalf) on any +// error; a missing file gets a hint that the test needs updating. +func readFileOrFail(t *testing.T, p string) []byte { + t.Helper() + body, err := os.ReadFile(p) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + t.Fatalf("expected file missing: %s (refactor? — update the test)", p) + } + t.Fatalf("read %s: %v", p, err) + } + return body +} + +// TestSurfaceParity_MCPToolCatalogue is a HARD gate. +// +// Asserts: +// +// - Every MCP tool name conforms to certctl_(_)*. +// - No tool name is registered in more than one tools*.go file. +// - Total tools ≥ mcpBaselineFloor. 
+func TestSurfaceParity_MCPToolCatalogue(t *testing.T) { + repo := findRepoRoot(t) + names := map[string]string{} + for _, path := range mcpToolFiles(repo) { + body := readFileOrFail(t, path) + base := filepath.Base(path) + for _, m := range mcpNameDeclRe.FindAllStringSubmatch(string(body), -1) { + name := m[1] + if !mcpToolNameRe.MatchString(name) { + t.Errorf("MCP tool name %q in %s does not match certctl_(_)* — fix the name or relax the convention deliberately", + name, base) + continue + } + if other, dup := names[name]; dup { + t.Errorf("MCP tool name %q duplicated: first in %s, again in %s — pick a unique name", + name, filepath.Base(other), base) + continue + } + names[name] = path + } + } + if len(names) < mcpBaselineFloor { + t.Errorf("MCP tool count regressed: %d found, baseline floor %d. "+ + "If the deletion is intentional, lower mcpBaselineFloor in this test in the SAME commit. "+ + "If accidental, restore the deleted tools.", + len(names), mcpBaselineFloor) + } + t.Logf("MCP tool catalogue: %d tools (baseline floor %d)", len(names), mcpBaselineFloor) +} + +// TestSurfaceParity_CLICommandCatalogue — informational only. +// +// Walks cmd/cli/main.go's switch-case dispatcher and reports the +// distinct verbs handled. Returns success regardless of contents. +// Promoted to fail-on-miss when the CLI surface stabilizes. 
+func TestSurfaceParity_CLICommandCatalogue(t *testing.T) { + repo := findRepoRoot(t) + body := readFileOrFail(t, filepath.Join(repo, "cmd", "cli", "main.go")) + verbs := map[string]struct{}{} + for _, m := range caseLiteralRe.FindAllStringSubmatch(string(body), -1) { + verbs[m[1]] = struct{}{} + } + if len(verbs) == 0 { + t.Fatal("CLI scanner found zero verbs — likely a refactor; update the test") + } + out := make([]string, 0, len(verbs)) + for v := range verbs { + out = append(out, v) + } + sort.Strings(out) + t.Logf("CLI verb catalogue (%d distinct case literals; informational only per frozen decision 0.9):\n %s", + len(out), strings.Join(out, ", ")) +} + +// scanOpenAPIOps walks openapi.yaml's paths block and returns every +// (METHOD, PATH) tuple. Mirrors the parser used by +// internal/api/router/openapi_parity_test.go::scanOpenAPIOperations. +func scanOpenAPIOps(t *testing.T, path string) []string { + t.Helper() + body := readFileOrFail(t, path) + var out []string + inPaths := false + currentPath := "" + for _, line := range strings.Split(string(body), "\n") { + if strings.HasPrefix(line, "paths:") { + inPaths = true + continue + } + if inPaths && line != "" && !strings.HasPrefix(line, " ") { + inPaths = false + continue + } + if !inPaths { + continue + } + if m := openapiPathRe.FindStringSubmatch(line); m != nil { + currentPath = m[1] + continue + } + if m := openapiVerbRe.FindStringSubmatch(line); m != nil && currentPath != "" { + out = append(out, strings.ToUpper(m[1])+" "+currentPath) + } + } + return out +} + +// scanRouterRoutes scans internal/api/router/router.go for route +// registrations. Uses a regex (not go/ast) to keep this package +// stdlib-only; the router.go file is large but the patterns we care +// about are stable enough that regex is sufficient. +func scanRouterRoutes(t *testing.T, path string) []string { + t.Helper() + body := readFileOrFail(t, path) + // Match r.Register("METHOD /path", ...) and r.mux.Handle("METHOD /path", ...). 
+ registerRe := regexp.MustCompile(`r\.Register\(\s*"([A-Z]+ /[^"]+)"`) + muxHandleRe := regexp.MustCompile(`r\.mux\.Handle\(\s*"([A-Z]+ /[^"]+)"`) + seen := map[string]struct{}{} + for _, m := range registerRe.FindAllStringSubmatch(string(body), -1) { + seen[m[1]] = struct{}{} + } + for _, m := range muxHandleRe.FindAllStringSubmatch(string(body), -1) { + seen[m[1]] = struct{}{} + } + out := make([]string, 0, len(seen)) + for s := range seen { + out = append(out, s) + } + return out +} + +// TestSurfaceParity_OpenAPI_MCPHeuristicCoverage — informational. +// +// For each OpenAPI operation, splits the path into tokens; if any +// token appears in any MCP tool name's token set, the op is "covered." +// This is a heuristic — the actual semantic map (operationId → +// MCP tool) is not declared in the source, so we approximate. +func TestSurfaceParity_OpenAPI_MCPHeuristicCoverage(t *testing.T) { + repo := findRepoRoot(t) + specOps := scanOpenAPIOps(t, filepath.Join(repo, "api", "openapi.yaml")) + mcpTokens := map[string]struct{}{} + for _, path := range mcpToolFiles(repo) { + body := readFileOrFail(t, path) + for _, m := range mcpNameDeclRe.FindAllStringSubmatch(string(body), -1) { + for _, tok := range strings.Split(strings.TrimPrefix(m[1], "certctl_"), "_") { + mcpTokens[tok] = struct{}{} + } + } + } + covered := 0 + for _, op := range specOps { + parts := strings.Split(op, " ") + if len(parts) != 2 { + continue + } + segs := strings.FieldsFunc(parts[1], func(r rune) bool { + return r == '/' || r == '{' || r == '}' || r == '-' + }) + hit := false + for _, s := range segs { + if _, ok := mcpTokens[strings.ToLower(s)]; ok { + hit = true + break + } + } + if hit { + covered++ + } + } + if len(specOps) == 0 { + t.Fatal("openapi.yaml scan returned zero operations — fix the test") + } + pct := (covered * 100) / len(specOps) + t.Logf("OpenAPI↔MCP heuristic coverage (informational): %d of %d ops (%d%%) share at least one path token with an MCP tool name", + covered, 
len(specOps), pct) +} + +// TestSurfaceParity_Summary logs the four surface counts side-by-side (MCP and CLI are raw match counts, not de-duplicated). +func TestSurfaceParity_Summary(t *testing.T) { + repo := findRepoRoot(t) + routes := scanRouterRoutes(t, filepath.Join(repo, "internal", "api", "router", "router.go")) + specOps := scanOpenAPIOps(t, filepath.Join(repo, "api", "openapi.yaml")) + mcpCount := 0 + for _, path := range mcpToolFiles(repo) { + body := readFileOrFail(t, path) + mcpCount += len(mcpNameDeclRe.FindAllStringSubmatch(string(body), -1)) + } + cliBody := readFileOrFail(t, filepath.Join(repo, "cmd", "cli", "main.go")) + cliCount := len(caseLiteralRe.FindAllStringSubmatch(string(cliBody), -1)) + t.Logf("Surface parity summary (informational):\n"+ + " router routes : %d\n"+ + " OpenAPI ops : %d\n"+ + " MCP tools : %d (floor %d)\n"+ + " CLI verbs : %d (warn-only — frozen decision 0.9)", + len(routes), len(specOps), mcpCount, mcpBaselineFloor, cliCount) +} diff --git a/scripts/ci-guards/surface-parity-mcp-exemptions.yaml b/scripts/ci-guards/surface-parity-mcp-exemptions.yaml new file mode 100644 index 0000000..ceafd7b --- /dev/null +++ b/scripts/ci-guards/surface-parity-mcp-exemptions.yaml @@ -0,0 +1,34 @@ +# scripts/ci-guards/surface-parity-mcp-exemptions.yaml +# +# Allowlist for OpenAPI operations that are intentionally NOT mirrored +# in the MCP tool catalogue. Reserved for (not yet read by) +# internal/ciparity/surface_parity_test.go::TestSurfaceParity_*. +# +# The current MCP parity tests are informational (per frozen decision +# 0.9). This file exists so when those tests are promoted to hard +# gates, the carve-outs are already documented and the promotion is +# mechanical. 
+# +# Each entry shape: +# +# - operation: "METHOD /api/v1/path" +# justification: "one-line reason this is legitimately HTTP-only" +# expires: "YYYY-MM-DD" # required; 90-day default +# +# Categories of legitimate carve-outs (DO add these when the test is +# promoted to fail-on-miss): +# +# - ACME wire protocol (RFC 8555 + RFC 9773 ARI) +# - SCEP wire protocol (RFC 8894) +# - EST wire protocol (RFC 7030) +# - OCSP responder +# - CRL distribution +# - Healthcheck / readiness / version endpoints +# - OIDC callback / back-channel-logout +# - SPA fallback for the embedded web UI +# +# DO NOT add entries here to silence the test on an oversight. If an +# operation should have an MCP tool and doesn't, that's the bug — add +# the tool. + +exceptions: [] From 3fe511189f92e34b2cc1753dd265886e3c231390 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:10:27 +0000 Subject: [PATCH 4/7] feat(ci): item-5 doc rot detector (90d warn / 120d fail) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/ci-guards/doc-rot-detector.sh — walks every *.md under docs/, parses the '> Last reviewed: YYYY-MM-DD' blockquote convention established by the 2026-05-04 docs overhaul, emits: - ::warning:: GitHub annotation when a doc is >= 90 days old (heads-up; non-blocking). - ::error:: + exit 1 when >= 120 days (build-blocking). Uses HEAD commit timestamp (git log -1 --format=%cs) as 'now' rather than wall clock — keeps the guard reproducible on a release that's been on a shelf. Verified in sandbox: - Clean run: 90 docs scanned, 88 dated (2 in docs/archive/ allowlisted in bulk), 0 missing field, 0 warns, 0 fails. - Negative test (backdated docs/README.md to 2025-12-01, 162d): fires with '::error::Docs older than 120 days (build-blocking)' + three remediation paths listed. 
Allowlist at scripts/ci-guards/doc-rot-detector-exceptions.yaml: - 'docs/archive/' bulk-allowlisted (intentionally frozen content) - Per-doc entries require name + justification + expiration date; expired entries fail the guard. Bootstrap sweep NOT required — baseline survey at branch creation shows oldest doc is 7 days old (2026-05-05); zero docs over either threshold today. Forward-looking insurance only. Audit-Closes: post-v2.1.0-anti-rot/item-5 --- .../doc-rot-detector-exceptions.yaml | 26 +++ scripts/ci-guards/doc-rot-detector.sh | 182 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 scripts/ci-guards/doc-rot-detector-exceptions.yaml create mode 100755 scripts/ci-guards/doc-rot-detector.sh diff --git a/scripts/ci-guards/doc-rot-detector-exceptions.yaml b/scripts/ci-guards/doc-rot-detector-exceptions.yaml new file mode 100644 index 0000000..5aa7bfc --- /dev/null +++ b/scripts/ci-guards/doc-rot-detector-exceptions.yaml @@ -0,0 +1,26 @@ +# scripts/ci-guards/doc-rot-detector-exceptions.yaml +# +# Allowlist for the doc-rot detector +# (scripts/ci-guards/doc-rot-detector.sh). +# +# Two entry shapes: +# +# A. Directory bulk-allowlist (path ends with "/"): +# +# - path: "docs/archive/" +# justification: "frozen historical content; intentionally not reviewed" +# +# B. Per-doc allowlist with expiration: +# +# - path: "docs/reference/some-doc.md" +# justification: "why this doc is exempt from the age check" +# expires: "YYYY-MM-DD" # required for per-doc; the guard +# # rejects entries whose expires date +# # has passed. +# +# DO NOT add per-doc entries here to silence the gate on a doc that's +# legitimately drifted. If the doc is wrong, fix it. 
+ +exceptions: + - path: "docs/archive/" + justification: "frozen historical content (per the 2026-05-04 docs overhaul) — intentionally not subject to the freshness gate" diff --git a/scripts/ci-guards/doc-rot-detector.sh b/scripts/ci-guards/doc-rot-detector.sh new file mode 100755 index 0000000..4bb8f57 --- /dev/null +++ b/scripts/ci-guards/doc-rot-detector.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# scripts/ci-guards/doc-rot-detector.sh +# +# Per post-v2.1.0 anti-rot item 5 (Auditable Codebase Bundle). +# +# Walks every *.md under docs/ and parses the "> Last reviewed: +# YYYY-MM-DD" blockquote line (the convention established by the +# 2026-05-04 docs overhaul — every doc carries one). Emits: +# +# - ::warning:: GitHub annotation (yellow, non-blocking) when a doc +# is at least 90 days old vs HEAD's commit date. +# - ::error:: GitHub annotation + exit 1 when a doc is at least +# 120 days old. +# +# Uses HEAD's commit date (git log -1 --format=%cs HEAD) as "now" +# rather than wall-clock — keeps the guard reproducible on a release +# that's been on a shelf. A 2-year-old commit verified today should +# fail the same docs it failed back then, not new ones. +# +# Allowlist: scripts/ci-guards/doc-rot-detector-exceptions.yaml +# (every entry carries a one-line justification; per-doc entries also +# carry an expiration date). docs/archive/** is allowlisted in bulk by +# directory; it's intentionally frozen historical content and +# shouldn't keep getting reviewed. + +set -e + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +EXCEPTIONS_FILE="${REPO_ROOT}/scripts/ci-guards/doc-rot-detector-exceptions.yaml" + +WARN_DAYS="${CERTCTL_DOC_ROT_WARN_DAYS:-90}" +FAIL_DAYS="${CERTCTL_DOC_ROT_FAIL_DAYS:-120}" + +cd "$REPO_ROOT" + +# "Now" = the commit timestamp of HEAD, in YYYY-MM-DD form. Falls back +# to the wall clock if git fails (e.g., guard run outside a repo).
+NOW_DATE="$(git -C "$REPO_ROOT" log -1 --format=%cs HEAD 2>/dev/null || date -u +%Y-%m-%d)" + +python3 - "$REPO_ROOT" "$EXCEPTIONS_FILE" "$NOW_DATE" "$WARN_DAYS" "$FAIL_DAYS" <<'PY' +import os, sys, datetime, pathlib, re + +repo_root = pathlib.Path(sys.argv[1]) +exceptions_path = pathlib.Path(sys.argv[2]) +now_str = sys.argv[3] +warn_days = int(sys.argv[4]) +fail_days = int(sys.argv[5]) + +try: + now = datetime.date.fromisoformat(now_str) +except Exception: + sys.stderr.write(f"could not parse now={now_str!r}\n") + sys.exit(2) + +# Load allowlist. Same tiny YAML reader the other guards use. +allowlist_paths = set() +per_doc = {} +if exceptions_path.exists(): + txt = exceptions_path.read_text() + cur = None + for raw in txt.splitlines(): + line = raw.rstrip() + if not line.strip() or line.lstrip().startswith("#"): + continue + if line.lstrip().startswith("- path:"): + cur = {"path": line.split(":", 1)[1].strip().strip('"').strip("'")} + # entries can be a directory (path ends with /) or a single file + if cur["path"].endswith("/"): + allowlist_paths.add(cur["path"]) + else: + per_doc[cur["path"]] = cur + continue + if cur is not None and line.startswith(" "): + if ":" not in line: + continue + k, v = line.split(":", 1) + cur[k.strip()] = v.strip().strip('"').strip("'") + +LAST_REVIEWED_RE = re.compile(r"^>\s*Last reviewed:\s*(\d{4}-\d{2}-\d{2})\s*$", re.MULTILINE) + +docs_root = repo_root / "docs" +if not docs_root.exists(): + sys.stderr.write("docs/ not found — nothing to check\n") + sys.exit(0) + +# Collect every doc file. 
+docs = [] +for fp in docs_root.rglob("*.md"): + rel = fp.relative_to(repo_root).as_posix() + docs.append((rel, fp)) + +def is_in_allowlisted_dir(rel: str) -> bool: + for prefix in allowlist_paths: + if rel.startswith(prefix): + return True + return False + +def per_doc_active(rel: str) -> (bool, str): + if rel not in per_doc: + return False, "" + e = per_doc[rel] + exp = e.get("expires") + just = e.get("justification", "") + if not exp: + return False, "allowlist entry missing 'expires:'" + try: + ed = datetime.date.fromisoformat(exp) + except Exception: + return False, f"allowlist entry has malformed expires: {exp!r}" + if ed < now: + return False, f"allowlist entry expired on {exp}" + if not just: + return False, "allowlist entry has no justification" + return True, f"allowlisted until {exp}: {just}" + +warn_rows = [] +fail_rows = [] +missing_field_rows = [] +skipped = 0 +total_checked = 0 + +for rel, fp in sorted(docs): + if is_in_allowlisted_dir(rel): + skipped += 1 + continue + ok, msg = per_doc_active(rel) + if ok: + skipped += 1 + continue + body = fp.read_text(errors="ignore") + m = LAST_REVIEWED_RE.search(body) + if not m: + missing_field_rows.append(rel) + continue + try: + reviewed = datetime.date.fromisoformat(m.group(1)) + except Exception: + missing_field_rows.append(rel + f" (unparseable date {m.group(1)!r})") + continue + total_checked += 1 + age = (now - reviewed).days + if age >= fail_days: + fail_rows.append((rel, reviewed.isoformat(), age)) + elif age >= warn_days: + warn_rows.append((rel, reviewed.isoformat(), age)) + +print(f"doc-rot-detector — now={now.isoformat()} warn≥{warn_days}d fail≥{fail_days}d") +print(f" total docs scanned: {len(docs)}, allowlisted: {skipped}, dated: {total_checked}, missing date field: {len(missing_field_rows)}") +print() + +if missing_field_rows: + print("::warning::Docs missing or unparseable '> Last reviewed: YYYY-MM-DD' line:") + for r in missing_field_rows: + print(f" - {r}") + print() + print(" Add the 
convention line near the top of each doc, e.g.:") + print(' > Last reviewed: 2026-MM-DD') + +if warn_rows: + print(f"::warning::Docs older than {warn_days} days (heads-up, non-blocking):") + for rel, d, age in warn_rows: + print(f" - {rel}: reviewed {d} ({age}d ago)") + print() + +if fail_rows: + print(f"::error::Docs older than {fail_days} days (build-blocking):") + for rel, d, age in fail_rows: + print(f" - {rel}: reviewed {d} ({age}d ago)") + print() + print(" Fix options:") + print(" 1. Re-read the doc against the repo, fix any drift, bump '> Last reviewed:' to today.") + print(" 2. If the doc is intentionally frozen, move it under docs/archive/ (allowlisted in bulk).") + print(f" 3. Add a per-doc allowlist row to {exceptions_path.relative_to(repo_root)} with a justification + expiration.") + sys.exit(1) + +# Missing-date-field counts as a hard fail too — the convention is +# load-bearing. +if missing_field_rows: + sys.exit(1) + +print("OK — every doc under docs/ has a recent '> Last reviewed:' date.") +PY From 3ede1b726f94d629483de804bc82c322e60a9bef Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:11:32 +0000 Subject: [PATCH 5/7] feat(ci): item-6 cold-DB compose smoke script (CI wiring in Phase 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/ci-guards/cold-db-compose-smoke.sh — wipes the postgres volume (docker compose down -v), brings the stack up cold, mints a day-0 admin via /api/v1/auth/bootstrap, issues + renews + revokes a test certificate, asserts the three audit rows exist, tears down. Catches the bug class fixed by commit def4be9 (the 2026-05-09 migration 000045 broken INSERT that the warm-DB integration suite missed). The 2026-04-30 migration regression class generally. 
Tunables via environment: - COLD_DB_SMOKE_STARTUP_TIMEOUT (default 300s/svc) - COLD_DB_SMOKE_PROBE_TIMEOUT (default 180s) - COLD_DB_SMOKE_SERVER_URL (default https://localhost:8443) - COLD_DB_SMOKE_CACERT (default deploy/test/certs/ca.crt) On failure: dumps `docker compose logs --tail 200` for postgres, certctl-server, certctl-agent, certctl-tls-init so the CI failure is actionable without a re-run. Sandbox VERIFICATION: bash syntax-check (bash -n) passes. Full smoke run NOT executed in the sandbox — no Docker available here. The operator runs it from their workstation as the Phase 6 negative-test ladder (introducing a broken migration; confirming the script fails with the migration error in the dumped logs). CI wiring (.github/workflows/ci.yml::cold-db-compose-smoke job) lands in the next commit (Phase 5). Audit-Closes: post-v2.1.0-anti-rot/item-6 --- scripts/ci-guards/cold-db-compose-smoke.sh | 180 +++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100755 scripts/ci-guards/cold-db-compose-smoke.sh diff --git a/scripts/ci-guards/cold-db-compose-smoke.sh b/scripts/ci-guards/cold-db-compose-smoke.sh new file mode 100755 index 0000000..7efa89c --- /dev/null +++ b/scripts/ci-guards/cold-db-compose-smoke.sh @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# scripts/ci-guards/cold-db-compose-smoke.sh +# +# Per post-v2.1.0 anti-rot item 6 (Auditable Codebase Bundle). +# +# The bug class this catches: a migration whose .up.sql is broken in a +# way the unit tests / integration suite misses because they reuse a +# warm DB across runs. The canonical case: 2026-05-09 migration +# 000045's broken INSERT, surfaced only by a cold `docker compose up` +# and fixed in commit 6444e13. This guard runs that very check on +# every push. +# +# Workflow: +# 1. docker compose down -v --remove-orphans (wipe volumes) +# 2. docker compose up -d (cold boot) +# 3. wait up to 5 min for healthchecks (postgres, +# certctl-server, +# certctl-agent) +# 4. 
mint a day-0 admin via /api/v1/auth/bootstrap (Bundle 1 path) +# 5. issue + renew + revoke a certificate (HTTP API) +# 6. assert audit rows for each step +# 7. docker compose down -v (clean up) +# +# Total runtime: ~3-5 min on warm Docker, ~5-10 min cold. +# +# Failure paths dump `docker compose logs` for every service to make +# CI failures actionable without a re-run. +# +# This script is invoked by .github/workflows/ci.yml::cold-db-compose-smoke. +# Runs locally for developers via `bash scripts/ci-guards/cold-db-compose-smoke.sh`. + +set -e +set -o pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +cd "$REPO_ROOT/deploy" + +# Tunables (the CI job overrides these as needed). +STARTUP_TIMEOUT_SECONDS="${COLD_DB_SMOKE_STARTUP_TIMEOUT:-300}" # 5 min +PROBE_TIMEOUT_SECONDS="${COLD_DB_SMOKE_PROBE_TIMEOUT:-180}" # 3 min +SERVER_URL="${COLD_DB_SMOKE_SERVER_URL:-https://localhost:8443}" +CACERT_PATH="${COLD_DB_SMOKE_CACERT:-${REPO_ROOT}/deploy/test/certs/ca.crt}" + +# --- helpers ---------------------------------------------------------------- + +log() { echo "[cold-db-smoke] $*"; } + +dump_logs_on_failure() { + log "FAILURE — dumping service logs:" + docker compose ps || true + for svc in postgres certctl-server certctl-agent certctl-tls-init; do + echo + echo "==== $svc ====" + docker compose logs --no-color --tail 200 "$svc" 2>&1 || true + done +} + +trap 'dump_logs_on_failure' ERR + +wait_for_service_healthy() { + local svc="$1" deadline=$(( $(date +%s) + STARTUP_TIMEOUT_SECONDS )) + while [ "$(date +%s)" -lt "$deadline" ]; do + local state + state="$(docker compose ps --format json "$svc" 2>/dev/null | python3 -c ' +import json, sys +try: + line = sys.stdin.read().strip() + if not line: + print("not-up") + sys.exit(0) + # docker compose ps emits one JSON object per line (NDJSON) on newer + # versions; older versions emit a JSON array. Handle both. 
+ if line.startswith("["): + rows = json.loads(line) + else: + rows = [json.loads(l) for l in line.splitlines() if l.strip()] + if not rows: + print("not-up") + else: + print(rows[0].get("Health", rows[0].get("State", "?"))) +except Exception as e: + print(f"err: {e}") +')" + if [ "$state" = "healthy" ] || [ "$state" = "running" ]; then + log " $svc → $state" + return 0 + fi + sleep 2 + done + log " $svc did NOT reach healthy within $STARTUP_TIMEOUT_SECONDS s (last state: $state)" + return 1 +} + +http_call() { + # http_call [data_json] + local method="$1" path="$2" data="${3:-}" + local args=(--silent --show-error --max-time 30 -X "$method" "$SERVER_URL$path") + if [ -f "$CACERT_PATH" ]; then + args+=(--cacert "$CACERT_PATH") + else + args+=(--insecure) + fi + if [ -n "${KEY:-}" ]; then + args+=(-H "Authorization: Bearer $KEY") + fi + if [ -n "$data" ]; then + args+=(-H "Content-Type: application/json" -d "$data") + fi + curl "${args[@]}" +} + +# --- the smoke --------------------------------------------------------------- + +log "1/7 down -v --remove-orphans (wiping postgres volume)" +docker compose down -v --remove-orphans 2>&1 | tail -3 || true + +log "2/7 up -d (cold boot)" +docker compose up -d 2>&1 | tail -3 + +log "3/7 waiting for healthchecks (timeout ${STARTUP_TIMEOUT_SECONDS}s/svc)" +wait_for_service_healthy postgres +wait_for_service_healthy certctl-server +# certctl-agent depends on the demo seed having run; only wait when +# CERTCTL_DEMO_SEED=true is in effect (the bundled clean compose +# doesn't always seed). Best-effort. +wait_for_service_healthy certctl-agent || log " (agent healthcheck skipped — non-demo compose)" + +log "4/7 minting day-0 admin via /api/v1/auth/bootstrap" +TOKEN="$(openssl rand -base64 32 | tr -d '\n')" +# Restart the server with the bootstrap token so the strategy is +# active. Compose --env-file is the lightest path. 
+echo "CERTCTL_BOOTSTRAP_TOKEN=$TOKEN" > /tmp/_smoke.env +docker compose --env-file /tmp/_smoke.env up -d --force-recreate certctl-server 2>&1 | tail -2 +sleep 5 +wait_for_service_healthy certctl-server + +BOOTSTRAP_BODY="$(http_call POST /api/v1/auth/bootstrap "{\"token\":\"$TOKEN\",\"actor_name\":\"smoke-admin\"}")" +KEY="$(echo "$BOOTSTRAP_BODY" | python3 -c 'import json,sys; print(json.load(sys.stdin)["key_value"])')" +if [ -z "$KEY" ]; then + log " bootstrap did NOT return a key_value — body was:" + echo "$BOOTSTRAP_BODY" + exit 1 +fi +log " admin minted (actor=smoke-admin)" + +log "5/7 issuing a test certificate" +# Use the default profile + demo CA. The exact shape may need a tweak +# depending on the compose's seeded issuers — fail loudly with the +# response body if the API rejects the request. +ISSUE_BODY='{"common_name":"smoke-test.local","profile_id":"profile-default","environment":"test","owner_id":"o-platform"}' +ISSUE_RESP="$(http_call POST /api/v1/certificates "$ISSUE_BODY")" +CERT_ID="$(echo "$ISSUE_RESP" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("id") or d.get("certificate",{}).get("id",""))')" +if [ -z "$CERT_ID" ]; then + log " issue failed; response body:" + echo "$ISSUE_RESP" + exit 1 +fi +log " cert issued: $CERT_ID" + +log "6/7 renewing the certificate" +http_call POST "/api/v1/certificates/$CERT_ID/renew" >/dev/null +log " renewed" + +log "7/7 revoking the certificate + asserting audit rows" +http_call POST "/api/v1/certificates/$CERT_ID/revoke" '{"reason":"smoke-test"}' >/dev/null +AUDIT_BODY="$(http_call GET "/api/v1/audit?limit=50")" +for action in cert.issued cert.renewed cert.revoked; do + if ! 
echo "$AUDIT_BODY" | python3 -c "import json,sys; d=json.load(sys.stdin); evs=d.get('events') or d.get('audit',{}).get('events') or []; sys.exit(0 if any(e.get('action')=='$action' for e in evs) else 1)"; then + log " MISSING audit row: $action" + echo "$AUDIT_BODY" | head -200 + exit 1 + fi +done +log " audit rows present: cert.issued, cert.renewed, cert.revoked" + +log "DONE — tearing down" +trap - ERR +docker compose down -v 2>&1 | tail -2 +rm -f /tmp/_smoke.env +log "PASS" From 255f61e6c541f59cc937636a5e6d5fed43106922 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:12:39 +0000 Subject: [PATCH 6/7] ci(workflows): wire Auditable Codebase Bundle guards into ci.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to .github/workflows/ci.yml: 1. Add internal/ciparity/... to the Go Test with Coverage package list. The four surface-parity tests run alongside everything else and contribute to the coverage report. 2. Skip cold-db-compose-smoke.sh in the existing generic regression-guards loop (under go-build-and-test). The script needs Docker + a fresh postgres volume; including it here would always fail because that job doesn't bring up compose. The other two new Bundle guards (complete-path-config-coverage.sh, doc-rot-detector.sh) are plain-shell + Python and need no Docker — the existing 'for g in scripts/ci-guards/*.sh' loop auto-picks them up. 3. New top-level job: 'cold-db-compose-smoke' - needs: go-build-and-test (don't waste compute if the basics are red) - 15-min wall-clock cap (image pull + compose-up + probe + teardown) - Dumps compose logs on failure for postgres + certctl-server + certctl-agent + certctl-tls-init so the failure is actionable without a re-run. Validated: - python3 -c 'import yaml; yaml.safe_load(...)' → yaml ok Operator follow-up: - Add 'cold-db-compose-smoke' to the master branch-protection required-checks list once the first successful run lands. 
Audit-Closes: post-v2.1.0-anti-rot/item-6 --- .github/workflows/ci.yml | 53 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d5e7ac..ff7ebc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,8 +106,10 @@ jobs: run: go test -race ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/scheduler/... ./internal/connector/... ./internal/crypto/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... -count=1 -timeout 300s - name: Go Test with Coverage + # internal/ciparity/... — post-v2.1.0 anti-rot item 2 surface- + # parity tests; stdlib-only so they always pass in this job. run: | - go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... -count=1 -cover -coverprofile=coverage.out + go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out - name: Check Coverage Thresholds # ci-pipeline-cleanup Phase 2: per-package floors moved to @@ -207,10 +209,23 @@ jobs: # Adding a new guard: drop a new .sh; this loop auto-picks it up. 
# Contract: each guard MUST exit 0 on clean repo, non-zero with # ::error:: prefix on regression. See scripts/ci-guards/README.md. + # + # SKIP cold-db-compose-smoke.sh — it needs Docker + a fresh + # postgres volume, which only exists in the dedicated + # `cold-db-compose-smoke` job below. Including it in this loop + # would always fail (no Docker on the runners that don't bring + # up compose). run: | set -e fail=0 for g in scripts/ci-guards/*.sh; do + case "$(basename "$g")" in + cold-db-compose-smoke.sh) + echo "::group::$(basename "$g") (skipped — runs in dedicated job)" + echo "::endgroup::" + continue + ;; + esac echo "::group::$(basename "$g")" if ! bash "$g"; then fail=1 @@ -219,6 +234,42 @@ jobs: done exit $fail + cold-db-compose-smoke: + # Per post-v2.1.0 anti-rot item 6 (Auditable Codebase Bundle). + # + # Catches migration-on-cold-DB regressions: wipe the postgres + # volume, bring the stack up cold, mint a day-0 admin, issue + + # renew + revoke a test certificate, assert audit rows, tear down. + # Targets the bug class that the warm-DB integration suite misses + # (canonical case: 2026-05-09 migration 000045 broken INSERT, + # fixed in commit 6444e13). + name: Cold-DB compose smoke + runs-on: ubuntu-latest + needs: go-build-and-test + steps: + - uses: actions/checkout@v4 + + - name: Show Docker versions + run: | + docker --version + docker compose version + + - name: Cold-DB compose smoke + # 15-min wall-clock cap covers cold image pull + compose-up + + # full issue/renew/revoke probe + teardown. Increase only if + # the underlying steps legitimately grow. 
+ timeout-minutes: 15 + run: bash scripts/ci-guards/cold-db-compose-smoke.sh + + - name: Dump compose logs on failure + if: failure() + run: | + cd deploy + for svc in postgres certctl-server certctl-agent certctl-tls-init; do + echo "==== $svc ====" + docker compose logs --no-color --tail 200 "$svc" || true + done + frontend-build: name: Frontend Build runs-on: ubuntu-latest From 9f7b5d89a55c3ce8ad5263e7a15e52e44934d846 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 12 May 2026 14:15:13 +0000 Subject: [PATCH 7/7] docs(contributor): document the Auditable Codebase Bundle guards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three doc changes for the bundle's discoverability: 1. New docs/contributor/ci-guards.md (185 lines) Entry-point doc for new contributors. Explains the four categories of guards (code-shape, contract-parity, build/dep, operational), the discipline that keeps them honest (allowlist + expiration), and how to add a new one. Cross-references scripts/ci-guards/README.md for the exhaustive list. 2. scripts/ci-guards/README.md — added a 'Forward-looking guards' subsection naming complete-path-config-coverage, doc-rot-detector, and cold-db-compose-smoke with their item references + a one-sentence description of what each catches. Replaced the stale '22 guards' header with 'Count: re-derive via ls' per the no-version-stamped-numbers convention from CLAUDE.md. 3. docs/README.md — wired ci-guards.md into the Contributor section navigation table. Bumped 'Last reviewed:' to 2026-05-12 on the two docs touched (docs/README.md, docs/contributor/ci-pipeline.md). Verified: doc-rot-detector.sh green at 91 docs scanned, 89 dated, 0 warns, 0 fails. 
Audit-Closes: post-v2.1.0-anti-rot/item-1 Audit-Closes: post-v2.1.0-anti-rot/item-2 Audit-Closes: post-v2.1.0-anti-rot/item-5 Audit-Closes: post-v2.1.0-anti-rot/item-6 --- docs/README.md | 3 +- docs/contributor/ci-guards.md | 83 +++++++++++++++++++++++++++++++++ docs/contributor/ci-pipeline.md | 2 +- scripts/ci-guards/README.md | 18 ++++++- 4 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 docs/contributor/ci-guards.md diff --git a/docs/README.md b/docs/README.md index 3721216..0a7a9ff 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # certctl Documentation -> Last reviewed: 2026-05-05 +> Last reviewed: 2026-05-12 The full docs index, organized by audience. Pick the section that matches what you need to do; each link below opens a focused doc rather than a wall of text. @@ -112,6 +112,7 @@ You're contributing to certctl, running tests locally, or trying to understand t | [GUI QA checklist](contributor/gui-qa-checklist.md) | Manual GUI verification pass for release | | [Release sign-off](contributor/release-sign-off.md) | Release-day checklist — code state, automated gates, manual QA, artefact verification | | [CI pipeline](contributor/ci-pipeline.md) | CI shape, regression guards, adding new checks | +| [CI guards](contributor/ci-guards.md) | Per-class CI guards (code-shape, contract-parity, build/dep, operational); how to add one | ## Archive diff --git a/docs/contributor/ci-guards.md b/docs/contributor/ci-guards.md new file mode 100644 index 0000000..92c3dcc --- /dev/null +++ b/docs/contributor/ci-guards.md @@ -0,0 +1,83 @@ +# CI guards + +> Last reviewed: 2026-05-12 + +CI guards are small scripts (shell + Python) and Go tests that pin invariants the v2 audit history showed are easy to lose. Each one runs on every push, fails the build on regression with a useful error message, and prints little more than a one-line confirmation on the happy path.
The canonical source is `scripts/ci-guards/` for shell guards and `internal/ciparity/` for Go-based parity tests. + +This page lives at `docs/contributor/ci-guards.md` and is the entry point for contributors who want to understand why a CI step is red, how to add a new guard, or where the allowlist for a given guard lives. The exhaustive list of shell guards is at `scripts/ci-guards/README.md`; this doc explains the categories + the discipline. + +## Why guards exist + +Two failure modes the v2 audit cycle surfaced repeatedly: + +The codebase grew faster than the docs and config could keep up. Env vars got added without consumers; OpenAPI ops were registered without router routes; docs went stale; a migration broke on cold-DB without any test catching it. Each one of those classes has a one-time-fix _per-instance_ pattern (re-read the doc, wire the env var) and a structural _per-class_ pattern (write a guard that fails the next time it happens). CI guards are the second. + +The team grew. Reviewers had to remember what each commit author had forgotten. CI guards externalize the institutional knowledge into checks — the build refuses to ship the lying field, the stale doc, the broken migration. New contributors don't need to know the audit history. + +## Categories + +The guards fall into four buckets, organized by what they pin: + +### Code-shape guards + +Catch defects in source files BEFORE they ship. Examples: `G-3-env-docs-drift.sh` (no env var defined-but-undocumented or documented-but-undefined), `complete-path-config-coverage.sh` (every env var has a non-config consumer), `T-1-frontend-page-coverage.sh` (every new GUI page has a sibling test file). + +### Contract-parity guards + +Catch drift across the four product surfaces — OpenAPI spec, HTTP router, MCP tool catalogue, CLI verb dispatcher. The router ↔ OpenAPI pin lives at `internal/api/router/openapi_parity_test.go::TestRouter_OpenAPIParity`. 
The MCP + CLI sweep lives at `internal/ciparity/surface_parity_test.go` (post-v2.1.0 anti-rot item 2). One hard gate: the MCP tool count cannot regress below `mcpBaselineFloor`. The CLI parity sweep is informational until the CLI surface stabilizes. + +### Build / dependency guards + +`H-001-bare-from.sh` (Dockerfile pin to `@sha256:`), `digest-validity.sh` (every digest actually resolves on the registry), `M-012-no-root-user.sh` (no Dockerfile ends as root), `bundle-8-*.sh` (frontend XSS / reverse-tabnabbing surface). These come out of specific audits and pin the closure. + +### Operational guards + +`cold-db-compose-smoke.sh` (wipe postgres volume, bring stack up cold, issue/renew/revoke, audit-row check). `doc-rot-detector.sh` (every doc reviewed within 120 days). These pin the operational reality, not the source shape. + +## When the build is red + +Find the failing step in the GitHub Actions UI. Every guard's output starts with the guard's own identifier and ends with one of: + +`::error::` followed by 2-4 remediation paths. The fastest path: read the remediation list, pick the option that fits, fix. + +`exit 1` without an `::error::` annotation — likely a `set -e` exit on an internal command that failed. Re-run with `bash -x scripts/ci-guards/<guard-name>.sh` locally to see where it died. + +If a guard is fundamentally wrong (e.g., refactor moved the code it scans), update the guard in the same PR that triggered the failure. Don't add a one-off allowlist to silence a real bug. + +## Adding a new guard + +The discipline in five steps. The first three are non-negotiable; the last two are courtesy. + +Drop a new `.sh` in `scripts/ci-guards/` with a head-comment block that names the bug class, lists the audit finding (if any) it closes, and explains the failure mode. Mirror the shape of an existing guard — `G-3-env-docs-drift.sh` and `digest-validity.sh` are the canonical bash+Python and pure-bash examples.
+ +Use `set -e` early; use `::error::` annotations on regression; exit 0 with one happy-path confirmation line. Take no arguments, require no env vars. The CI loop iterates every `*.sh` without args. + +Write the allowlist file alongside (`<guard-name>-exceptions.yaml`); each entry is a `- path: ...` (or `- name: ...`) row plus `justification` and `expires` fields. Make `expires` a required field — every exception has a hard expiration date, typically 90 days out. + +Verify on a deliberately broken state: introduce the regression, confirm the guard fires with a useful message, revert, confirm green. Capture the negative-test output in your PR description. + +Add a row to `scripts/ci-guards/README.md`. The CI loop auto-picks up the new file — no `ci.yml` edit required, unless the guard needs Docker (in which case it gets its own dedicated job; see `cold-db-compose-smoke` for the pattern). + +## Discipline: the allowlist trap + +Allowlists are dangerous. They start as a small concession ("this one env var is documented for an external script, not consumed by Go code") and become a junk drawer of unverified exemptions that mask real defects. The discipline that keeps that from happening: + +Every entry MUST carry a `justification:` field with a one-line reason. "Tech debt" is not a reason; "documented contract surface consumed by the ACME DNS-01 helper script — see `deploy/test/acme/dns01-export.sh`" is. + +Every entry MUST carry an `expires:` field with a hard date, typically 90 days out (the one exception is a directory bulk-allowlist row, such as `docs/archive/` in the doc-rot detector, which carries no expiration). The guards reject entries past their expiration. When an entry expires, the only paths forward are (a) close the underlying gap so the entry is no longer needed, or (b) re-justify with a fresh expiration. Both force a real review. + +If you're adding more than one entry to an allowlist in a single PR, that's a smell — usually the underlying class needs a small refactor, not three allowlist rows.
+ +## Where the bundles live + +The `Audit-Closes:` commit trailer convention (post-v2.1.0 anti-rot item 4) is the cross-reference between audit findings and the commits that closed them. Re-derive the closure history of any audit with: + + git log --grep='Audit-Closes: <audit-id>' + +The audit folder structure under `cowork/` (workspace-local; not in this repo) carries the per-audit RESULTS.md + findings.yaml. CLAUDE.md's "Audit closures" subsection is the current-state index of which audits are open vs closed. + +## Related + +The exhaustive guard list — `scripts/ci-guards/README.md`. +The CI pipeline architecture — `docs/contributor/ci-pipeline.md`. +The QA test suite — `docs/contributor/qa-test-suite.md`. diff --git a/docs/contributor/ci-pipeline.md b/docs/contributor/ci-pipeline.md index 5744bc4..bc4c183 100644 --- a/docs/contributor/ci-pipeline.md +++ b/docs/contributor/ci-pipeline.md @@ -1,6 +1,6 @@ # CI Pipeline — Operator Guide -> Last reviewed: 2026-05-05 +> Last reviewed: 2026-05-12 > Authoritative guide to certctl's CI pipeline shape. > Per the ci-pipeline-cleanup spec, Phase 12. diff --git a/scripts/ci-guards/README.md b/scripts/ci-guards/README.md index 69abeaa..ed22e28 100644 --- a/scripts/ci-guards/README.md +++ b/scripts/ci-guards/README.md @@ -53,7 +53,11 @@ Current helpers: 4. CI auto-picks up new scripts via the `for g in scripts/ci-guards/*.sh` loop in the `Regression guards` step — no ci.yml change required. -## The 22 guards in this directory +## Guards in this directory + +Count: re-derive on demand via `ls scripts/ci-guards/*.sh | wc -l`. The tables below name each one — keep them in sync as guards are added.
+ +### Per-finding regression guards | ID | Finding | Catches | |---|---|---| @@ -80,6 +84,18 @@ Current helpers: | `H-1-encryption-key-min-length` | H-1 closure follow-up (post-Phase-5 surfacing) | `CERTCTL_CONFIG_ENCRYPTION_KEY` literal in any `deploy/docker-compose*.yml` shorter than the 32-byte floor enforced by `internal/config/config.go::Validate()` | | `test-compose-scep-coherence` | post-Phase-5 surfacing of dead SCEP test config | `CERTCTL_SCEP_ENABLED=true` in test compose without (a) a CI job that runs the SCEP integration test, (b) the `ra.crt` + `ra.key` + `intune_trust_anchor.pem` fixtures committed to `deploy/test/fixtures/`, AND (c) the matching volume mount | +### Forward-looking guards (Auditable Codebase Bundle, post-v2.1.0 anti-rot) + +These guards catch defect classes BEFORE they get audit findings — they pin invariants on the codebase that the v2.0 audit history showed are easy to lose. + +| ID | Item | Catches | +|---|---|---| +| `complete-path-config-coverage` | post-v2.1.0 / item-1 | "Lying field" — `CERTCTL_*` env var defined in `internal/config/config.go` that no consumer outside `internal/config/` actually reads. Operator-facing config that the docs claim works but the code never honors. Companion Go test at `internal/config/coverage_test.go`. | +| `doc-rot-detector` | post-v2.1.0 / item-5 | Docs older than 90 days warn (yellow), older than 120 days fail (red). Uses HEAD commit timestamp for reproducibility. `docs/archive/` allowlisted in bulk. | +| `cold-db-compose-smoke` | post-v2.1.0 / item-6 | Migration-on-cold-DB regression (canonical: 2026-05-09 migration 000045 broken INSERT, commit `6444e13`). Wipes the postgres volume, brings the stack up cold, issue/renew/revoke + 3 audit rows. **Runs in its own GitHub Actions job** (`cold-db-compose-smoke`), NOT the generic regression-guards loop — needs Docker. | + +The fourth Bundle artifact (`internal/ciparity/`) is Go tests, not shell guards — runs under the standard Go test step. 
Pins the MCP tool catalogue floor + naming convention; reports CLI/MCP/OpenAPI surface counts as a trend metric. + ## Guards explicitly NOT here - **`QA-doc Part-count drift`** + **`QA-doc seed-count drift`** — these