Files
certctl/scripts/ci-guards/doc-rot-detector.sh
T
shankar0123 3fe511189f feat(ci): item-5 doc rot detector (90d warn / 120d fail)
scripts/ci-guards/doc-rot-detector.sh — walks every *.md under docs/,
parses the '> Last reviewed: YYYY-MM-DD' blockquote convention
established by the 2026-05-04 docs overhaul, emits:

  - ::warning:: GitHub annotation when a doc is >= 90 days old
    (heads-up; non-blocking).
  - ::error:: + exit 1 when >= 120 days (build-blocking).

Uses HEAD commit timestamp (git log -1 --format=%cs) as 'now' rather
than wall clock — keeps the guard reproducible on a release that's
been on a shelf.

Verified in sandbox:
  - Clean run: 90 docs scanned, 88 dated (2 in docs/archive/
    allowlisted in bulk), 0 missing field, 0 warns, 0 fails.
  - Negative test (backdated docs/README.md to 2025-12-01, 162d):
    fires with '::error::Docs older than 120 days (build-blocking)'
    + three remediation paths listed.

Allowlist at scripts/ci-guards/doc-rot-detector-exceptions.yaml:
  - 'docs/archive/' bulk-allowlisted (intentionally frozen content)
  - Per-doc entries require name + justification + expiration date;
    expired entries fail the guard.

Bootstrap sweep NOT required — baseline survey at branch creation
shows oldest doc is 7 days old (2026-05-05); zero docs over either
threshold today. Forward-looking insurance only.

Audit-Closes: post-v2.1.0-anti-rot/item-5
2026-05-12 14:10:27 +00:00

183 lines
6.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# scripts/ci-guards/doc-rot-detector.sh
#
# Per post-v2.1.0 anti-rot item 5 (Auditable Codebase Bundle).
#
# Walks every *.md under docs/ and parses the "> Last reviewed:
# YYYY-MM-DD" blockquote line (the convention established by the
# 2026-05-04 docs overhaul — every doc carries one). Emits:
#
# - ::warning:: GitHub annotation (yellow, non-blocking) when a doc
#   is at least 90 days old vs HEAD's commit timestamp.
# - ::error:: GitHub annotation + exit 1 when a doc is at least
#   120 days old.
#
# Uses HEAD's commit timestamp (git log -1 --format=%cs HEAD) as "now"
# rather than wall-clock — keeps the guard reproducible on a release
# that's been on a shelf. A 2-year-old commit verified today should
# fail the same docs it failed back then, not new ones.
#
# Allowlist: scripts/ci-guards/doc-rot-detector-exceptions.yaml
# (every entry carries a one-line justification + an expiration date).
# docs/archive/** is allowlisted in bulk by directory; it's
# intentionally frozen historical content and shouldn't keep getting
# reviewed.
set -e

# Locate the repository root relative to this script, which sits two
# directories below it.
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
EXCEPTIONS_FILE="${REPO_ROOT}/scripts/ci-guards/doc-rot-detector-exceptions.yaml"

# Age thresholds in days; each can be overridden through the environment.
WARN_DAYS="${CERTCTL_DOC_ROT_WARN_DAYS:-90}"
FAIL_DAYS="${CERTCTL_DOC_ROT_FAIL_DAYS:-120}"

cd "$REPO_ROOT"

# "Now" is HEAD's committer date in YYYY-MM-DD form. When git cannot
# provide it (e.g., the guard runs outside a repo), use today's UTC date.
if ! NOW_DATE="$(git -C "$REPO_ROOT" log -1 --format=%cs HEAD 2>/dev/null)"; then
  NOW_DATE="$(date -u +%Y-%m-%d)"
fi

# The checker itself is the inline Python program that follows, read from
# stdin; the five values above are passed as positional arguments.
python3 - "$REPO_ROOT" "$EXCEPTIONS_FILE" "$NOW_DATE" "$WARN_DAYS" "$FAIL_DAYS" <<'PY'
import os, sys, datetime, pathlib, re
# Positional arguments supplied by the bash wrapper:
#   1: repository root      2: allowlist file path
#   3: "now" as YYYY-MM-DD  4: warn threshold (days)  5: fail threshold (days)
repo_root = pathlib.Path(sys.argv[1])
exceptions_path = pathlib.Path(sys.argv[2])
now_str = sys.argv[3]

# A non-integer threshold is a usage error. Exit 2 with a message rather
# than letting int() raise: an uncaught traceback exits with status 1, which
# is the same status this guard uses for "docs are stale".
try:
    warn_days = int(sys.argv[4])
    fail_days = int(sys.argv[5])
except ValueError:
    sys.stderr.write(
        f"thresholds must be integers: warn={sys.argv[4]!r} fail={sys.argv[5]!r}\n"
    )
    sys.exit(2)

# date.fromisoformat raises ValueError on anything that is not a valid
# YYYY-MM-DD string; that is the only failure handled here.
try:
    now = datetime.date.fromisoformat(now_str)
except ValueError:
    sys.stderr.write(f"could not parse now={now_str!r}\n")
    sys.exit(2)
def _unquote(value):
    """Strip surrounding whitespace, then any wrapping double/single quotes."""
    return value.strip().strip('"').strip("'")


def _load_allowlist(path):
    """Read the exceptions file into ``(directory_prefixes, per_doc_entries)``.

    This is a deliberately tiny line-based reader, not a YAML parser. It
    understands only:

    * ``- path: <value>`` lines, which start a new entry; a value ending in
      ``/`` is a bulk directory prefix, anything else is a single document;
    * indented ``key: value`` lines, which are attached to the most recent
      entry.

    Blank lines and ``#`` comment lines are skipped. A missing file yields an
    empty set and an empty dict.
    """
    dir_prefixes = set()
    doc_entries = {}
    if not path.exists():
        return dir_prefixes, doc_entries

    current = None
    # Decode explicitly as UTF-8 so parsing does not depend on the locale of
    # whichever machine runs the guard.
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.rstrip()
        stripped = line.lstrip()
        if not stripped or stripped.startswith("#"):
            continue
        if stripped.startswith("- path:"):
            current = {"path": _unquote(line.split(":", 1)[1])}
            # entries can be a directory (path ends with /) or a single file
            if current["path"].endswith("/"):
                dir_prefixes.add(current["path"])
            else:
                doc_entries[current["path"]] = current
            continue
        if current is not None and line.startswith(" ") and ":" in line:
            key, value = line.split(":", 1)
            current[key.strip()] = _unquote(value)
    return dir_prefixes, doc_entries


# Load allowlist. Same tiny YAML reader the other guards use.
allowlist_paths, per_doc = _load_allowlist(exceptions_path)
# Finds the "> Last reviewed: YYYY-MM-DD" blockquote line on any line of a
# document (MULTILINE anchors ^/$ at line boundaries) and captures the date.
LAST_REVIEWED_RE = re.compile(r"^>\s*Last reviewed:\s*(\d{4}-\d{2}-\d{2})\s*$", re.MULTILINE)

docs_root = repo_root / "docs"
if not docs_root.exists():
    # No docs tree is not a failure: report it and exit successfully.
    sys.stderr.write("docs/ not found — nothing to check\n")
    sys.exit(0)

# Every Markdown file below docs/, paired with its repo-relative POSIX path.
docs = [
    (md_path.relative_to(repo_root).as_posix(), md_path)
    for md_path in docs_root.rglob("*.md")
]
def is_in_allowlisted_dir(rel: str) -> bool:
    """Return True when *rel* starts with any bulk-allowlisted directory prefix."""
    return any(rel.startswith(prefix) for prefix in allowlist_paths)
def per_doc_active(rel: str) -> "tuple[bool, str]":
    """Decide whether *rel* is covered by a valid per-doc allowlist entry.

    Returns ``(active, message)``:

    * ``(False, "")`` when there is no entry for *rel*;
    * ``(False, reason)`` when an entry exists but is rejected because it has
      no ``expires`` value, an unparseable one, one earlier than ``now``, or
      no ``justification``;
    * ``(True, summary)`` when the entry is valid. An entry whose ``expires``
      equals ``now`` is still valid.
    """
    entry = per_doc.get(rel)
    if entry is None:
        return False, ""

    expires = entry.get("expires")
    if not expires:
        return False, "allowlist entry missing 'expires:'"
    try:
        expires_on = datetime.date.fromisoformat(expires)
    except ValueError:
        return False, f"allowlist entry has malformed expires: {expires!r}"
    if expires_on < now:
        return False, f"allowlist entry expired on {expires}"

    justification = entry.get("justification", "")
    if not justification:
        return False, "allowlist entry has no justification"
    return True, f"allowlisted until {expires}: {justification}"
# Classify every collected doc. Rows are (path, reviewed date, age in days).
warn_rows = []
fail_rows = []
missing_field_rows = []
skipped = 0        # docs exempted by a directory or per-doc allowlist entry
total_checked = 0  # docs that carried a parseable review date

for rel, fp in sorted(docs):
    if is_in_allowlisted_dir(rel):
        skipped += 1
        continue

    ok, msg = per_doc_active(rel)
    if ok:
        skipped += 1
        continue
    if msg:
        # An allowlist entry exists for this doc but was rejected (expired,
        # malformed, or unjustified). Say so instead of silently dropping the
        # reason; the doc then goes through the normal age check.
        print(f"::warning::{rel}: {msg}; the doc is checked as if not allowlisted")

    # Decode explicitly as UTF-8 (undecodable bytes are dropped) so results
    # do not depend on the locale of the machine running the guard.
    body = fp.read_text(encoding="utf-8", errors="ignore")
    m = LAST_REVIEWED_RE.search(body)
    if not m:
        missing_field_rows.append(rel)
        continue

    # The regex only checks the digit layout; a date such as 2026-13-01
    # still has to be rejected here.
    try:
        reviewed = datetime.date.fromisoformat(m.group(1))
    except ValueError:
        missing_field_rows.append(rel + f" (unparseable date {m.group(1)!r})")
        continue

    total_checked += 1
    age = (now - reviewed).days
    if age >= fail_days:
        fail_rows.append((rel, reviewed.isoformat(), age))
    elif age >= warn_days:
        warn_rows.append((rel, reviewed.isoformat(), age))
# Summary header, then one section per category of finding.
print(f"doc-rot-detector — now={now.isoformat()} warn≥{warn_days}d fail≥{fail_days}d")
print(f" total docs scanned: {len(docs)}, allowlisted: {skipped}, dated: {total_checked}, missing date field: {len(missing_field_rows)}")
print()

# A missing or unparseable date line fails the build (see the exit below),
# so it is annotated as an error rather than a warning; otherwise the job
# would fail without any error annotation explaining why.
if missing_field_rows:
    print("::error::Docs missing or unparseable '> Last reviewed: YYYY-MM-DD' line:")
    for r in missing_field_rows:
        print(f" - {r}")
    print()
    print(" Add the convention line near the top of each doc, e.g.:")
    print(' > Last reviewed: 2026-MM-DD')

if warn_rows:
    print(f"::warning::Docs older than {warn_days} days (heads-up, non-blocking):")
    for rel, d, age in warn_rows:
        print(f" - {rel}: reviewed {d} ({age}d ago)")
    print()

if fail_rows:
    print(f"::error::Docs older than {fail_days} days (build-blocking):")
    for rel, d, age in fail_rows:
        print(f" - {rel}: reviewed {d} ({age}d ago)")
    print()
    print(" Fix options:")
    print(" 1. Re-read the doc against the repo, fix any drift, bump '> Last reviewed:' to today.")
    print(" 2. If the doc is intentionally frozen, move it under docs/archive/ (allowlisted in bulk).")
    print(f" 3. Add a per-doc allowlist row to {exceptions_path.relative_to(repo_root)} with a justification + expiration.")

# Both over-age docs and docs without a usable date are build-blocking —
# the convention is load-bearing. Warn-level findings alone do not fail.
if fail_rows or missing_field_rows:
    sys.exit(1)

print("OK — every doc under docs/ has a recent '> Last reviewed:' date.")
PY