mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 22:21:30 +00:00
1383fe419b
The Phase 2 commit's CI run (2026-05-13T19:50 against 69a2b5c) failed
on digest-validity.sh with HTTP 429 from ghcr.io while resolving the
lscr.io/linuxserver/openssh-server digest. ghcr.io rate-limits
unauthenticated manifest HEAD requests aggressively; the existing
guard had no retry, so a single 429 failed the whole CI gate.
Fix: retry on 429 / 502 / 503 / 504 with exponential backoff (2s,
4s, 8s; max 3 retries per ref). Non-retryable errors (400, 401, 403,
404, 5xx that aren't gateway-class) still fail fast — we only retry
on the transient-rate-limit + gateway-blip class. Each retry logs
the attempt count so a future operator investigating an outage can
see how many attempts happened before the final verdict.
The local re-run after the fix shows all 15 verifiable digests
resolve cleanly (no retries were needed on this particular run — the
429 was transient, as expected).
Not a Phase-1/2/3 regression; this is a pre-existing fragility in a
guard that's been in place since ci-pipeline-cleanup Phase 7. The
fix lands as a small follow-on to Phase 3 because the prompt's
recommended ratchet is 'CI guards should be reliable enough to gate
the build, or they should be advisory.'
186 lines
6.9 KiB
Bash
Executable File
186 lines
6.9 KiB
Bash
Executable File
#!/usr/bin/env bash
# scripts/ci-guards/digest-validity.sh
#
# CI guard: confirm that every pinned image reference of the form
# <image>:<tag>@sha256:<digest> found under deploy/ or in a Dockerfile
# really exists on its registry. H-001 only asserts that a digest is
# present; this guard additionally rejects digests that are fabricated or
# have gone stale.
#
# Introduced by ci-pipeline-cleanup bundle Phase 7. Motivating incident:
# Bundle II shipped 11 fabricated digests which satisfied H-001's
# regex-only check and then failed `docker pull` in CI.
#
# Registries handled:
#   - Docker Hub, library/* and non-library images (auth.docker.io)
#   - ghcr.io (reached through the lscr.io alias for linuxserver/*)
#   - mcr.microsoft.com (public images need no auth; the Windows IIS
#     image needs the manifest.v2 single-image digest, not the
#     multi-arch list digest)

set -e

# Gather the de-duplicated list of pinned references. grep walks deploy/
# recursively and also reads any top-level Dockerfile* and
# deploy/test/*/Dockerfile; -h drops file names and -o keeps only the
# matched reference. grep's stderr is discarded so that a path or glob
# which does not exist is not reported.
readarray -t REFS < <(
  grep -rEho '[a-z0-9./-]+:[a-z0-9.-]+@sha256:[a-f0-9]{64}' \
    deploy/ Dockerfile* deploy/test/*/Dockerfile 2>/dev/null |
    sort -u
)

# With no pinned references there is nothing to check; succeed right away.
if (( ${#REFS[@]} == 0 )); then
  echo "No @sha256 refs found — nothing to verify."
  exit 0
fi

# ---------------------------------------------------------------------------
# Exclusion list: digests of images that CI never pulls.
# ---------------------------------------------------------------------------
# This guard exists to prove that every digest CI actually depends on is
# valid. An image that appears in compose purely to document an operator's
# manual workflow (for example a Windows container that cannot start on a
# Linux runner) should not make CI brittle against external-registry
# rate-limiting that is outside our control.
#
# Every entry is a substring that is compared against the complete ref
# line (`<image>:<tag>@sha256:<digest>`). A matching ref is logged as
# `SKIP (excluded)` and the verification loop moves on. Matching is done on
# the image path rather than the digest, so bumping the tag or digest later
# keeps the same image excluded without touching this list.
#
# Each entry must carry a documented reason in the comment directly above
# it. Do NOT use this list to hide transient flakes; those are handled by
# the retries inside this script, never by exclusion.
EXCLUDED_PATTERNS=()

# mcr.microsoft.com/windows/servercore/iis
#   A Windows-only image gated behind compose profiles=[deploy-e2e-windows]
#   (deploy/docker-compose.test.yml:700). Linux CI runners cannot start the
#   windows-iis-test sidecar: the whole Windows matrix was deleted per
#   ci-pipeline-cleanup Phase 6 / frozen decision 0.5, and IIS validation
#   moved to docs/connector-iis.md::Operator validation playbook. For the
#   same reason all 10 TestVendorEdge_IIS_*_E2E tests are listed in
#   scripts/vendor-e2e-skip-allowlist.txt.
#
#   If this entry were removed, Linux CI runners would request this digest
#   from MCR on every push. MCR rate-limits unauthenticated requests by
#   source IP, and GitHub-hosted runner IPs are heavily reused across
#   users, which yields roughly one transient 4xx/5xx every N runs (CI run
#   #376 hit it). Re-runs pass because runner IPs rotate. Nothing is wrong
#   with the image itself; Linux CI simply has no need to verify it.
EXCLUDED_PATTERNS+=("mcr.microsoft.com/windows/servercore/iis")

# ---------------------------------------------------------------------------
# Helpers for the verification loop.
# ---------------------------------------------------------------------------

# Maximum number of retries (after the first attempt) for one ref when the
# registry answers with a transient status. Backoff is 2s, 4s, 8s.
readonly MAX_RETRIES=3

#######################################
# Translate an image path (tag and digest already removed) into the
# registry API host, the repository name on that host, and the anonymous
# token endpoint.
# Globals:   registry, img, auth_url (written; auth_url is empty when the
#            registry is queried without a token)
# Arguments: $1 - image path, e.g. "httpd", "envoyproxy/envoy",
#                 "lscr.io/linuxserver/openssh-server"
#######################################
resolve_registry() {
  local image=$1
  case "$image" in
    lscr.io/* | ghcr.io/*)
      # lscr.io is handled as an alias of ghcr.io; an explicit ghcr.io/
      # prefix takes the same route instead of being misread as a
      # Docker Hub repository name.
      img="${image#*/}"
      registry="ghcr.io"
      auth_url="https://ghcr.io/token?scope=repository:${img}:pull"
      ;;
    mcr.microsoft.com/*)
      img="${image#mcr.microsoft.com/}"
      registry="mcr.microsoft.com"
      auth_url=""
      ;;
    *)
      # Docker Hub. Accept an explicit docker.io/ prefix; single-segment
      # names (e.g. httpd, golang) live in the library/ namespace.
      img="${image#docker.io/}"
      if [[ "$img" != */* ]]; then
        img="library/$img"
      fi
      registry="registry-1.docker.io"
      auth_url="https://auth.docker.io/token?service=registry.docker.io&scope=repository:${img}:pull"
      ;;
  esac
}

#######################################
# Succeeds only for the transient statuses that are worth retrying:
# 429 (rate limit) and 502/503/504 (gateway errors).
# Arguments: $1 - HTTP status code
#######################################
is_retryable_status() {
  [[ "$1" =~ ^(429|502|503|504)$ ]]
}

#######################################
# Fetch an anonymous pull token from a token endpoint.
# Arguments: $1 - token URL
# Outputs:   the token on stdout
# Returns:   non-zero when the response cannot be parsed or has no
#            "token" field (python's stderr is suppressed; curl's is not)
#######################################
fetch_token() {
  local url=$1
  curl -sS --connect-timeout 10 --max-time 30 "$url" \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])" 2>/dev/null
}

#######################################
# Issue one HEAD request for a manifest addressed by digest.
# Arguments: $1 - registry host
#            $2 - repository name
#            $3 - digest (sha256:...)
#            $4 - full Authorization header line, or empty for none
# Outputs:   the HTTP status code on stdout ("000" when curl got no
#            response at all)
# Returns:   always 0, so a transport error is reported through the status
#            code instead of killing the script under `set -e`
#######################################
probe_manifest() {
  local reg=$1 repo=$2 dgst=$3 auth=${4:-}
  local -a curl_args=(
    -sS --head --connect-timeout 10 --max-time 30
    -o /dev/null -w '%{http_code}'
  )
  if [[ -n "$auth" ]]; then
    curl_args+=(-H "$auth")
  fi
  # Accept both index/list and single-image manifests, OCI and Docker.
  curl_args+=(
    -H "Accept: application/vnd.oci.image.index.v1+json"
    -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json"
    -H "Accept: application/vnd.oci.image.manifest.v1+json"
    -H "Accept: application/vnd.docker.distribution.manifest.v2+json"
  )
  curl "${curl_args[@]}" "https://${reg}/v2/${repo}/manifests/${dgst}" || true
}

#######################################
# Probe a manifest, retrying transient registry errors with exponential
# backoff (2s, 4s, 8s; at most MAX_RETRIES retries). Any other status,
# success included, ends the loop immediately. Each retry is logged with
# its ordinal so the number of attempts is visible in the CI log.
#
# Background: ghcr.io rate-limits manifest requests against the
# linuxserver/* namespace aggressively; before 2026-05-13 a single 429
# (lscr.io/linuxserver/openssh-server -> ghcr.io) failed the whole gate.
#
# Globals:   code    (written) - final HTTP status
#            retries (written) - number of retries actually performed
#            MAX_RETRIES (read)
# Arguments: $1 registry, $2 repository, $3 digest,
#            $4 Authorization header line (may be empty),
#            $5 label used in log lines (the full ref)
#######################################
probe_with_retry() {
  local reg=$1 repo=$2 dgst=$3 auth=$4 label=$5
  local sleep_secs
  retries=0
  while :; do
    code=$(probe_manifest "$reg" "$repo" "$dgst" "$auth")
    if ! is_retryable_status "$code" || (( retries >= MAX_RETRIES )); then
      return 0
    fi
    retries=$((retries + 1))
    sleep_secs=$((2 ** retries))
    echo " retry $retries/$MAX_RETRIES after HTTP $code on $label (sleep ${sleep_secs}s)"
    sleep "$sleep_secs"
  done
}

# ---------------------------------------------------------------------------
# Verification loop.
# ---------------------------------------------------------------------------
fail=0
verified=0
skipped=0
for ref in "${REFS[@]}"; do
  # Apply the exclusion list before doing any work on the ref.
  excluded=0
  for pat in "${EXCLUDED_PATTERNS[@]}"; do
    if [[ "$ref" == *"$pat"* ]]; then
      echo "SKIP (excluded) $ref"
      excluded=1
      skipped=$((skipped + 1))
      break
    fi
  done
  if [ "$excluded" -eq 1 ]; then
    continue
  fi

  # Split <image>:<tag>@<digest>; the tag is not needed for a by-digest
  # lookup, so it is simply dropped.
  digest="${ref##*@}"
  imgtag="${ref%@*}"
  resolve_registry "${imgtag%:*}"

  # Get an auth token if the registry needs one. The `|| tok=""` keeps a
  # failed fetch from aborting the script under `set -e`, so the failure
  # is reported and the remaining refs are still checked.
  auth_header=""
  if [ -n "$auth_url" ]; then
    tok=$(fetch_token "$auth_url") || tok=""
    if [ -z "$tok" ]; then
      echo "::error::Failed to get auth token for $registry / $img"
      fail=1
      continue
    fi
    auth_header="Authorization: Bearer $tok"
  fi

  probe_with_retry "$registry" "$img" "$digest" "$auth_header" "$ref"

  if [ "$code" != "200" ]; then
    echo "::error::digest does not resolve: ${ref}"
    echo " registry: $registry"
    echo " image: $img"
    echo " digest: $digest"
    echo " HTTP: $code (retried $retries time(s) for 429/502/503/504)"
    fail=1
  else
    echo "OK $ref"
    verified=$((verified + 1))
  fi
done

# Gate: any failure recorded by the verification loop fails the run.
if ! [ "$fail" -eq 0 ]; then
  exit 1
fi

# Everything resolved. Print a one-line summary, mentioning excluded refs
# only when at least one was skipped.
printf '\n'
if ! [ "$skipped" -gt 0 ]; then
  printf '%s\n' "digest-validity: clean — all ${verified} digest references resolve."
else
  printf '%s\n' "digest-validity: clean — ${verified} verified, ${skipped} excluded (CI never pulls)."
fi