From 1383fe419b8c7ee317144809c78e4830bc3cddbb Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Wed, 13 May 2026 20:17:08 +0000 Subject: [PATCH] ci: add exponential-backoff retry to digest-validity guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 2 commit's CI run (2026-05-13T19:50 against 69a2b5c) failed on digest-validity.sh with HTTP 429 from ghcr.io while resolving the lscr.io/linuxserver/openssh-server digest. ghcr.io rate-limits unauthenticated manifest HEAD requests aggressively; the existing guard had no retry, so a single 429 failed the whole CI gate. Fix: retry on 429 / 502 / 503 / 504 with exponential backoff (up to 3 attempts per ref, i.e. at most 2 retries, sleeping 2s and then 4s between attempts). Non-retryable errors (400, 401, 403, 404, 5xx that aren't gateway-class) still fail fast — we only retry on the transient-rate-limit + gateway-blip class. Each retry logs the attempt count so a future operator investigating an outage can see how many attempts happened before the final verdict. The local re-run after the fix shows all 15 verifiable digests resolve cleanly (no retries were needed on this particular run — the 429 was transient, as expected). Not a Phase-1/2/3 regression; this is a pre-existing fragility in a guard that's been in place since ci-pipeline-cleanup Phase 7. The fix lands as a small follow-on to Phase 3 because the prompt's recommended ratchet is 'CI guards should be reliable enough to gate the build, or they should be advisory.' --- scripts/ci-guards/digest-validity.sh | 55 ++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/scripts/ci-guards/digest-validity.sh b/scripts/ci-guards/digest-validity.sh index b83fab2..d1c0928 100755 --- a/scripts/ci-guards/digest-validity.sh +++ b/scripts/ci-guards/digest-validity.sh @@ -122,30 +122,53 @@ for ref in "${REFS[@]}"; do auth_header="Authorization: Bearer $tok" fi - # HEAD the manifest by digest. 
- if [ -n "$auth_header" ]; then + # HEAD the manifest by digest, with exponential-backoff retry on + # transient registry errors (HTTP 429 rate-limit, 502/503/504 gateway + # blips). ghcr.io aggressively rate-limits unauthenticated HEAD + # requests against the linuxserver/* namespace; pre-2026-05-13 this + # caused intermittent CI failures on the Phase 2 commit's run + # (workflow log lscr.io/linuxserver/openssh-server → ghcr.io 429). + # Backoff schedule: 2s → 4s; at most 3 attempts (2 retries) per ref. + build_curl_args() { + if [ -n "$auth_header" ]; then + echo "-H|$auth_header" + fi + echo "-H|Accept: application/vnd.oci.image.index.v1+json" + echo "-H|Accept: application/vnd.docker.distribution.manifest.list.v2+json" + echo "-H|Accept: application/vnd.oci.image.manifest.v1+json" + echo "-H|Accept: application/vnd.docker.distribution.manifest.v2+json" + } + attempt=0 + max_attempts=3 + code="000" + while [ "$attempt" -lt "$max_attempts" ]; do + mapfile -t cargs < <(build_curl_args) + cargs_expanded=() + for arg in "${cargs[@]}"; do + IFS='|' read -r flag val <<< "$arg" + cargs_expanded+=("$flag" "$val") + done code=$(curl -sS -o /dev/null -w "%{http_code}" \ - -H "$auth_header" \ - -H "Accept: application/vnd.oci.image.index.v1+json" \ - -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ - -H "Accept: application/vnd.oci.image.manifest.v1+json" \ - -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + "${cargs_expanded[@]}" \ "https://${registry}/v2/${img}/manifests/${digest}") - else - code=$(curl -sS -o /dev/null -w "%{http_code}" \ - -H "Accept: application/vnd.oci.image.index.v1+json" \ - -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ - -H "Accept: application/vnd.oci.image.manifest.v1+json" \ - -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ - "https://${registry}/v2/${img}/manifests/${digest}") - fi + # Success or non-retryable failure → stop. + if [ "$code" = "200" ] || ! 
[[ "$code" =~ ^(429|502|503|504)$ ]]; then + break + fi + attempt=$((attempt + 1)) + if [ "$attempt" -lt "$max_attempts" ]; then + sleep_secs=$((2 ** attempt)) + echo " retry $attempt/$max_attempts after HTTP $code on $ref (sleep ${sleep_secs}s)" + sleep "$sleep_secs" + fi + done if [ "$code" != "200" ]; then echo "::error::digest does not resolve: ${ref}" echo " registry: $registry" echo " image: $img" echo " digest: $digest" - echo " HTTP: $code" + echo " HTTP: $code (retried $attempt time(s) for 429/502/503/504)" fail=1 else echo "OK $ref"