From c8995c4fb36dddd63ad2b3f8ec534125d35eb037 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Wed, 10 Jun 2026 18:32:36 -0700
Subject: [PATCH] ci: retry NVHPC image pull on transient nvcr.io timeouts

The bare 'docker pull' in the NVHPC jobs fails the whole job when
nvcr.io returns 'context deadline exceeded', which happens sporadically
with ~30 matrix jobs pulling concurrently. Retry up to 5 times with
linear backoff between attempts (no wait after the final attempt);
pulls resume completed layers so retries are cheap.
---
 .github/workflows/test.yml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5fe8cda475..c8e4cc61fa 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -156,9 +156,21 @@ jobs:
       # Replaces the container: directive so we can free disk space first.
       # Uses "docker run -d ... sleep infinity" + "docker exec" to preserve
       # installed packages and env vars across steps.
+      # Retry the pull: nvcr.io intermittently times out ("context deadline
+      # exceeded") under load, and ~30 matrix jobs hit it at once. Pulls
+      # resume completed layers, so retries are cheap.
       - name: Pull NVHPC container
         if: matrix.nvhpc
-        run: docker pull "$NVHPC_IMAGE"
+        run: |
+          for attempt in 1 2 3 4 5; do
+            docker pull "$NVHPC_IMAGE" && exit 0
+            # Nothing left to retry after the last attempt: skip the backoff.
+            [ "$attempt" -lt 5 ] || break
+            echo "docker pull failed (attempt $attempt/5); retrying in $((attempt * 30))s..."
+            sleep $((attempt * 30))
+          done
+          echo "::error::Failed to pull $NVHPC_IMAGE after 5 attempts"
+          exit 1
 
       - name: Start NVHPC container
         if: matrix.nvhpc