Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
5aab5eb
fix(ci): extend gke image GC time to 8h
davdhacs Feb 26, 2026
fb97161
increase GKE CI node disk to 100GB
davdhacs Feb 26, 2026
dcafbe4
Revert "fix(ci): extend gke image GC time to 8h"
davdhacs Feb 26, 2026
9fb8f60
fix(ROX-33305): remove digest pinning from TEST_IMAGE
davdhacs Feb 26, 2026
3a8ab48
fix(ROX-33305): pin prefetched images to prevent kubelet GC
davdhacs Feb 27, 2026
a83f8ca
fix(ROX-33305): fix containerd-ctr install in image pinning
davdhacs Feb 27, 2026
4eed237
fix(ROX-33305): use containerd image instead of installing ctr at run…
davdhacs Feb 27, 2026
b403585
fix(ROX-33305): use host's ctr via nsenter for image pinning
davdhacs Feb 27, 2026
e331c2f
fix(ROX-33305): use kubectl debug node for image pinning
davdhacs Feb 28, 2026
80b79f8
fix(ROX-33305): pin all containerd images, not just prefetch list tags
davdhacs Feb 28, 2026
93bf074
fix(ROX-33305): remove -it flag from kubectl debug for CI
davdhacs Feb 28, 2026
172af32
fix(ROX-33305): use kubectl run + nsenter for reliable image pinning
davdhacs Feb 28, 2026
18214d4
empty commit: warm run to verify v6 mtime-fixed cache
davdhacs Mar 1, 2026
843a1b5
fix(ROX-33305): use kubectl apply manifest for pin pods
davdhacs Mar 1, 2026
3092616
fix(ROX-33305): re-pull GC'd images instead of pinning
davdhacs Mar 2, 2026
2d18d1e
fix(ROX-33305): use image-prefetcher branch with GC prevention
davdhacs Mar 3, 2026
c3dadb4
Merge branch 'master' into davdhacs/rox-33305-gc-8h
davdhacs Mar 4, 2026
f86e2bf
restore test_image @sha
davdhacs Mar 4, 2026
8961197
try larger machine and 200gb disk
davdhacs Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/ci/gke.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ create_cluster() {
GCP_IMAGE_TYPE="${GCP_IMAGE_TYPE:-UBUNTU_CONTAINERD}"
POD_SECURITY_POLICIES="${POD_SECURITY_POLICIES:-false}"
GKE_RELEASE_CHANNEL="${GKE_RELEASE_CHANNEL:-stable}"
MACHINE_TYPE="${MACHINE_TYPE:-e2-standard-4}"
DISK_SIZE_GB=${DISK_SIZE_GB:-80}
MACHINE_TYPE="${MACHINE_TYPE:-e2-standard-8}"
DISK_SIZE_GB=${DISK_SIZE_GB:-200}

echo "Creating ${NUM_NODES} node cluster with image type \"${GCP_IMAGE_TYPE}\" and ${DISK_SIZE_GB}GB disks."

Expand Down
111 changes: 110 additions & 1 deletion scripts/ci/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,9 @@ image_prefetcher_start_set() {
local image_prefetcher_deploy_bin
image_prefetcher_deploy_bin="$(make print-image-prefetcher-deploy-bin)"
local image_prefetcher_version
image_prefetcher_version="$(go -C tools/test list -m -f '{{.Version}}' github.com/stackrox/image-prefetcher/deploy)"
# TODO(ROX-33305): Remove override once image-prefetcher with GC prevention is released.
# Using branch build that pins images after CRI pull to prevent kubelet GC.
image_prefetcher_version="${IMAGE_PREFETCHER_VERSION:-branch-rox-33305-prevent-gc}"

info "Using ${image_prefetcher_deploy_bin} ${image_prefetcher_version} for image prefetch deployment"

Expand Down Expand Up @@ -801,6 +803,112 @@ _image_prefetcher_system_await() {
esac
}

_image_prefetcher_repull_gc_victims() {
local ns="$1"
local name="$2"

info "Re-pulling any GC'd prefetched images..."

# Kubelet image GC can evict images during or after the prefetch,
# removing tag references from containerd. Re-pulling via ctr is
# near-instant when layers are cached and re-establishes the tag
# references. io.cri-containerd.pinned doesn't reliably prevent
# GC (containerd#9328, #10270), so re-pull is the only reliable fix.
local nodes
nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')

# Script body in single quotes to prevent any variable expansion by bash.
# This runs inside the container: copies files to host via /proc/1/root,
# then nsenter runs the host's ctr to re-pull missing images.
local script='cp /etc/prefetch-images/images.txt /proc/1/root/tmp/repull-images.txt
cp /tmp/pull-secret/.dockerconfigjson /proc/1/root/tmp/repull-docker-config.json
nsenter -t 1 -m -u -n -p -- sh -c '"'"'
mkdir -p /root/.docker
cp /tmp/repull-docker-config.json /root/.docker/config.json
repulled=0; skipped=0; failed=0
while IFS= read -r img || [ -n "$img" ]; do
case "$img" in "#"*|"") continue ;; esac
if ctr -n k8s.io images check "name==$img" 2>/dev/null | grep -q "$img"; then
skipped=$((skipped+1))
else
if ctr -n k8s.io images pull "$img" >/dev/null 2>&1; then
repulled=$((repulled+1))
else
echo "FAIL: $img"
failed=$((failed+1))
fi
fi
done < /tmp/repull-images.txt
rm -f /tmp/repull-images.txt /tmp/repull-docker-config.json /root/.docker/config.json
echo "repulled=$repulled skipped=$skipped failed=$failed"
'"'"''

# Write script to a temp file for use in the configmap
local script_file
script_file=$(mktemp)
echo "$script" > "$script_file"
kubectl create configmap repull-script -n "$ns" \
--from-file=repull.sh="$script_file" \
--dry-run=client -o yaml | kubectl apply -f -
rm -f "$script_file"

for node in $nodes; do
local pod_name="repull-${node##*-}"
cat <<REPULLEOF | kubectl apply -n "$ns" -f -
apiVersion: v1
kind: Pod
metadata:
name: ${pod_name}
namespace: ${ns}
spec:
nodeName: ${node}
hostPID: true
restartPolicy: Never
imagePullSecrets:
- name: stackrox
containers:
- name: repull
image: quay.io/rhacs-eng/qa-multi-arch-busybox:latest
imagePullPolicy: IfNotPresent
command: ["sh", "/etc/repull-script/repull.sh"]
securityContext:
privileged: true
volumeMounts:
- name: image-list
mountPath: /etc/prefetch-images
readOnly: true
- name: pull-secret
mountPath: /tmp/pull-secret
readOnly: true
- name: repull-script
mountPath: /etc/repull-script
readOnly: true
volumes:
- name: image-list
configMap:
name: ${name}
- name: pull-secret
secret:
secretName: stackrox
- name: repull-script
configMap:
name: repull-script
REPULLEOF
done

for node in $nodes; do
local pod_name="repull-${node##*-}"
if kubectl wait "pod/$pod_name" -n "$ns" --for=jsonpath='{.status.phase}'=Succeeded --timeout=5m 2>/dev/null; then
info " Node ${node}: $(kubectl logs "$pod_name" -n "$ns" 2>/dev/null)"
else
info " Node ${node}: repull pod did not succeed"
kubectl logs "$pod_name" -n "$ns" 2>/dev/null | tail -5 || true
fi
kubectl delete pod "$pod_name" -n "$ns" --ignore-not-found >/dev/null 2>&1 || true
done
info "Image re-pull complete on all nodes."
}

image_prefetcher_await_set() {
local ns="prefetch-images"
local name="$1"
Expand All @@ -809,6 +917,7 @@ image_prefetcher_await_set() {
info "Waiting for image prefetcher set ${name} to complete..."
if kubectl rollout status daemonset "$name" -n "$ns" --timeout 15m; then
info "All images in the set are now pre-fetched."
# Image pinning is now handled by the image-prefetcher itself (branch-rox-33305-prevent-gc).
else
info "WARNING: Pre-fetching failed to complete in time."
info "To investigate closer, go to https://console.cloud.google.com/bigquery and run a query such as:"
Expand Down
Loading