Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pkg/testutils/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func (r retryT) Logf(format string, args ...interface{}) {
r.t.Logf(format, args...)
}

func runRetry(testFn func(t T)) (success bool) {
func runRetry(t T, testFn func(t T)) (success bool) {
defer func() {
if success {
return
Expand All @@ -53,7 +53,7 @@ func runRetry(testFn func(t T)) (success bool) {
}
}()

testFn(retryT{})
testFn(retryT{t: t})
success = true

return
Expand All @@ -63,7 +63,7 @@ func runRetry(testFn func(t T)) (success bool) {
func Retry(t T, times int, sleepInterval time.Duration, testFn func(t T)) {
for i := 0; i < times-1; i++ {
log.Infof("Test attempt: %d", i)
if runRetry(testFn) {
if runRetry(t, testFn) {
return
}
time.Sleep(sleepInterval)
Expand Down
71 changes: 67 additions & 4 deletions tests/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,15 +277,78 @@ func setImage(t *testing.T, deploymentName string, deploymentID string, containe
}, "image updated", time.Minute, 5*time.Second)
}

func createPod(t testutils.T, client kubernetes.Interface, pod *coreV1.Pod) {
// ensurePodExists creates a pod in Kubernetes. If the pod already exists, this is a no-op.
// This makes the function idempotent and safe to retry.
func ensurePodExists(t testutils.T, client kubernetes.Interface, pod *coreV1.Pod) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

log.Infof("Creating pod %s %s", pod.GetNamespace(), pod.GetName())
t.Logf("Ensuring pod %s %s exists", pod.GetNamespace(), pod.GetName())
_, err := client.CoreV1().Pods(pod.GetNamespace()).Create(ctx, pod, metaV1.CreateOptions{})
require.NoError(t, err)
if err != nil && !apiErrors.IsAlreadyExists(err) {
require.NoError(t, err)
}
if apiErrors.IsAlreadyExists(err) {
t.Logf("Pod %s already exists, continuing", pod.GetName())
}
}

// waitForPodRunning polls Kubernetes until the pod podNamespace/podName is in
// the Running phase and every entry in its ContainerStatuses reports ready,
// then returns the pod as last fetched.
//
// Polling goes through testutils.Retry with 60 attempts and a 3-second sleep
// interval; all Get calls share a single context with a 3-minute deadline.
// The budget is deliberately generous to absorb slow CI environments (image
// pull, scheduling, etc.).
//
// Every attempt logs the pod phase plus the state of each non-ready container,
// and a failing attempt reports per-container detail, because conditions such
// as ImagePullBackOff or ErrImagePull appear in container status rather than
// in pod status.
func waitForPodRunning(t testutils.T, client kubernetes.Interface, podNamespace, podName string) *coreV1.Pod {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	defer cancel()

	var k8sPod *coreV1.Pod
	testutils.Retry(t, 60, 3*time.Second, func(waitT testutils.T) {
		var err error
		k8sPod, err = client.CoreV1().Pods(podNamespace).Get(ctx, podName, metaV1.GetOptions{})
		require.NoError(waitT, err, "failed to get pod %s", podName)

		// logMsg is the one-line progress log; containerInfo is the detailed
		// listing used only if the pod turns out not to be Running.
		var logMsg, containerInfo strings.Builder
		fmt.Fprintf(&logMsg, "Pod phase: %s, Reason: %q, Message: %q",
			k8sPod.Status.Phase, k8sPod.Status.Reason, k8sPod.Status.Message)
		for _, status := range k8sPod.Status.ContainerStatuses {
			if !status.Ready {
				switch {
				case status.State.Waiting != nil:
					fmt.Fprintf(&logMsg, ", Container %q: %q", status.Name, status.State.Waiting.Reason)
				case status.State.Terminated != nil:
					fmt.Fprintf(&logMsg, ", Container %q: Terminated (%q)", status.Name, status.State.Terminated.Reason)
				}
			}

			fmt.Fprintf(&containerInfo, "\n - %s: ready=%v, started=%v",
				status.Name, status.Ready, status.Started != nil && *status.Started)
			if status.State.Waiting != nil {
				fmt.Fprintf(&containerInfo, ", waiting: %s - %s",
					status.State.Waiting.Reason, status.State.Waiting.Message)
			}
		}
		// Pass the text as an argument, never as the format string: reasons
		// and messages come from the cluster and may contain '%'.
		waitT.Logf("%s", logMsg.String())

		if k8sPod.Status.Phase != coreV1.PodRunning {
			require.Failf(waitT, "pod not in Running phase",
				"Pod %s is in %s phase (expected Running)\nContainers:%s\nPod Reason: %s\nPod Message: %s",
				podName, k8sPod.Status.Phase, containerInfo.String(),
				k8sPod.Status.Reason, k8sPod.Status.Message)
		}

		// A Running pod can still have containers that are not ready yet.
		for _, status := range k8sPod.Status.ContainerStatuses {
			require.True(waitT, status.Ready, "container %s not ready (state: %+v)",
				status.Name, status.State)
		}
	})

	t.Logf("Pod %s is running with all containers ready in Kubernetes", k8sPod.Name)
	return k8sPod
}

func teardownPod(t testutils.T, client kubernetes.Interface, pod *coreV1.Pod) {
Expand Down
98 changes: 52 additions & 46 deletions tests/container_instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,52 +21,58 @@ type ContainerNameGroup struct {
}

func TestContainerInstances(testT *testing.T) {
testT.Skip("Flaky: https://issues.redhat.com/browse/ROX-30400")
// https://stack-rox.atlassian.net/browse/ROX-6493
// - the process events expected in this test are not reliably detected.
kPod := getPodFromFile(testT, "yamls/multi-container-pod.yaml")
client := createK8sClient(testT)
testutils.Retry(testT, 3, 5*time.Second, func(retryT testutils.T) {
// Set up testing environment
defer teardownPod(testT, client, kPod)
createPod(testT, client, kPod)

// Get the test pod.
deploymentID := getDeploymentID(retryT, kPod.GetName())
pods := getPods(retryT, deploymentID)
require.Len(retryT, pods, 1)
pod := pods[0]

// Retry to ensure all processes start up.
testutils.Retry(retryT, 20, 4*time.Second, func(retryEventsT testutils.T) {
// Get the container groups.
groupedContainers := getGroupedContainerInstances(retryEventsT, string(pod.ID))

// Verify the number of containers.
require.Len(retryEventsT, groupedContainers, 2)
// Verify default sort is by name.
names := sliceutils.Map(groupedContainers, func(g ContainerNameGroup) string { return g.Name })
require.Equal(retryEventsT, names, []string{"1st", "2nd"})
// Verify the events.
// Expecting 1 process: nginx
require.Len(retryEventsT, groupedContainers[0].Events, 1)
firstContainerEvents :=
sliceutils.Map(groupedContainers[0].Events, func(event Event) string { return event.Name })
require.ElementsMatch(retryEventsT, firstContainerEvents, []string{"/usr/sbin/nginx"})
// Expecting 3 processes: sh, date, sleep
require.Len(retryEventsT, groupedContainers[1].Events, 3)
secondContainerEvents :=
sliceutils.Map(groupedContainers[1].Events, func(event Event) string { return event.Name })
require.ElementsMatch(retryEventsT, secondContainerEvents, []string{"/bin/sh", "/bin/date", "/bin/sleep"})

// Verify the container group's timestamp is no later than the timestamp of the first event
require.False(retryEventsT, groupedContainers[0].StartTime.After(groupedContainers[0].Events[0].Timestamp.Time))
require.False(retryEventsT, groupedContainers[1].StartTime.After(groupedContainers[1].Events[0].Timestamp.Time))

// Number of events expected should be the aggregate of the above

verifyRiskEventTimelineCSV(retryEventsT, deploymentID, append(firstContainerEvents, secondContainerEvents...))
})
// TODO(ROX-31331): Collector cannot reliably detect all processes in this test's images.
skipIfNoCollection(testT)

_, deploymentID, pod, cleanup := setupMultiContainerPodTest(testT)
defer cleanup()

// Retry to ensure all processes start up and are detected
testutils.Retry(testT, 20, 4*time.Second, func(retryEventsT testutils.T) {
// Get the container groups.
groupedContainers := getGroupedContainerInstances(retryEventsT, string(pod.ID))

// Verify the number of containers.
require.Len(retryEventsT, groupedContainers, 2)
// Verify default sort is by name.
names := sliceutils.Map(groupedContainers, func(g ContainerNameGroup) string { return g.Name })
require.Equal(retryEventsT, names, []string{"1st", "2nd"})

// Use "at least" semantics: verify required processes exist, but allow extras.
// Rationale: Modern container images (especially nginx) run extensive initialization:
// - docker-entrypoint.sh and scripts in /docker-entrypoint.d/ (10-listen-on-ipv6, 20-envsubst, 30-tune-workers)
// - Short-lived utilities: /usr/bin/find, /bin/grep, /usr/bin/cut, /bin/sed, /usr/bin/basename, etc.
// - nginx worker processes (duplicate /usr/sbin/nginx)
// A typical nginx container may capture 20+ processes during startup. This approach focuses on
// verifying the main application processes exist without being brittle to image implementation details.

firstContainerEvents :=
sliceutils.Map(groupedContainers[0].Events, func(event Event) string { return event.Name })
retryEventsT.Logf("First container (%s) events: %+v", groupedContainers[0].Name, firstContainerEvents)

// First container: nginx (may see workers and ~20 docker-entrypoint processes)
requiredFirstContainer := []string{"/usr/sbin/nginx"}
require.Subsetf(retryEventsT, firstContainerEvents, requiredFirstContainer,
"First container: required processes: %v not found in events: %v", requiredFirstContainer, firstContainerEvents)

secondContainerEvents :=
sliceutils.Map(groupedContainers[1].Events, func(event Event) string { return event.Name })
retryEventsT.Logf("Second container (%s) events: %+v", groupedContainers[1].Name, secondContainerEvents)

// Second container: ubuntu running a loop with date and sleep
// TODO(ROX-31331): Collector cannot reliably detect /bin/sh /bin/date or /bin/sleep in ubuntu image,
// thus not including it in the required processes.
// If this flakes again, see ROX-31331 and follow-up on the discussion in the ticket.
requiredSecondContainer := []string{"/bin/sh"}
require.Subsetf(retryEventsT, secondContainerEvents, requiredSecondContainer,
"Second container: required processes: %v not found in events: %v", requiredSecondContainer, secondContainerEvents)

// Verify container start times are not after their earliest events
verifyStartTimeBeforeEvents(retryEventsT, groupedContainers[0].StartTime, groupedContainers[0].Events, "Container 0")
verifyStartTimeBeforeEvents(retryEventsT, groupedContainers[1].StartTime, groupedContainers[1].Events, "Container 1")

// Verify risk event timeline CSV
verifyRiskEventTimelineCSV(retryEventsT, deploymentID, append(firstContainerEvents, secondContainerEvents...))
})
}

Expand Down
4 changes: 4 additions & 0 deletions tests/images-to-prefetch.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,9 @@
# If renaming/moving this file, update the reference in scripts/ci/lib.sh

# Used by tests/yamls/multi-container-pod.yaml (TestPod, TestContainerInstances, roxctl verification)
quay.io/rhacs-eng/qa-multi-arch-nginx:latest
quay.io/rhacs-eng/qa-multi-arch:nginx-1.21.1
quay.io/rhacs-eng/qa-multi-arch:ubuntu-latest@sha256:64483f3496c1373bfd55348e88694d1c4d0c9b660dee6bfef5e12f43b9933b30

# Used by tests/tls_challenge_test.go
quay.io/rhacs-eng/qa-multi-arch:nginx-1-17-1
Loading
Loading