Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions tests/e2e/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,45 @@ setup_automation_flavor_e2e_cluster() {
fi
}

# When working as expected it takes less than one minute for the API server to
# reach ready. Often times out on OSD. If this call fails in CI we need to
# identify the source of pull/scheduling latency, request throttling, etc.
# I tried increasing the timeout from 5m to 20m for OSD but it did not help.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you set the wait timeout to 5 minutes. Do you know why OSD took so long to reinstate? How long does it usually take?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@c-du I literally copied the top half of wait_for_api where it checks to make sure the deployment is running. That includes the copying of the comment. So I don't know.

#######################################
# Block until the central-db deployment in the stackrox namespace reports
# exactly one replica and one ready replica, polling every 5 seconds.
# Globals:   none written (all working variables are function-local)
# Arguments: none
# Outputs:   progress lines on stdout; on timeout, pod and deployment
#            listings on stdout and an error message on stderr
# Returns:   0 once the deployment is ready. Exits the shell with status 1
#            when more than 300 seconds elapse without readiness.
#######################################
wait_for_central_db() {
    info "Waiting for Central DB to start"

    # Keep every working variable local so that sourcing this library and
    # calling the function never clobbers same-named variables in the caller.
    local start_time curr_time elapsed_seconds
    local central_db_json replicas ready_replicas
    local max_seconds=300

    start_time="$(date '+%s')"

    while true; do
        central_db_json="$(kubectl -n stackrox get deploy/central-db -o json)"
        # jq prints "null" when a status field is absent, which simply fails
        # the comparison against 1 below.
        replicas="$(jq '.status.replicas' <<<"$central_db_json")"
        ready_replicas="$(jq '.status.readyReplicas' <<<"$central_db_json")"
        curr_time="$(date '+%s')"
        elapsed_seconds=$(( curr_time - start_time ))

        # Ready case
        if [[ "$replicas" == 1 && "$ready_replicas" == 1 ]]; then
            # Settle delay carried over from wait_for_api, which this
            # function was modelled on; kept for parity with it.
            sleep 30
            break
        fi

        # Timeout case: dump cluster state to help diagnose the failure.
        if (( elapsed_seconds > max_seconds )); then
            kubectl -n stackrox get pod -o wide
            kubectl -n stackrox get deploy -o wide
            echo >&2 "wait_for_central_db() timeout after $max_seconds seconds."
            exit 1
        fi

        # Otherwise report and retry
        echo "waiting ($elapsed_seconds/$max_seconds)"
        sleep 5
    done

    info "Central DB deployment is ready."
}

if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
if [[ "$#" -lt 1 ]]; then
usage
Expand Down
22 changes: 15 additions & 7 deletions tests/upgrade/postgres_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,9 @@ test_upgrade_paths() {
verifyNoPostgresAccessScopes

# Now go back up to Postgres
CURRENT_TAG="$(make --quiet tag)"
kubectl -n stackrox set env deploy/central ROX_POSTGRES_DATASTORE=true
kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$(make --quiet tag)"
kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$CURRENT_TAG"
wait_for_api
wait_for_scanner_to_be_ready

Expand Down Expand Up @@ -182,12 +183,19 @@ test_upgrade_paths() {
wait_for_api
kubectl -n stackrox delete po "$(kubectl -n stackrox get po -l app=central-db -o=jsonpath='{.items[0].metadata.name}')" --grace-period=0
wait_for_api
wait_for_central_db

checkForRocksAccessScopes
checkForPostgresAccessScopes

validate_upgrade "01-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7"
collect_and_check_stackrox_logs "$log_output_dir" "01_post_bounce-db"
validate_upgrade "02-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7"

# Since we bounced the DB we may see some errors. Those need to be allowed in the case of this test ONLY.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double spaces

echo "# postgres was bounced, may see some connection errors" >> scripts/ci/logcheck/allowlist-patterns
echo "FATAL: terminating connection due to administrator command \(SQLSTATE 57P01\)" >> scripts/ci/logcheck/allowlist-patterns
echo >> scripts/ci/logcheck/allowlist-patterns

collect_and_check_stackrox_logs "$log_output_dir" "02_post_bounce-db"

info "Fetching a sensor bundle for cluster 'remote'"
rm -rf sensor-remote
Expand All @@ -196,10 +204,10 @@ test_upgrade_paths() {

info "Installing sensor"
./sensor-remote/sensor.sh
kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$(make --quiet tag)"
kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$(make --quiet tag)"
kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$(cat COLLECTOR_VERSION)" \
"compliance=$REGISTRY/main:$(make --quiet tag)"
kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$CURRENT_TAG"
kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$CURRENT_TAG"
kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$CURRENT_TAG" \
"compliance=$REGISTRY/main:$CURRENT_TAG"

sensor_wait

Expand Down