-
Notifications
You must be signed in to change notification settings - Fork 174
Dashrews/ROX-13253 wait for central-db to come back after bounce and allow FATAL connection lost error #3537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
57551ae
55fcc7e
12f8196
5474100
bf86c56
72dfd79
5b0441a
f0ca636
19b7a17
2b59fbd
3a809c7
fa4ffe3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -439,6 +439,45 @@ setup_automation_flavor_e2e_cluster() { | |
| fi | ||
| } | ||
|
|
||
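| # Wait for the central-db deployment to report one ready replica. | ||
| # NOTE: the remarks below were copied from wait_for_api and describe the API | ||
| # server; whether they also hold for Central DB has not been confirmed. | ||
| # When working as expected it takes less than one minute for the API server to | ||
| # reach ready. This often times out on OSD. If this call fails in CI we need to | ||
| # identify the source of pull/scheduling latency, request throttling, etc. | ||
| # I tried increasing the timeout from 5m to 20m for OSD but it did not help. | ||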
| wait_for_central_db() { | ||
| info "Waiting for Central DB to start" | ||
|
|
||
| start_time="$(date '+%s')" | ||
| max_seconds=300 | ||
|
|
||
| while true; do | ||
| central_db_json="$(kubectl -n stackrox get deploy/central-db -o json)" | ||
| replicas="$(jq '.status.replicas' <<<"$central_db_json")" | ||
| ready_replicas="$(jq '.status.readyReplicas' <<<"$central_db_json")" | ||
| curr_time="$(date '+%s')" | ||
| elapsed_seconds=$(( curr_time - start_time )) | ||
|
|
||
| # Ready case | ||
| if [[ "$replicas" == 1 && "$ready_replicas" == 1 ]]; then | ||
| sleep 30 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need this if replicas are ready?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @janisz I mirrored (actually copied) this from the existing wait_for_api, so I assumed it was there for a reason. I could take it out if everyone likes, but I don't think it does any harm. |
||
| break | ||
| fi | ||
|
|
||
| # Timeout case | ||
| if (( elapsed_seconds > max_seconds )); then | ||
| kubectl -n stackrox get pod -o wide | ||
| kubectl -n stackrox get deploy -o wide | ||
| echo >&2 "wait_for_central_db() timeout after $max_seconds seconds." | ||
| exit 1 | ||
| fi | ||
|
|
||
| # Otherwise report and retry | ||
| echo "waiting ($elapsed_seconds/$max_seconds)" | ||
| sleep 5 | ||
| done | ||
|
|
||
| info "Central DB deployment is ready." | ||
| } | ||
|
|
||
| if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then | ||
| if [[ "$#" -lt 1 ]]; then | ||
| usage | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -153,8 +153,9 @@ test_upgrade_paths() { | |
| verifyNoPostgresAccessScopes | ||
|
|
||
| # Now go back up to Postgres | ||
| CURRENT_TAG="$(make --quiet tag)" | ||
| kubectl -n stackrox set env deploy/central ROX_POSTGRES_DATASTORE=true | ||
| kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$(make --quiet tag)" | ||
| kubectl -n stackrox set image deploy/central "central=$REGISTRY/main:$CURRENT_TAG" | ||
| wait_for_api | ||
| wait_for_scanner_to_be_ready | ||
|
|
||
|
|
@@ -182,12 +183,19 @@ test_upgrade_paths() { | |
| wait_for_api | ||
| kubectl -n stackrox delete po "$(kubectl -n stackrox get po -l app=central-db -o=jsonpath='{.items[0].metadata.name}')" --grace-period=0 | ||
| wait_for_api | ||
| wait_for_central_db | ||
|
|
||
| checkForRocksAccessScopes | ||
| checkForPostgresAccessScopes | ||
|
|
||
| validate_upgrade "01-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7" | ||
| collect_and_check_stackrox_logs "$log_output_dir" "01_post_bounce-db" | ||
| validate_upgrade "02-bounce-db-after-upgrade" "bounce central db after postgres upgrade" "268c98c6-e983-4f4e-95d2-9793cebddfd7" | ||
|
|
||
| # Since we bounced the DB we may see some errors. Those need to be allowed in the case of this test ONLY. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Double spaces |
||
| echo "# postgres was bounced, may see some connection errors" >> scripts/ci/logcheck/allowlist-patterns | ||
| echo "FATAL: terminating connection due to administrator command \(SQLSTATE 57P01\)" >> scripts/ci/logcheck/allowlist-patterns | ||
| echo >> scripts/ci/logcheck/allowlist-patterns | ||
|
|
||
| collect_and_check_stackrox_logs "$log_output_dir" "02_post_bounce-db" | ||
|
|
||
| info "Fetching a sensor bundle for cluster 'remote'" | ||
| rm -rf sensor-remote | ||
|
|
@@ -196,10 +204,10 @@ test_upgrade_paths() { | |
|
|
||
| info "Installing sensor" | ||
| ./sensor-remote/sensor.sh | ||
| kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$(make --quiet tag)" | ||
| kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$(make --quiet tag)" | ||
| kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$(cat COLLECTOR_VERSION)" \ | ||
| "compliance=$REGISTRY/main:$(make --quiet tag)" | ||
| kubectl -n stackrox set image deploy/sensor "*=$REGISTRY/main:$CURRENT_TAG" | ||
| kubectl -n stackrox set image deploy/admission-control "*=$REGISTRY/main:$CURRENT_TAG" | ||
| kubectl -n stackrox set image ds/collector "collector=$REGISTRY/collector:$CURRENT_TAG" \ | ||
| "compliance=$REGISTRY/main:$CURRENT_TAG" | ||
|
|
||
| sensor_wait | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see you set the wait timeout to be 5 mins. Do you know why OSD took so long to reinstate? How long does it usually take?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@c-du I literally copied the top half of wait_for_api where it checks to make sure the deployment is running. That includes the copying of the comment. So I don't know.