-
Notifications
You must be signed in to change notification settings - Fork 174
Expand file tree
/
Copy pathgke.sh
More file actions
executable file
·417 lines (359 loc) · 14.3 KB
/
gke.sh
File metadata and controls
executable file
·417 lines (359 loc) · 14.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#!/usr/bin/env bash
# A collection of GKE related reusable bash functions for CI
SCRIPTS_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../.. && pwd)"
# shellcheck source=../../scripts/ci/lib.sh
source "$SCRIPTS_ROOT/scripts/ci/lib.sh"
# shellcheck source=../../scripts/ci/gcp.sh
source "$SCRIPTS_ROOT/scripts/ci/gcp.sh"
set -euo pipefail
#######################################
# Entry point for provisioning a GKE cluster in CI.
# Runs, in order: setup_gcp, assign_env_variables, create_cluster.
# Arguments: forwarded unchanged to assign_env_variables (<cluster-id>).
#######################################
provision_gke_cluster() {
  info "Provisioning a GKE cluster"

  # Each step depends on the previous one, so the order matters.
  setup_gcp
  assign_env_variables "$@"
  create_cluster
}
#######################################
# Derive the cluster name for this CI run and pick the GKE channel/version.
# Arguments: $1 - cluster id, embedded in the generated cluster name.
# Globals:   reads BUILD_ID (OpenShift CI) or GITHUB_RUN_ID (GitHub Actions);
#            publishes CLUSTER_NAME through ci_export.
# Outputs:   prints the assigned cluster name to stdout.
#######################################
assign_env_variables() {
  info "Assigning environment variables for later steps"

  if [[ "$#" -ne 1 ]]; then
    die "missing args. usage: assign_env_variables <cluster-id>"
  fi

  local cluster_id="$1"

  ensure_CI

  local build_num
  if is_OPENSHIFT_CI; then
    require_environment "BUILD_ID"
    build_num="${BUILD_ID}"
  elif is_GITHUB_ACTIONS; then
    require_environment "GITHUB_RUN_ID"
    build_num="${GITHUB_RUN_ID}"
  else
    die "Support is missing for this CI environment"
  fi

  local cluster_name="rox-ci-${cluster_id}-${build_num}"
  cluster_name="${cluster_name:0:40}" # (for GKE name limit)
  # The cut can land right after a "-"; a name must not end with a hyphen, so
  # drop any trailing ones left behind by the truncation.
  while [[ "${cluster_name}" == *- ]]; do
    cluster_name="${cluster_name%-}"
  done
  ci_export CLUSTER_NAME "$cluster_name"
  echo "Assigned cluster name is $cluster_name"

  choose_release_channel
  choose_cluster_version
}
#######################################
# Decide which GKE release channel to use and store it in GKE_RELEASE_CHANNEL.
# Outside of a PR the existing value is kept (default "stable"). Inside a PR
# the first matching label wins (rapid, then regular, then stable), followed
# by the gke_release_channel pragma; with none of these the variable is left
# untouched.
#######################################
choose_release_channel() {
  if ! is_in_PR_context; then
    GKE_RELEASE_CHANNEL="${GKE_RELEASE_CHANNEL:-stable}"
    return 0
  fi

  # Labels are checked in priority order; stop at the first hit.
  local channel
  for channel in rapid regular stable; do
    if pr_has_label "ci-gke-use-${channel}-channel"; then
      GKE_RELEASE_CHANNEL="${channel}"
      return 0
    fi
  done

  if pr_has_pragma gke_release_channel; then
    GKE_RELEASE_CHANNEL="$(pr_get_pragma gke_release_channel)"
  fi
}
#######################################
# Resolve GKE_CLUSTER_VERSION.
# A gke_cluster_version pragma on the PR overrides the current value. The
# aliases "latest" and "oldest" are replaced by the first / last entry of
# validMasterVersions as reported by gcloud. If that lookup produced "null",
# a warning is printed and GKE_CLUSTER_VERSION is unset.
#######################################
choose_cluster_version() {
  if is_in_PR_context && pr_has_pragma gke_cluster_version; then
    GKE_CLUSTER_VERSION="$(pr_get_pragma gke_cluster_version)"
  fi

  # Translate the symbolic aliases into a concrete version.
  case "${GKE_CLUSTER_VERSION:-}" in
    latest)
      GKE_CLUSTER_VERSION="$(gcloud container get-server-config --format json | jq -r ".validMasterVersions[0]")"
      ;;
    oldest)
      GKE_CLUSTER_VERSION="$(gcloud container get-server-config --format json | jq -r ".validMasterVersions[-1]")"
      ;;
  esac

  if [[ "${GKE_CLUSTER_VERSION:-}" != "null" ]]; then
    return 0
  fi

  # jq printed "null": the requested entry was not present in the config.
  echo "WARNING: Unable to extract version from gcloud config."
  echo "Valid versions are:"
  gcloud container get-server-config --format json | jq .validMasterVersions
  unset GKE_CLUSTER_VERSION
}
#######################################
# Create the GKE cluster named by CLUSTER_NAME, trying the UP zones of the
# region in shuffled order until one of them succeeds.
# Globals read:    CLUSTER_NAME; BUILD_ID/JOB_NAME (OpenShift CI) or
#                  GITHUB_RUN_ID/GITHUB_JOB (GitHub Actions); optional
#                  NUM_NODES, GCP_IMAGE_TYPE, POD_SECURITY_POLICIES,
#                  GKE_RELEASE_CHANNEL, GKE_CLUSTER_VERSION, MACHINE_TYPE,
#                  DISK_SIZE_GB, KUBECONFIG.
# Globals written: REGION, the defaulted settings above, VERSION_ARGS, PSP_ARG;
#                  ZONE is published through ci_export.
# Files:           /tmp/GKE_CLUSTER_REQUESTED_TIMESTAMP (request time, UTC).
# Returns:         1 when the cluster could not be created in any zone.
#######################################
create_cluster() {
  info "Creating a GKE cluster"

  # Store requested timestamp to create log query link with time range.
  date -u +"%Y-%m-%dT%H:%M:%SZ" > /tmp/GKE_CLUSTER_REQUESTED_TIMESTAMP

  ensure_CI

  require_environment "CLUSTER_NAME"

  local tags="stackrox-ci"
  local labels="stackrox-ci=true"
  # Scratch values; declared local so they do not leak into the sourcing shell.
  local build_num job_name
  if is_OPENSHIFT_CI; then
    require_environment "JOB_NAME"
    require_environment "BUILD_ID"
    build_num="${BUILD_ID}"
    job_name="${JOB_NAME}"
  elif is_GITHUB_ACTIONS; then
    require_environment "GITHUB_JOB"
    require_environment "GITHUB_RUN_ID"
    build_num="${GITHUB_RUN_ID}"
    job_name="${GITHUB_JOB}"
  else
    die "Support is missing for this CI environment"
  fi

  # Refresher on bash shell parameter expansion:
  # https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
  # ${VAR//./-} : Replaces all "." with a "-"
  # ${VAR/%-/} : Deletes the last "-"
  # ${VAR,,} : Converts all alphabetic to their lowercase form
  tags="${tags},stackrox-ci-${job_name:0:50}"
  tags="${tags//./-}"
  tags="${tags/%-/}"
  labels="${labels},stackrox-ci-job=${job_name:0:63}"
  labels="${labels//./-}"
  labels="${labels/%-/}"
  labels="${labels},stackrox-ci-build-id=${build_num:0:63}"
  labels="${labels//./-}"
  labels="${labels/%-/}"
  if is_in_PR_context; then
    labels="${labels},pr=$(get_PR_number)"
  fi

  # lowercase
  tags="${tags,,}"
  labels="${labels,,}"

  ### Network Sizing ###
  # The overall subnetwork ("--create-subnetwork") is used for nodes.
  # The "cluster" secondary range is for pods ("--cluster-ipv4-cidr").
  # The "services" secondary range is for ClusterIP services ("--services-ipv4-cidr").
  # See https://cloud.google.com/kubernetes-engine/docs/how-to/alias-ips#cluster_sizing.

  REGION=us-east4
  NUM_NODES="${NUM_NODES:-3}"
  GCP_IMAGE_TYPE="${GCP_IMAGE_TYPE:-UBUNTU_CONTAINERD}"
  POD_SECURITY_POLICIES="${POD_SECURITY_POLICIES:-false}"
  GKE_RELEASE_CHANNEL="${GKE_RELEASE_CHANNEL:-stable}"
  MACHINE_TYPE="${MACHINE_TYPE:-e2-standard-4}"
  DISK_SIZE_GB=${DISK_SIZE_GB:-80}

  echo "Creating ${NUM_NODES} node cluster with image type \"${GCP_IMAGE_TYPE}\" and ${DISK_SIZE_GB}GB disks."

  # An explicit version takes precedence over the release channel.
  if [[ -n "${GKE_CLUSTER_VERSION:-}" ]]; then
    ensure_supported_cluster_version
    echo "Using GKE cluster version: ${GKE_CLUSTER_VERSION} (which overrides release channel ${GKE_RELEASE_CHANNEL})"
    VERSION_ARGS=(--cluster-version "${GKE_CLUSTER_VERSION}" --no-enable-autoupgrade)
  else
    echo "Using GKE release channel: $GKE_RELEASE_CHANNEL"
    VERSION_ARGS=(--release-channel "${GKE_RELEASE_CHANNEL}")
  fi

  PSP_ARG=
  if [[ "${POD_SECURITY_POLICIES}" == "true" ]]; then
    PSP_ARG="--enable-pod-security-policy"
  fi

  local zones zone status i
  local success=0
  # Zones of ${REGION} whose status is UP, in random order.
  zones=$(gcloud compute zones list --format="value(name,region.basename(),status)" | awk "/${REGION}\tUP\$/{print \$1}" | shuf)
  for zone in $zones; do
    echo "Trying zone $zone"
    ci_export ZONE "$zone"
    gcloud config set compute/zone "${zone}"
    status=0
    # ${PSP_ARG:+...} contributes one argument when the flag is set and no
    # argument at all (not an empty string) when it is empty.
    # shellcheck disable=SC2153
    timeout 830 gcloud beta container clusters create \
      --machine-type "${MACHINE_TYPE}" \
      --num-nodes "${NUM_NODES}" \
      --disk-type=pd-ssd \
      --disk-size="${DISK_SIZE_GB}GB" \
      --create-subnetwork range=/28 \
      --cluster-ipv4-cidr=/20 \
      --services-ipv4-cidr=/24 \
      --enable-ip-alias \
      --enable-network-policy \
      --no-enable-autorepair \
      "${VERSION_ARGS[@]}" \
      --image-type "${GCP_IMAGE_TYPE}" \
      --tags="${tags}" \
      --labels="${labels}" \
      ${PSP_ARG:+"${PSP_ARG}"} \
      "${CLUSTER_NAME}" || status="$?"
    if [[ "${status}" == 0 ]]; then
      success=1
      break
    elif [[ "${status}" == 124 ]]; then
      # 124 is the exit status timeout(1) uses when the time limit was hit.
      info "gcloud command timed out. Checking to see if cluster is still creating"
      if ! gcloud container clusters describe "${CLUSTER_NAME}" >/dev/null; then
        info "Create cluster did not create the cluster in Google. Trying a different zone..."
      else
        # Poll for up to 60 x 20 seconds for the cluster to reach RUNNING.
        for i in {1..60}; do
          if [[ "$(gcloud container clusters describe "${CLUSTER_NAME}" --format json | jq -r .status)" == "RUNNING" ]]; then
            success=1
            break
          fi
          sleep 20
          info "Waiting for cluster ${CLUSTER_NAME} in ${zone} to move to running state (wait $i of 60)"
        done
      fi

      if [[ "${success}" == 1 ]]; then
        info "Successfully launched cluster ${CLUSTER_NAME}"
        local kubeconfig="${KUBECONFIG:-${HOME}/.kube/config}"
        ls -l "${kubeconfig}" || true
        gcloud container clusters get-credentials "$CLUSTER_NAME"
        ls -l "${kubeconfig}" || true
        break
      fi
      info "Timed out"
      info "Attempting to delete the cluster before trying another zone"
      # Best effort: a failed delete must not stop the remaining zones from
      # being tried.
      gcloud container clusters delete "${CLUSTER_NAME}" || {
        info "An error occurred deleting the cluster: $?"
        true
      }
    fi
  done

  if [[ "${success}" == "0" ]]; then
    info "Cluster creation failed"
    return 1
  fi

  add_a_maintenance_exclusion
}
#######################################
# Add a five hour "no_upgrades" maintenance exclusion, starting now, to the
# cluster named by CLUSTER_NAME.
# Globals: reads CLUSTER_NAME.
#######################################
add_a_maintenance_exclusion() {
  local now_epoch from_now plus_five
  # Take a single clock reading so that start and end are exactly 5h apart.
  now_epoch="$(date -u '+%s')"
  from_now="$(date -u --date="@${now_epoch}" +"%Y-%m-%dT%H:%M:%SZ")"
  plus_five="$(date -u --date="@$((now_epoch + 5 * 3600))" +"%Y-%m-%dT%H:%M:%SZ")"

  gcloud container clusters update "${CLUSTER_NAME}" \
    --add-maintenance-exclusion-name leave-these-clusters-alone \
    --add-maintenance-exclusion-start "${from_now}" \
    --add-maintenance-exclusion-end "${plus_five}" \
    --add-maintenance-exclusion-scope no_upgrades
}
#######################################
# Block until the kube-system namespace looks stable: at least two pods
# exist, no container reports "not ready", and the most recent container
# start is at least the grace period (30s) in the past.
# Outputs: progress messages and `kubectl get pod` listings on stdout.
#######################################
wait_for_cluster() {
  info "Waiting for a GKE cluster to stabilize"

  while [[ $(kubectl -n kube-system get pod | tail -n +2 | wc -l) -lt 2 ]]; do
    echo "Still waiting for kubernetes to create initial kube-system pods"
    sleep 1
  done

  local grace_period=30
  local numstarting last_start_ts curr_ts remaining_grace_period

  while true; do
    kubectl -n kube-system get pod
    # Number of containers whose "ready" flag is false.
    numstarting=$(kubectl -n kube-system get pod -o json | jq '[(.items[].status.containerStatuses // [])[].ready | select(. | not)] | length')
    if ((numstarting == 0)); then
      # Latest container start time as epoch seconds.
      last_start_ts="$(kubectl -n kube-system get pod -o json | jq '[(.items[].status.containerStatuses // [])[] | (.state.running.startedAt // (now | todate)) | fromdate] | max')"

      # With no container statuses at all jq prints "null". That is not a
      # stable cluster, and "null" must not reach the arithmetic below (it
      # would abort under `set -u`, or count as 0 and end the wait early).
      if [[ ! "${last_start_ts}" =~ ^[0-9]+$ ]]; then
        echo "No kube-system container statuses are reported yet"
        sleep 10
        continue
      fi

      curr_ts="$(date '+%s')"
      remaining_grace_period=$((last_start_ts + grace_period - curr_ts))
      if ((remaining_grace_period <= 0)); then
        break
      fi
      echo "Waiting for another $remaining_grace_period seconds for kube-system pods to stabilize"
      sleep "$remaining_grace_period"
      # Re-check right away instead of reporting "0 containers" still
      # initializing and sleeping a further 10 seconds.
      continue
    fi

    echo "Waiting for ${numstarting} kube-system containers to be initialized"
    sleep 10
  done
}
#######################################
# Expand GKE_CLUSTER_VERSION to the first entry of validMasterVersions that
# starts with it (the value is used as a regular expression anchored at the
# start of the version string).
# Globals: reads and overwrites GKE_CLUSTER_VERSION.
# Exits:   with status 1, after listing the valid versions, when nothing
#          matches.
#######################################
ensure_supported_cluster_version() {
  local match
  # The requested version is handed to jq as data via --arg rather than being
  # spliced into the program text, so quotes or backslashes in it cannot break
  # or alter the filter. -r prints the matched string without JSON quotes.
  match="$(gcloud container get-server-config --format json \
    | jq -r --arg version "${GKE_CLUSTER_VERSION}" \
      '[.validMasterVersions | .[] | select(. | test("^" + $version))][0]')"
  if [[ -z "${match}" || "${match}" == "null" ]]; then
    echo "ERROR: A supported version cannot be found that matches ${GKE_CLUSTER_VERSION}."
    echo "Valid master versions are:"
    gcloud container get-server-config --format json | jq .validMasterVersions
    exit 1
  fi

  GKE_CLUSTER_VERSION="${match}"
}
# Keep the kubeconfig credentials for the GKE cluster fresh: every 15 minutes
# force a gcloud auth refresh, fetch new cluster credentials and swap them in.
# The loop has no exit condition of its own.
# NOTE(review): presumably meant to be started as a background job by the
# caller - confirm against the call sites.
# Requires: ZONE and CLUSTER_NAME (checked via require_environment).
# Reads:    KUBECONFIG (falls back to ${HOME}/.kube/config).
refresh_gke_token() {
info "Starting a GKE token refresh loop"
require_environment "ZONE"
require_environment "CLUSTER_NAME"
# The kubeconfig file that is replaced at the end of every refresh round.
local real_kubeconfig="${KUBECONFIG:-${HOME}/.kube/config}"
# refresh token every 15m
local pid
while true; do
# The sleep runs in the background and is waited on with `wait`, so the
# SIGINT/SIGTERM trap installed below can run while the loop is idle.
sleep 900 &
pid="$!"
# Trap handler: report and kill the currently pending background sleep.
kill_sleep() {
# shellcheck disable=SC2317
echo "refresh_gke_token() terminated, killing the background sleep ($pid)"
# shellcheck disable=SC2317
kill "$pid"
}
trap kill_sleep SIGINT SIGTERM
wait "$pid"
info "Refreshing the GKE auth token"
gcloud config config-helper --force-auth-refresh >/dev/null
# Build the new kubeconfig in a scratch file that is created with mode 0600
# before any credentials are written to it.
echo >/tmp/kubeconfig-new
chmod 0600 /tmp/kubeconfig-new
# shellcheck disable=SC2153
KUBECONFIG=/tmp/kubeconfig-new gcloud container clusters get-credentials --project acs-san-stackroxci --zone "$ZONE" "$CLUSTER_NAME"
# Exercise the new kubeconfig with a kubectl call before it replaces the
# real one.
# NOTE(review): this relies on `set -e` (set at the top of this file) to stop
# before the mv when the check fails - confirm.
KUBECONFIG=/tmp/kubeconfig-new kubectl get ns >/dev/null
mv /tmp/kubeconfig-new "$real_kubeconfig"
done
}
#######################################
# Tear down the GKE cluster named by CLUSTER_NAME.
# Arguments: $1 - "true" when the job was canceled (default "false").
# Globals:   reads CLUSTER_NAME, SCRIPTS_ROOT and BYODB_TEST (default
#            "false").
# Steps:     unless canceled or BYODB_TEST is set to something other than
#            "false", run cleanup-deployment.sh (best effort); wait up to
#            10 x 60s for the cluster to leave PROVISIONING/RECONCILING;
#            request an asynchronous delete; generate the log explorer links.
#######################################
teardown_gke_cluster() {
  local canceled="${1:-false}"
  local byodb="${BYODB_TEST:-false}"

  info "Tearing down the GKE cluster: ${CLUSTER_NAME:-}, canceled: ${canceled}"

  require_environment "CLUSTER_NAME"
  require_executable "gcloud"

  if [[ "${canceled}" == "false" ]] &&
    [[ "${byodb}" == "false" ]]; then
    # (prefix output to avoid triggering prow log focus)
    "$SCRIPTS_ROOT/scripts/ci/cleanup-deployment.sh" 2>&1 | sed -e 's/^/out: /' || true
  fi

  # Loop counter is local so it does not leak into the sourcing shell.
  local i
  for i in {1..10}; do
    # Printed for the job log; the status is then read again in a form that
    # is easy to match on.
    gcloud container clusters describe "${CLUSTER_NAME}" --format "flattened(status)"
    if [[ ! "$(gcloud container clusters describe "${CLUSTER_NAME}" --format 'get(status)')" =~ PROVISIONING|RECONCILING ]]; then
      break
    fi
    info "Before deleting, waiting for cluster ${CLUSTER_NAME} to leave provisioning state (wait $i of 10)"
    sleep 60
  done

  gcloud container clusters delete "$CLUSTER_NAME" --async

  info "Cluster deleting asynchronously"

  create_log_explorer_links
}
#######################################
# Write $ARTIFACT_DIR/gke-logs.html: a small page with Google Cloud Logs
# Explorer links (one per authuser 0..2) for the stackrox namespace of the
# cluster, limited to the time between the recorded cluster request and now.
# Globals: reads ARTIFACT_DIR (does nothing when it is empty or unset) and
#          CLUSTER_NAME.
# Files:   reads /tmp/GKE_CLUSTER_REQUESTED_TIMESTAMP.
#######################################
create_log_explorer_links() {
  if [[ -z "${ARTIFACT_DIR:-}" ]]; then
    info "No place for artifacts, skipping generation of links to logs explorer"
    return
  fi

  # Working variables are local so they do not leak into the sourcing shell.
  local artifact_file="$ARTIFACT_DIR/gke-logs.html"

  cat > "$artifact_file" <<- HEAD
<html>
<head>
<title>GKE Logs Explorer</title>
<style>
body { color: #e8e8e8; background-color: #424242; font-family: "Roboto", "Helvetica", "Arial", sans-serif }
a { color: #ff8caa }
a:visited { color: #ff8caa }
</style>
</head>
<body>
<p>These links require a 'right-click -> open in new tab'.
The authUser is the number for your @redhat.com account.
You can check this by clicking on the user avatar in the top right corner of Google Cloud Console page
after following the link.</p>
<ul style="padding-bottom: 28px; padding-left: 30px; font-family: Roboto,Helvetica,Arial,sans-serif;">
HEAD

  local start_ts
  start_ts="$(cat /tmp/GKE_CLUSTER_REQUESTED_TIMESTAMP)"
  local end_ts
  end_ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  local project
  project="$(gcloud config get project --quiet)"

  # The link is written across several lines for readability; tr removes the
  # newlines so each URL ends up as one unbroken string in the HTML file.
  local authUser
  for authUser in {0..2}; do
    cat << LINK |
<li>
<a target="_blank" href="https://console.cloud.google.com/logs/query
;query=
resource.type%3D%22k8s_container%22%0A
resource.labels.cluster_name%3D%22${CLUSTER_NAME}%22%0A
resource.labels.namespace_name%3D%22stackrox%22%0A
;timeRange=${start_ts}%2F${end_ts}
;cursorTimestamp=${start_ts}
?authuser=${authUser}
&project=${project}
&orgonly=true
&supportedpurview=organizationId">authUser $authUser</a>
</li>
LINK
      tr -d '\n' >> "$artifact_file"
  done

  cat >> "$artifact_file" <<- FOOT
</ul>
</body>
</html>
FOOT
}
# When this file is executed directly (rather than sourced), treat the first
# argument as the name of a function to call and hand it the remaining
# arguments.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  (( $# >= 1 )) || die "When invoked at the command line a method is required."
  fn="$1"
  shift
  "$fn" "$@"
fi