-
Notifications
You must be signed in to change notification settings - Fork 108
Expand file tree
/
Copy pathtest_cluster_health.py
More file actions
99 lines (82 loc) · 3.21 KB
/
test_cluster_health.py
File metadata and controls
99 lines (82 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from http import HTTPStatus
import time
from .helper import get, API_PREFIX
def test_cluster_health_check():
    """
    Poll the cluster health endpoint until it reports the cluster as healthy.

    - Repeatedly query ``{API_PREFIX}/cluster_healthz`` every 2 seconds.
    - If the JSON body has ``all_healthy`` set to ``true``, expect HTTP 200
      and stop.
    - Otherwise expect HTTP 503 while waiting.
    - Give up with ``TimeoutError`` once more than 300 seconds have elapsed.

    Raises:
        AssertionError: if the response is not valid JSON, is not a JSON
            object, or the status code does not match the reported health.
        TimeoutError: if the cluster does not become healthy in time.
    """
    # Endpoint used by the Rust test client (mirrored here).
    endpoint = f"{API_PREFIX}/cluster_healthz"
    interval_s = 2
    timeout_s = 300
    # Monotonic clock so wall-clock adjustments cannot distort the timeout.
    start = time.monotonic()

    while True:
        resp = get(endpoint)
        status_code = resp.status_code
        try:
            body = resp.json()
        except Exception as exc:
            # Chain the decode error so the root cause stays in the traceback.
            raise AssertionError(
                f"Invalid JSON response from cluster health endpoint: status={status_code}, body={resp.text!r}"
            ) from exc

        # Valid JSON that is not an object (list, null, ...) has no `.get`;
        # fail with a descriptive message instead of an AttributeError.
        assert isinstance(body, dict), (
            f"Expected a JSON object from cluster health endpoint; got status={status_code}, body={body!r}"
        )

        # Only the literal JSON `true` counts as healthy; a missing key or any
        # other value is treated as "not healthy yet".
        if body.get("all_healthy") is True:
            # Both components healthy; must be HTTP 200.
            assert status_code == HTTPStatus.OK, (
                f"Expected 200 when both runner and compiler are healthy; got {status_code}, body={body}"
            )
            return

        # While either component unhealthy, the API should signal service unavailable.
        assert status_code == HTTPStatus.SERVICE_UNAVAILABLE, (
            f"Expected 503 while unhealthy; got {status_code}, body={body}"
        )
        if time.monotonic() - start > timeout_s:
            raise TimeoutError(
                f"Timed out waiting for runner and compiler to become healthy (last body={body})"
            )
        time.sleep(interval_s)
def test_health_check():
    """
    Poll the /healthz endpoint until it reports overall healthy or times out.

    Success condition:
        status code 200 and body == {"status": "healthy"}
    Acceptable transient condition (database not ready yet):
        status code 500 and body == {
            "status": "unhealthy: unable to reach database (see logs for further details)"
        }

    The initial probe is followed by at most 30 retries, one second apart
    (31 requests in total).

    Raises:
        AssertionError: if the response is not valid JSON, or is neither the
            success nor the acceptable transient response.
        TimeoutError: if the endpoint is still unhealthy after the last retry.
    """
    endpoint = "/healthz"  # Not versioned in the Rust tests.
    healthy_body = {"status": "healthy"}
    db_unreachable_body = {
        "status": "unhealthy: unable to reach database (see logs for further details)"
    }
    max_retries = 30
    retry_delay_s = 1

    retries = 0
    while True:
        resp = get(endpoint)
        status = resp.status_code
        try:
            body = resp.json()
        except Exception as exc:
            # Chain the decode error so the root cause stays in the traceback.
            raise AssertionError(
                f"Invalid JSON from health endpoint: status={status}, body={resp.text!r}"
            ) from exc

        if status == HTTPStatus.OK and body == healthy_body:
            # Healthy
            return

        # Anything other than the known "database not ready" response is a
        # hard failure rather than something worth waiting on.
        if not (status == HTTPStatus.INTERNAL_SERVER_ERROR and body == db_unreachable_body):
            raise AssertionError(
                f"Unexpected health check response: status={status}, body={body}"
            )

        # Still unhealthy; keep polling within limit.
        if retries >= max_retries:
            raise TimeoutError(
                "Took too long for health check to return healthy "
                f"(last status={status}, body={body})"
            )
        retries += 1
        time.sleep(retry_delay_s)