This repository was archived by the owner on Mar 23, 2026. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Expand file tree
/
Copy pathhttp.py
More file actions
326 lines (265 loc) · 11.4 KB
/
http.py
File metadata and controls
326 lines (265 loc) · 11.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import logging
import math
import os
import re
from urllib.parse import parse_qs, parse_qsl, urlencode, urlparse, urlunparse
import requests
from requests.models import CaseInsensitiveDict, Response
from localstack import config
from .strings import to_str
# chunk size for file downloads
DOWNLOAD_CHUNK_SIZE = 1024 * 1024
# lowercase prefix of "Accept-*" header names, used by canonicalize_headers below
ACCEPT = "accept"
# module-level logger
LOG = logging.getLogger(__name__)
def uses_chunked_encoding(response):
    """Return True if the given HTTP response uses chunked transfer encoding."""
    transfer_encoding = response.headers.get("Transfer-Encoding", "")
    return transfer_encoding.lower() == "chunked"
def parse_chunked_data(data):
    """Parse the body of an HTTP message transmitted with chunked transfer encoding.

    :param data: raw message body as a string (format: hex length, CRLF,
        payload, CRLF, ... terminated by a zero-length chunk); may be None
    :return: the concatenated chunk payloads as a single string
    """
    data = (data or "").strip()
    chunks = []
    while data:
        # each chunk starts with its payload length as a hexadecimal number;
        # fix: only accept hex digits here - the previous pattern [0-9a-zA-Z]+
        # also matched letters g-z, making int(..., 16) below raise ValueError
        length = re.match(r"^([0-9a-fA-F]+)\r\n.*", data)
        if not length:
            break
        length = int(length.group(1).lower(), 16)
        # drop the length line, then consume exactly `length` payload chars
        data = data.partition("\r\n")[2]
        chunks.append(data[:length])
        data = data[length:].strip()
    return "".join(chunks)
def create_chunked_data(data, chunk_size: int = 80):
    """Encode the given payload using HTTP chunked transfer encoding.

    Each chunk is emitted as ``<hex-length>CRLF<payload>CRLF`` per RFC 7230
    section 4.1, followed by the terminating zero-length chunk ``0CRLFCRLF``.
    (Fix: full-size chunks were previously terminated with a double CRLF,
    which is not valid chunked encoding and was inconsistent with the
    trailing partial chunk.)

    :param data: the payload string to encode
    :param chunk_size: maximum payload size per chunk (default: 80)
    :return: the chunked-encoded message body
    """
    chunks = []
    for start in range(0, len(data), chunk_size):
        chunk = data[start : start + chunk_size]
        chunks.append(f"{len(chunk):x}\r\n{chunk}\r\n")
    # zero-length chunk marks the end of the message
    chunks.append("0\r\n\r\n")
    return "".join(chunks)
def canonicalize_headers(headers: dict | CaseInsensitiveDict) -> dict:
    """Return a copy of `headers` in which all "Accept*" header names are lowercased.

    All other header names and all values are left untouched; falsy inputs
    (None or empty) are returned as-is.
    """
    if not headers:
        return headers

    def _canonical_name(header_name):
        lowered = header_name.lower()
        return lowered if lowered.startswith(ACCEPT) else header_name

    return {_canonical_name(name): value for name, value in headers.items()}
def add_path_parameters_to_url(uri: str, path_params: list):
    """Append the given path segments to the path portion of `uri`.

    A "/" separator is inserted between the existing path and the new
    segments if required; query string and fragment are preserved.
    """
    parsed = urlparse(uri)
    path = parsed.path
    # only add a separator when there are segments to append and the
    # current path does not already end with one (or is empty)
    if path_params and not path.endswith("/"):
        path += "/"
    path += "/".join(path_params)
    return urlunparse(parsed._replace(path=path))
def add_query_params_to_url(uri: str, query_params: dict) -> str:
    """
    Merge query parameters into the uri.

    Existing parameters with the same name are overwritten; path and other
    URL components are preserved.

    :param uri: the base uri, may already contain path arguments and query parameters
    :param query_params: new query parameters to be added
    :return: the resulting URL
    """
    parsed = urlparse(uri)
    # existing query string (if any) as a dict, then overlay the new params
    merged_params = dict(parse_qsl(parsed.query))
    merged_params.update(query_params)
    # re-encode and splice the merged query back into the URL
    return urlunparse(parsed._replace(query=urlencode(merged_params)))
def make_http_request(
    url: str, data: bytes | str = None, headers: dict[str, str] = None, method: str = "GET"
) -> Response:
    """Perform an HTTP request against `url` and return the raw response.

    SSL certificate verification is disabled, and credential lookups in
    ~/.netrc are bypassed via NetrcBypassAuth.
    """
    request_kwargs = {
        "url": url,
        "method": method,
        "headers": headers,
        "data": data,
        # attach a no-op auth object to prevent netrc-based credentials lookup
        "auth": NetrcBypassAuth(),
        "verify": False,
    }
    return requests.request(**request_kwargs)
class NetrcBypassAuth(requests.auth.AuthBase):
    """No-op auth handler: attaching it stops `requests` from falling back to
    reading credentials from the ~/.netrc file."""

    def __call__(self, request):
        # return the request unmodified - presence of an auth object is enough
        return request
class _RequestsSafe:
    """Wrapper around requests library, which can prevent it from verifying
    SSL certificates or reading credentials from ~/.netrc file"""

    # when set to False, https:// calls are made with verify=False
    verify_ssl = True

    def __getattr__(self, name):
        # proxy attribute access to the corresponding requests function
        target = requests.__dict__.get(name.lower())
        if not target:
            return target

        def _invoke(*args, **kwargs):
            # avoid credentials being read from ~/.netrc
            if "auth" not in kwargs:
                kwargs["auth"] = NetrcBypassAuth()
            # requests.request(...) takes the URL as second positional arg,
            # the verb helpers (get/post/...) take it first
            fallback = args[1] if name == "request" else args[0]
            url = kwargs.get("url") or fallback
            if not self.verify_ssl and url.startswith("https://") and "verify" not in kwargs:
                kwargs["verify"] = False
            return target(*args, **kwargs)

        return _invoke
# module-level singleton: drop-in replacement for the `requests` module that
# bypasses ~/.netrc credentials and honors the verify_ssl flag (see _RequestsSafe)
safe_requests = _RequestsSafe()
def parse_request_data(method: str, path: str, data=None, headers=None) -> dict:
    """Extract request data either from query string as well as request body (e.g., for POST).

    Query-string parameters are always parsed; for POST/PUT/PATCH with a
    form content type (or no content type), the url-encoded body is parsed
    as well and overrides query-string values of the same name. Only the
    first value of each parameter is returned.
    """
    headers = headers or {}
    content_type = headers.get("Content-Type", "")

    # collect parameters from the query string of the path
    params = parse_qs(urlparse(path).query)

    # for write methods, also merge in parameters from a url-encoded payload
    # (content-type "application/x-www-form-urlencoded" or "multipart/form-data")
    if method in ("POST", "PUT", "PATCH") and (not content_type or "form-" in content_type):
        try:
            params.update(parse_qs(to_str(data or "")))
        except Exception:
            # probably binary / JSON / non-URL encoded payload - ignore
            pass

    # select first elements from result lists (this is assuming we are not using parameter lists!)
    return {key: values[0] for key, values in params.items()}
def get_proxies() -> dict[str, str]:
    """Build a requests-style proxy map from the configured outbound proxies.

    Only schemes with a configured proxy URL appear in the result.
    """
    proxies = {}
    for scheme, proxy_url in (
        ("http", config.OUTBOUND_HTTP_PROXY),
        ("https", config.OUTBOUND_HTTPS_PROXY),
    ):
        if proxy_url:
            proxies[scheme] = proxy_url
    return proxies
def download(
    url: str,
    path: str,
    verify_ssl: bool = True,
    timeout: float = None,
    request_headers: dict | None = None,
    quiet: bool = False,
) -> None:
    """Downloads file at url to the given path. Raises TimeoutError if the optional timeout (in secs) is reached.
    If `quiet` is passed, do not log any status messages. Error messages are still logged.

    :param url: URL to download from
    :param path: local target file path; missing parent directories are created
    :param verify_ssl: whether to verify SSL certificates (overridden by the
        REQUESTS_CA_BUNDLE environment variable, if set)
    :param timeout: optional timeout in seconds; exceeding it raises TimeoutError
    :param request_headers: optional HTTP headers to send with the request
    :param quiet: if True, suppress progress/debug log messages
    """
    # make sure we're creating a new session here to enable parallel file downloads
    s = requests.Session()
    proxies = get_proxies()
    if proxies:
        s.proxies.update(proxies)
    # Use REQUESTS_CA_BUNDLE path. If it doesn't exist, use the method provided settings.
    # Note that a value that is not False, will result to True and will get the bundle file.
    _verify = os.getenv("REQUESTS_CA_BUNDLE", verify_ssl)
    r = None
    try:
        r = s.get(url, stream=True, verify=_verify, timeout=timeout, headers=request_headers)
        # check status code before attempting to read body
        if not r.ok:
            raise Exception(f"Failed to download {url}, response code {r.status_code}")
        total_size = 0
        if r.headers.get("Content-Length"):
            total_size = int(r.headers.get("Content-Length"))
        total_downloaded = 0
        # fix: guard against an empty dirname (path without directory component),
        # which would previously make os.makedirs("") raise
        parent_dir = os.path.dirname(path)
        if parent_dir and not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        if not quiet:
            LOG.debug("Starting download from %s to %s", url, path)
        with open(path, "wb") as f:
            iter_length = 0
            percentage_limit = next_percentage_record = 10  # print a log line for every 10%
            iter_limit = (
                1000000  # if we can't tell the percentage, print a log line for every 1MB chunk
            )
            for chunk in r.iter_content(DOWNLOAD_CHUNK_SIZE):
                # explicitly check the raw stream, since the size from the chunk can be bigger than the amount of
                # bytes transferred over the wire due to transparent decompression (f.e. GZIP)
                new_total_downloaded = r.raw.tell()
                iter_length += new_total_downloaded - total_downloaded
                total_downloaded = new_total_downloaded
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                elif not quiet:
                    LOG.debug(
                        "Empty chunk %s (total %dK of %dK) from %s",
                        chunk,
                        total_downloaded / 1024,
                        total_size / 1024,
                        url,
                    )
                if total_size > 0 and (
                    (current_percent := total_downloaded / total_size * 100)
                    >= next_percentage_record
                ):
                    # increment the limit for the next log output (ensure that there is max 1 log message per block)
                    # f.e. percentage_limit is 10, current percentage is 71: next log is earliest at 80%
                    next_percentage_record = (
                        math.floor(current_percent / percentage_limit) * percentage_limit
                        + percentage_limit
                    )
                    if not quiet:
                        LOG.debug(
                            "Downloaded %d%% (total %dK of %dK) to %s",
                            current_percent,
                            total_downloaded / 1024,
                            total_size / 1024,
                            path,
                        )
                    iter_length = 0
                elif total_size <= 0 and iter_length >= iter_limit:
                    if not quiet:
                        # print log message every x K if the total size is not known
                        LOG.debug(
                            "Downloaded %dK (total %dK) to %s",
                            iter_length / 1024,
                            total_downloaded / 1024,
                            path,
                        )
                    iter_length = 0
            f.flush()
            os.fsync(f)
        if os.path.getsize(path) == 0:
            LOG.warning("Zero bytes downloaded from %s, retrying", url)
            # fix: forward ALL original arguments on retry - previously timeout,
            # request_headers (e.g. auth tokens), and quiet were silently dropped
            download(
                url,
                path,
                verify_ssl=verify_ssl,
                timeout=timeout,
                request_headers=request_headers,
                quiet=quiet,
            )
            return
        if not quiet:
            LOG.debug(
                "Done downloading %s, response code %s, total %dK",
                url,
                r.status_code,
                total_downloaded / 1024,
            )
    except requests.exceptions.ReadTimeout as e:
        # chain the original exception for better debuggability
        raise TimeoutError(f"Timeout ({timeout}) reached on download: {url} - {e}") from e
    finally:
        if r is not None:
            r.close()
        s.close()
def download_github_artifact(url: str, target_file: str, timeout: int = None):
    """Download file from main URL or fallback URL (to avoid firewall errors if github.com is blocked).
    Optionally allows to define a timeout in seconds."""

    def _attempt(
        download_url: str, request_headers: dict | None = None, print_error: bool = False
    ):
        # returns True on success, None on failure (optionally logging the error)
        try:
            download(download_url, target_file, timeout=timeout, request_headers=request_headers)
            return True
        except Exception as e:
            if print_error:
                LOG.error(
                    "Unable to download Github artifact from %s to %s: %s %s",
                    url,
                    target_file,
                    e,
                    exc_info=LOG.isEnabledFor(logging.DEBUG),
                )

    # if a GitHub API token is set, use it to avoid rate limiting issues
    auth_headers = None
    if gh_token := os.environ.get("GITHUB_API_TOKEN"):
        auth_headers = {"authorization": f"Bearer {gh_token}"}

    if _attempt(url, request_headers=auth_headers):
        return

    # TODO: use regex below to allow different branch names than "master"
    url = url.replace("https://github.com", "https://cdn.jsdelivr.net/gh")
    # The URL structure is https://cdn.jsdelivr.net/gh/user/repo@branch/file.js
    url = url.replace("/raw/master/", "@master/")
    # Do not send the GitHub auth token to the CDN
    _attempt(url, print_error=True)
# TODO move to aws_responses.py?
def replace_response_content(response, pattern, replacement):
    """Apply a regex substitution to the body of the given HTTP response, in place.

    Note: the substituted body is stored back as a string via the private
    `_content` attribute of the response object.
    """
    body = to_str(response.content or "")
    response._content = re.sub(pattern, replacement, body)