-
Notifications
You must be signed in to change notification settings - Fork 242
Expand file tree
/
Copy pathutils.py
More file actions
64 lines (51 loc) · 2.02 KB
/
utils.py
File metadata and controls
64 lines (51 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Dataset utilities
"""
import logging
import math
from pathlib import Path
import requests
from urllib.parse import urlparse
from tqdm import tqdm
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
def maybe_download(src_url: str, dst_filepath: str, expected_bytes=None) -> bool:
    """Download a file unless it already exists at the destination.

    The payload is streamed into a sibling ``<name>.part`` file and only moved
    to ``dst_filepath`` once the transfer (and the optional size check) has
    succeeded. A failed or interrupted download therefore never leaves a
    truncated file at ``dst_filepath`` that a later call would mistake for a
    finished one.

    Refs:
        https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/download_utils.py

    Args:
        src_url: Source file URL.
        dst_filepath: Destination file path. Missing parent directories are
            created.
        expected_bytes (optional): Expected size of the downloaded file in
            bytes. When given, the size on disk is compared against it.

    Returns:
        bool: True if the file was downloaded, False if it already existed.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        IOError: If ``expected_bytes`` is given and does not match the size of
            the downloaded file.
        Exception: If the response status is neither 200 nor an HTTP error.
    """
    dst_filepath = Path(dst_filepath)
    if dst_filepath.is_file():
        log.info(f"File {str(dst_filepath)} already exists")
        return False

    # Check dir if exists. If not, create one
    dst_filepath.parent.mkdir(parents=True, exist_ok=True)

    # Partial data goes here; the real destination is only created on success.
    tmp_filepath = dst_filepath.with_name(dst_filepath.name + ".part")

    # Using the response as a context manager releases the streamed connection
    # on every exit path, including the error branches below.
    with requests.get(src_url, stream=True) as response:
        if response.status_code != 200:
            response.raise_for_status()
            # If not HTTPError yet still cannot download
            raise Exception(f"Problem downloading {src_url}")

        log.info(f"Downloading {src_url}")
        total_size = int(response.headers.get("content-length", 0))
        block_size = 1024
        finished = False
        try:
            # Count bytes (not blocks) so tqdm's automatic scaling prints
            # sensible units; an unknown size is passed as None.
            with open(str(tmp_filepath), "wb") as file, tqdm(
                total=total_size or None,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress:
                for data in response.iter_content(block_size):
                    file.write(data)
                    progress.update(len(data))

            # Verify the file size
            if expected_bytes is not None and expected_bytes != tmp_filepath.stat().st_size:
                raise IOError(
                    f"Failed to verify {str(dst_filepath)}. Maybe interrupted while downloading?"
                )

            tmp_filepath.replace(dst_filepath)
            finished = True
        finally:
            # Remove the partial file on any failure (network error, size
            # mismatch, interruption) so nothing stale is left behind.
            if not finished and tmp_filepath.exists():
                tmp_filepath.unlink()

    return True