-
Notifications
You must be signed in to change notification settings - Fork 266
Expand file tree
/
Copy pathdownload.py
More file actions
executable file
·93 lines (75 loc) · 3.13 KB
/
download.py
File metadata and controls
executable file
·93 lines (75 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
"""Download and decompress the Stack Exchange data dump from the Internet Archive."""
import os
import click
import py7zr
import requests # type: ignore
def _open_archive_stream(url: str, max_retries: int = 3, timeout: float = 30.0):
    """Open a streaming GET request for ``url``, retrying on request failures.

    Args:
        url: Address of the archive to download.
        max_retries: Total number of attempts before giving up (at least 1).
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection fails instead of blocking forever.

    Returns:
        The open streaming response. The caller is responsible for closing it.

    Raises:
        click.Abort: If every attempt fails with a ``requests`` exception.
        ValueError: If ``max_retries`` is less than 1.
    """
    for attempt in range(1, max_retries + 1):
        response = None
        try:
            response = requests.get(url, stream=True, timeout=timeout)
            response.raise_for_status()  # Raise exception for bad status codes
            return response
        # RequestException is the base class of ConnectionError, HTTPError and
        # Timeout, so catching it alone covers all of them.
        except requests.exceptions.RequestException as e:
            # A streamed response keeps its connection open until closed, so
            # release it before retrying or aborting.
            if response is not None:
                response.close()
            if attempt == max_retries:
                click.echo(
                    f"Failed to download after {max_retries} attempts: {e}",
                    err=True,
                )
                raise click.Abort() from e
            click.echo(f"Download attempt {attempt} failed, retrying...")
    raise ValueError("max_retries must be at least 1")


@click.command()
@click.argument("subdomain")
@click.option(
    "--data-dir",
    default="python/graphframes/tutorials/data",
    help="Directory to store downloaded files",
)
@click.option(
    "--extract/--no-extract",
    default=True,
    help="Whether to extract the archive after download",
)
def stackexchange(subdomain: str, data_dir: str, extract: bool) -> None:
    """Download Stack Exchange archive for a given SUBDOMAIN.

    Example: python/graphframes/tutorials/download.py stats.meta

    Note: This won't work for stackoverflow.com archives due to size.
    """
    # Developer notes (kept out of the docstring, which click shows as --help):
    #   subdomain: prefix of "<subdomain>.stackexchange.com", e.g. "stats.meta".
    #   data_dir:  directory receiving the .7z file and the extracted folder;
    #              created if missing.
    #   extract:   when True, unpack the archive into
    #              "<data_dir>/<subdomain>.stackexchange.com".
    # Any download or extraction failure is reported on stderr and turned into
    # click.Abort.

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Construct archive URL and filename
    archive_name = f"{subdomain}.stackexchange.com"
    archive_url = f"https://archive.org/download/stackexchange/{archive_name}.7z"
    archive_path = os.path.join(data_dir, f"{archive_name}.7z")
    # Stream into a temporary sibling file so an interrupted download never
    # leaves a truncated file under the final archive name.
    partial_path = f"{archive_path}.part"

    click.echo(f"Downloading archive from {archive_url}")

    try:
        response = _open_archive_stream(archive_url)
        try:
            # Using the response as a context manager closes the connection
            # even if writing fails part-way through.
            with response:
                # A missing content-length header yields 0 (unknown size).
                total_size = int(response.headers.get("content-length", 0))
                with click.progressbar(length=total_size, label="Downloading") as bar:  # type: ignore
                    with open(partial_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                                bar.update(len(chunk))
            # Only a fully written file is moved to its final name.
            os.replace(partial_path, archive_path)
        finally:
            # After a successful replace the partial file is gone; on any
            # failure (including Ctrl-C) remove the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        click.echo(f"Download complete: {archive_path}")

        # Extract if requested
        if extract:
            click.echo("Extracting archive...")
            output_path = os.path.join(data_dir, archive_name)
            with py7zr.SevenZipFile(archive_path, mode="r") as z:
                z.extractall(path=output_path)
            # Report the directory actually written to, not just its basename.
            click.echo(f"Extraction complete: {output_path}")

    except requests.exceptions.RequestException as e:
        # Raised when the connection breaks while streaming the body.
        click.echo(f"Error downloading archive: {e}", err=True)
        raise click.Abort() from e
    except py7zr.Bad7zFile as e:
        click.echo(f"Error extracting archive: {e}", err=True)
        raise click.Abort() from e
# Invoke the click command only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    stackexchange()