-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Expand file tree
/
Copy pathdownload_imdb.py
More file actions
53 lines (40 loc) · 1.35 KB
/
download_imdb.py
File metadata and controls
53 lines (40 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
"""
Fetch and parse people names from the IMDb.
Usage:
$ python download_imdb.py
"""
import csv
import gzip
import shutil
import tempfile
import urllib.request
def main():
    """Download the IMDb names and save them in original and sorted order.

    Writes every name produced by ``names()`` to "names.txt", then reads
    that file back and writes its lines, sorted, to "sorted_names.txt".
    """
    print("Fetching data from IMDb...")

    # First pass: stream the names straight to disk as they are parsed.
    with open("names.txt", "w", encoding="utf-8") as unsorted_file:
        unsorted_file.writelines(names())

    # Second pass: load the lines back, order them, and save the result.
    with open("names.txt", encoding="utf-8") as unsorted_file:
        with open("sorted_names.txt", "w", encoding="utf-8") as sorted_file:
            lines = unsorted_file.readlines()
            lines.sort()
            sorted_file.writelines(lines)

    print('Created "names.txt" and "sorted_names.txt"')
def names(url="https://datasets.imdbws.com/name.basics.tsv.gz"):
    """Yield the name column of a gzip-compressed TSV file, one per line.

    The archive at ``url`` is downloaded into a temporary file, decompressed
    as UTF-8 text, and parsed as tab-separated values. The first row is
    treated as a header and skipped; for every following row the second
    column is yielded with a newline character appended.

    Args:
        url: Location of the gzip-compressed TSV file. Defaults to the IMDb
            ``name.basics`` dataset. Anything accepted by
            ``urllib.request.urlopen`` works, e.g. a ``file://`` URL.

    Yields:
        str: The value of the second column followed by a newline.

    Raises:
        IndexError: If a data row has fewer than two columns.
    """
    with urllib.request.urlopen(url) as response:
        # Buffer the whole download on disk before decompressing it.
        with tempfile.NamedTemporaryFile(mode="w+b") as archive:
            shutil.copyfileobj(response, archive)
            archive.seek(0)  # Rewind so gzip reads from the beginning
            # newline="" lets the csv module handle line endings itself,
            # as recommended by the csv documentation.
            with gzip.open(
                archive, mode="rt", encoding="utf-8", newline=""
            ) as tsv_file:
                # The data is plain tab-separated text, not quoted CSV, so a
                # double quote inside a name is ordinary data. Without
                # QUOTE_NONE the reader would strip quotes from names that
                # start with one and could merge columns or rows when a
                # quote is left unbalanced.
                tsv = csv.reader(
                    tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE
                )
                # Skip the header. The default keeps an empty file from
                # raising StopIteration inside this generator, which Python
                # would turn into a RuntimeError.
                next(tsv, None)
                for record in tsv:
                    full_name = record[1]
                    yield f"{full_name}\n"
if __name__ == "__main__":
    # Run only when executed as a script, not when imported as a module.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: report the interruption instead of showing a traceback.
        print("Aborted")