forked from jwasham/practice-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbooks.py
More file actions
51 lines (38 loc) · 1.38 KB
/
books.py
File metadata and controls
51 lines (38 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup
import csv
import pprint
import re
import requests
import time
def get_book_data(element):
    """Extract a book's title and author list from its listing markup.

    Args:
        element: A BeautifulSoup Tag for one book entry. It is expected to
            contain a ``div`` with class ``thumbheader`` wrapping a link
            whose text is the title, and a ``div`` with class ``AuthorName``
            whose text is the byline (e.g. ``"By Jane Doe, John Roe"``).

    Returns:
        A dict with two keys: ``'title'`` (the link text, unmodified) and
        ``'authors'`` (a list of whitespace-trimmed author names, with empty
        entries dropped).

    Raises:
        AttributeError: If one of the expected elements is missing, since
            the lookup then yields ``None``.
    """
    title = element.find('div', 'thumbheader').a.text
    byline = element.find('div', 'AuthorName').text

    # Strip only the leading "By " prefix. Removing every "by " in the text
    # would corrupt names that end in "by" (e.g. "Toby Segaran").
    names = re.sub(r'^\s*by\s+', '', byline, flags=re.IGNORECASE)

    # A trailing comma or an empty byline would otherwise yield '' entries.
    authors = [name.strip() for name in names.split(',') if name.strip()]

    # NOTE: the price (span.price) is not collected.
    return {
        'title': title,
        'authors': authors,
    }
def main():
    """Scrape O'Reilly's data-book listing pages and save the results.

    Fetches each listing page in turn, extracts every book found in a
    ``td`` cell with class ``thumbtext`` via ``get_book_data``, writes one
    CSV row per book (title, comma-joined authors) to ``books.txt``, and
    finally pretty-prints the collected list.

    Raises:
        requests.RequestException: If a page cannot be fetched in time or
            the server answers with an HTTP error status.
    """
    NUM_PAGES = 31
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled server hangs forever
    PAUSE_SECONDS = 30    # delay between consecutive page requests
    books = []
    base_url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page='

    for page_num in range(1, NUM_PAGES + 1):
        print("souping page", page_num, ",", len(books), " found so far")
        response = requests.get(base_url + str(page_num), timeout=REQUEST_TIMEOUT)
        # Fail loudly instead of silently scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')
        books.extend(get_book_data(group) for group in soup('td', 'thumbtext'))
        # Pause between requests only; waiting after the last page is useless.
        if page_num < NUM_PAGES:
            time.sleep(PAUSE_SECONDS)

    # newline='' is required by the csv module to avoid blank rows on
    # Windows; an explicit encoding keeps non-ASCII titles writable
    # regardless of the platform's locale.
    with open('books.txt', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["Title", "Authors"])
        writer.writerows(
            [book['title'], ', '.join(book['authors'])] for book in books
        )

    pprint.pprint(books)
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()