-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathdata_scraper.py
More file actions
130 lines (100 loc) · 3.9 KB
/
data_scraper.py
File metadata and controls
130 lines (100 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from collections import defaultdict
from os import makedirs, rename, remove
from os.path import join, exists
from re import match
# 3rd party
import requests
from bs4 import BeautifulSoup
from tinydb import TinyDB
from settings import DB_DIR
# Filename used when dumping a raw schedule response to disk (see mine(write=True)).
SCHEDULE = 'schedule.html'
# Banner term codes keyed by campus abbreviation.
# NOTE(review): presumably 'fh' = Foothill, 'da' = De Anza — confirm against the site.
TERM_CODES = {'fh': '201911', 'da': '201912'}
# Column order of the <td> cells in each CourseRow; zipped against the cell
# text in parse() to build each course record.
HEADERS = ('course', 'CRN', 'desc', 'status', 'days', 'time', 'start', 'end',
           'room', 'campus', 'units', 'instructor', 'seats', 'wait_seats', 'wait_cap')
# Course-section pattern, e.g. matches strings like 'F001A.01Y':
#   [FD]        campus prefix (Foothill/De Anza)
#   0*(\d*\w?)  course number with leading zeros stripped (group 1)
#   \.?\d*      optional section suffix, discarded
#   ([YWZH])?   optional delivery-mode letter (group 2)
COURSE_PATTERN = r'[FD]0*(\d*\w?)\.?\d*([YWZH])?'
def main():
    '''
    Scrape every configured term into its own TinyDB database file.

    For each term: mine the raw schedule HTML, parse it into a temporary
    TinyDB file, then atomically move the temp file into place as
    ``{term}_database.json`` so a half-written database is never exposed.
    '''
    # exist_ok already tolerates a pre-existing directory; no need to check first.
    makedirs(DB_DIR, exist_ok=True)
    for term in TERM_CODES.values():
        temp_path = join(DB_DIR, 'temp.json')
        temp = TinyDB(temp_path)
        content = mine(term)
        parse(content, db=temp)
        # Flush and release the file handle before renaming it.
        temp.close()
        final_path = join(DB_DIR, f'{term}_database.json')
        # os.rename returns None, so the original `if rename(...): remove(...)`
        # branch was dead code (and would have raised FileNotFoundError anyway).
        # Remove a stale target first: rename over an existing file fails on Windows.
        if exists(final_path):
            remove(final_path)
        rename(temp_path, final_path)
        db = TinyDB(final_path)
        print(term, db.tables())
def mine(term, write=False):
    '''
    POST the FHDA open-courses endpoint for a term's class listings.

    :param term: (str) the Banner term code to mine, e.g. '201911'
    :param write: (bool) also dump the raw response body to DB_DIR/SCHEDULE
    :return: (bytes) the raw HTML body of the response
    :raises requests.HTTPError: if the server returns an error status
    :raises requests.Timeout: if the server does not respond in time
    '''
    headers = {
        'Origin': 'https://banssb.fhda.edu',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'FoothillAPI',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html, */*; q=0.01',
        'Referer': 'https://banssb.fhda.edu/PROD/fhda_opencourses.P_Application',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
    }
    data = [('termcode', f'{term}'), ]
    # requests has NO default timeout — without one a dead server hangs
    # the scraper forever. 30s is generous for this endpoint.
    res = requests.post('https://banssb.fhda.edu/PROD/fhda_opencourses.P_GetCourseList',
                        headers=headers, data=data, timeout=30)
    res.raise_for_status()
    if write:
        # res.content is already fully buffered (non-streaming request),
        # so a single write replaces the chunked iter_content loop.
        with open(join(DB_DIR, SCHEDULE), "wb") as file:
            file.write(res.content)
    return res.content
def parse(content, db):
    '''
    Parse takes the content from the request and then populates the database with the data
    :param content: (html) The html containing the courses
    :param db: (TinyDB) the current database
    '''
    soup = BeautifulSoup(content, 'html5lib')
    # One <table class="TblCourses"> per department.
    tables = soup.find_all('table', {'class': 'TblCourses'})
    for t in tables:
        # Department code lives in a custom 'dept' attribute; strip internal spaces.
        dept = t['dept'].replace(' ', '')
        dept_desc = t['dept-desc']
        rows = t.find_all('tr', {'class': 'CourseRow'})
        # s maps course-number key -> CRN -> list of section records.
        s = defaultdict(lambda: defaultdict(list))
        for r in rows:
            # Keep only <td> cells with non-whitespace text.
            cols = r.find_all(lambda tag: tag.name == 'td' and not tag.get_text().isspace())
            if cols:
                # Replace each cell with its text (preferring nested <a> text).
                for i, c in enumerate(cols):
                    a = c.find('a')
                    cols[i] = a.get_text() if a else cols[i].get_text()
                try:
                    # First non-empty cell holds the course name; derive the grouping key.
                    key = get_key(f'{cols[0] if cols[0] else cols[1]}')[0]
                    data = dict(zip(HEADERS, cols))
                    data['CRN']
                    crn = data['CRN']
                    if s[key][crn]:
                        # Skip exact duplicates of the first stored record:
                        # symmetric difference is empty iff the records are identical.
                        comb = set(s[key][crn][0].items()) ^ set(data.items())
                        if not comb:
                            continue
                    # NOTE(review): units is lstripped only AFTER the duplicate check,
                    # so a row differing solely in leading whitespace of 'units' is
                    # treated as distinct — confirm this is intended.
                    data['units'] = data['units'].lstrip()
                    s[key][crn].append(data)
                except KeyError:
                    # Row had too few cells for a 'CRN' entry — skip it.
                    # NOTE(review): get_key can also raise AttributeError (no regex
                    # match) or IndexError (too few tokens), which are NOT caught
                    # here and would abort the whole parse — verify inputs.
                    continue
        j = dict(s)
        db.table(f'{dept}').insert(j)
def get_key(course):
    '''
    Extract the regex capture groups that identify a course from its raw name.
    :param course: (str) the unparsed string containing the course name
    :return: (tuple) the capture groups of COURSE_PATTERN for the section token
    '''
    parts = course.split(' ')
    # Two-token names carry the section in slot 1; longer names in slot 2.
    section = parts[1] if len(parts) < 3 else parts[2]
    return match(COURSE_PATTERN, section).groups()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()