Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pylint = "*"
colorama = "*"
snapshottest = "*"
click = "*"
marshmallow = "*"

[dev-packages]

Expand Down
53 changes: 9 additions & 44 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion db/test_database.json

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion emulate_login.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,10 @@ def login(session):
('_eventId_proceed', ''),
]
except KeyError:
print('[ERROR] Login - username or password not specified. Use the env variables MP_USER and MP_PASS.\n')
print(
'[ERROR] Login - username or password not specified. ' +
'Use the env variables MP_USER and MP_PASS.\n'
)
raise
else:
# Subsequent HTML pages have an autosubmitting <form>
Expand Down
89 changes: 89 additions & 0 deletions owl_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from datetime import datetime
from marshmallow import Schema, fields, validate, validates, ValidationError, EXCLUDE

class ClassDataSchema(Schema):
    """
    Schema for the per-class data of one scraped course listing.

    Input keys that are not declared below are dropped on load
    (``unknown = EXCLUDE``). ``start`` and ``end`` are kept as strings but
    must parse with the ``%m/%d/%Y`` format.
    """
    # TODO: Add descriptions
    # TODO: Update docs

    # --- Class strings ---
    # 5-digit Course Reference Number (ex. 25668)
    CRN = fields.Int(required=True)
    # Raw course string (ex. "MATH F001D.01Z")
    raw_course = fields.Str(required=True)
    # Department (ex. "CIS" or "MATH")
    dept = fields.Str(required=True)
    # Course (ex. "1A" or "31D")
    course = fields.Str(required=True)
    # Class section (ex. "01Z")
    section = fields.Str()
    # Class variant (ex. "Z"); empty string means no variant
    variant = fields.Str(validate=validate.OneOf(['', 'W', 'Z', 'Y', 'H']))

    # --- Course info ---
    # Description
    desc = fields.Str(required=True)
    # Class units.
    # The lower bound is enforced with validate.Range: a bare ``min=0``
    # keyword is not a validator and was silently ignored by marshmallow.
    units = fields.Float(required=True, validate=validate.Range(min=0))

    # --- Class dates ---
    # Start date (string in %m/%d/%Y format, checked by validate_start)
    start = fields.Str(required=True)
    # End date (string in %m/%d/%Y format, checked by validate_end)
    end = fields.Str(required=True)

    # --- Seat info ---
    # Class status; must be one of the lower-case values below
    status = fields.Str(
        required=True,
        validate=validate.OneOf(['open', 'waitlist', 'full']),
    )
    # NOTE(review): the three counters below now really reject negative
    # values, as the original ``min=0`` intended. Confirm the data source
    # never reports negative remaining seats (e.g. over-enrolled sections).
    # Number of open seats
    seats = fields.Int(required=True, validate=validate.Range(min=0))
    # Number of open waitlist seats
    wait_seats = fields.Int(required=True, validate=validate.Range(min=0))
    # Waitlist capacity (total # of waitlist seats)
    wait_cap = fields.Int(required=True, validate=validate.Range(min=0))

    class Meta:
        ordered = True
        unknown = EXCLUDE

    # The ``**kwargs`` on the two hooks below absorbs any extra keyword
    # arguments the installed marshmallow version passes to ``@validates``
    # methods (newer releases pass ``data_key``); older releases pass none.
    @validates('start')
    def validate_start(self, date_str, **kwargs):
        """Validate the format of the ``start`` field."""
        self.validate_date(date_str)

    @validates('end')
    def validate_end(self, date_str, **kwargs):
        """Validate the format of the ``end`` field."""
        self.validate_date(date_str)

    def validate_date(self, date_str):
        """
        Validate the date string format.

        Raises:
            ValidationError: if ``date_str`` does not parse as ``%m/%d/%Y``.
        """
        try:
            datetime.strptime(date_str, '%m/%d/%Y')
        except ValueError as err:
            # Chain the cause so the underlying parse error stays visible.
            raise ValidationError('Date must be in the format %m/%d/%Y.') from err


class ClassTimeSchema(Schema):
    """
    Schema for one meeting-time entry of a class.

    Every field is a required string. Input keys that are not declared
    here are dropped on load (``unknown = EXCLUDE``).
    """
    days = fields.String(required=True)
    time = fields.String(required=True)
    room = fields.String(required=True)
    instructor = fields.String(required=True)
    # Campus code; only the listed codes or an empty string are accepted.
    campus = fields.String(
        validate=validate.OneOf(['FH', 'FC', 'FO', 'DA', 'DO', '']),
        required=True,
    )

    class Meta:
        unknown = EXCLUDE
        ordered = True


class InterimClassDataSchema(ClassDataSchema, ClassTimeSchema):
    """
    Combined schema that inherits from both ClassDataSchema and
    ClassTimeSchema; it declares nothing of its own.
    """


# Module-level schema instances, created once at import time so callers can
# import and reuse them instead of constructing their own.
classDataSchema = ClassDataSchema()
classTimeSchema = ClassTimeSchema()
interimClassDataSchema = InterimClassDataSchema()
4 changes: 2 additions & 2 deletions scrape_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
('sel_insm', 'dummy'), ('sel_camp', 'dummy'), ('sel_levl', 'dummy'),
('sel_sess', 'dummy'), ('sel_instr', 'dummy'), ('sel_ptrm', 'dummy'),
('sel_attr', 'dummy')],
[('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'),
[('sel_crse', ''), ('sel_title', ''), ('sel_schd', '%'),
('sel_from_cred', ''), ('sel_to_cred', ''), ('sel_camp', '%'),
('sel_instr', '%'), ('sel_sess', '%'), ('sel_ptrm', '%'),
('sel_attr', '%'), ('begin_hh', '0'), ('begin_mi', '0'),
Expand Down Expand Up @@ -75,7 +75,7 @@ def main():
if DEBUG:
codes = codes[:5]

print_c(f'Scraping session cookie…\r')
print_c('Scraping session cookie…\r')

session = requests.session()
login(session)
Expand Down
60 changes: 50 additions & 10 deletions scrape_term.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from os import makedirs, rename, remove
from os import makedirs
from os.path import join, exists
from collections import defaultdict

# 3rd party
import requests
from bs4 import BeautifulSoup
from tinydb import TinyDB
from tinydb.storages import MemoryStorage
from marshmallow import ValidationError as MarshValidationError

from owl_models import interimClassDataSchema, classDataSchema, classTimeSchema
from utils import parse_course_str, ValidationError, log_info, log_err
from settings import DB_DIR, SSB_URL, HEADERS

Expand All @@ -18,22 +21,21 @@ def main():
makedirs(DB_DIR, exist_ok=True)

for term in CURRENT_TERM_CODES.values():
temp_path = join(DB_DIR, 'temp.json')
temp = TinyDB(temp_path)
temp = TinyDB(storage=MemoryStorage)

content = mine(term)
parse(content, db=temp)

if rename(temp_path, join(DB_DIR, f'{term}_database.json')):
remove(temp_path)

db = TinyDB(join(DB_DIR, f'{term}_database.json'))
db.storage.write(temp.storage.read())

depts = ', '.join(db.tables())
log_info(f'Scraped term {term}', pad=False, details={
'depts': depts,
})

db.close()


def mine(term, filename=None):
'''
Expand Down Expand Up @@ -87,6 +89,7 @@ def parse(content, db):
try:
parsed_course = parse_course_str(cols[0])
key = parsed_course['course']
section = parsed_course['section']
data = dict(zip(HEADERS, cols))

if parsed_course['dept'] != dept:
Expand All @@ -95,14 +98,24 @@ def parse(content, db):
f"'{parsed_course['dept']}' != '{dept}'"
)

data['dept'] = dept
data['course'] = key
data['section'] = section
data['status'] = data['status'].lower()
data['units'] = data['units'].lstrip()

try:
data = interimClassDataSchema.load(data)
except MarshValidationError as e:
print(e.messages, data)
continue

crn = data['CRN']
if s[key][crn]:
comb = set(s[key][crn][0].items()) ^ set(data.items())
if not comb:
continue

data['units'] = data['units'].lstrip()

s[key][crn].append(data)
except KeyError:
continue
Expand All @@ -115,8 +128,35 @@ def parse(content, db):
print('\n')
continue

j = dict(s)
db.table(f'{dept}').insert(j)
j = defaultdict(defaultdict)

for course, section in s.items():
for cl in section.values():
data = classDataSchema.load(cl[0])
classTime = [classTimeSchema.load(c) for c in cl]

check_integrity(cl, data, classTime)

data['time'] = classTime
j[course][data['CRN']] = data

db.table(f'{dept}').insert(dict(j))


def check_integrity(cl, data, times):
    """
    Report rows of one class whose class-level data disagree.

    Each entry of ``cl`` is loaded through ``classDataSchema`` and compared
    with the entry that follows it. Whenever two neighbouring entries differ,
    the CRN, the number of entries and the differing ``(key, value)`` pairs
    are printed. Nothing is raised or returned for a mismatch.

    Args:
        cl: list of row dicts collected for a single CRN.
        data: the loaded class data for that CRN; only ``data['CRN']`` is
            read, and only when a mismatch is reported.
        times: list of loaded class-time dicts for the same rows. Accepted
            for interface compatibility; it is not inspected at present.

    Returns:
        None
    """
    data_sets = [set(classDataSchema.load(row).items()) for row in cl]

    # Pair every entry with its successor; with zero or one entry the zip is
    # empty, so no explicit length guard is needed.
    for current, following in zip(data_sets, data_sets[1:]):
        mismatch = current ^ following

        if mismatch:
            print(data['CRN'], len(data_sets), mismatch)


if __name__ == "__main__":
Expand Down
Loading