This repository was archived by the owner on Feb 7, 2024. It is now read-only.

Commit ced8d22

More error handling
1 parent cbf89ff commit ced8d22

3 files changed: +32 -10 lines changed

scripts/lib/scraper.py

Lines changed: 13 additions & 4 deletions
@@ -92,10 +92,16 @@ def scrape_site(url, page_id, args, session):
 
     general = query['general']
     mw_version = re.sub('^MediaWiki ', '', general['generator'])
-    db_version = general['dbtype'] + ': ' + general['dbversion']
-    php_version = general['phpversion'] + '(' + general['phpsapi'] + ')'
+    if 'dbtype' in general and 'dbversion' in general:
+        db_version = general['dbtype'] + ': ' + general['dbversion']
+    else:
+        db_version = ""
+    if 'phpversion' in general and 'phpsapi' in general:
+        php_version = general['phpversion'] + '(' + general['phpsapi'] + ')'
+    else:
+        php_version = ""
     language = general['lang']
-    if 'logo' in general:
+    if 'logo' in general and len(general['logo']) < 256:
         logo = general['logo']
     else:
         logo = ''
@@ -110,7 +116,10 @@ def scrape_site(url, page_id, args, session):
         extensions = query['extensions']
         versions = create_version_records(extensions)
     else:
-        versions = None
+        versions = {
+            'skins': [],
+            'extensions': []
+        }
 
     scrape = ScrapeRecord(
         w8y_sr_page_id=page_id,
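Illustration (not part of the commit): the scraper.py change guards against siteinfo responses that omit the database or PHP fields, falling back to empty strings, and the second hunk replaces the None default for versions with an empty skins/extensions dict, presumably so callers can handle it uniformly. A minimal standalone sketch of the same guard; the sample 'general' dict is hypothetical and only mirrors the keys the scraper reads:

# Hypothetical sample data: the PHP fields are deliberately missing
# to exercise the empty-string fallback.
general = {
    'generator': 'MediaWiki 1.39.1',
    'dbtype': 'mysql',
    'dbversion': '10.6.12',
}

# Same conditions as the commit, written as conditional expressions.
db_version = (
    general['dbtype'] + ': ' + general['dbversion']
    if 'dbtype' in general and 'dbversion' in general
    else ""
)
php_version = (
    general['phpversion'] + '(' + general['phpsapi'] + ')'
    if 'phpversion' in general and 'phpsapi' in general
    else ""
)

print(db_version)   # mysql: 10.6.12
print(php_version)  # prints an empty line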

scripts/scrape.py

Lines changed: 18 additions & 5 deletions
@@ -1,6 +1,6 @@
 from argparse import ArgumentParser
 from sqlalchemy.orm import Session
-from sqlalchemy import select
+from sqlalchemy import select, and_
 import sys
 import time
 sys.path.append('./lib')
@@ -10,13 +10,26 @@
 
 
 def get_args():
-    parser = ArgumentParser(prog='Create', description='creates pages in wiki corresponding to URLs in file')
+    parser = ArgumentParser(prog='Create', description='scrapes the data for rows in Wikis and stores the data in the database')
+    parser.add_argument("-n", "--new", action="store_true", help="scrape only new pages (those that have not been scraped before)")
     parser.add_argument("-v", "--verbose", action="count", default=0, help="increase output verbosity")
     return parser.parse_args()
 
 
-def get_wikis(session):
-    stmt = select(Wiki).where(Wiki.w8y_wi_is_defunct == False)
+def get_wikis(session, new_wikis):
+    if new_wikis:
+        stmt = select(Wiki).where(
+            and_(
+                Wiki.w8y_wi_last_sr_id.is_not(None),
+                Wiki.w8y_wi_is_defunct == False
+            )
+        )
+    else:
+        stmt = select(Wiki).where(
+            Wiki.w8y_wi_is_defunct == False
+        ).order_by(
+            Wiki.w8y_wi_last_sr_id
+        )
     return session.scalars(stmt)
 
 
@@ -27,7 +40,7 @@ def run():
     error_count = 0
     with Session(engine) as session:
         try:
-            wikis = get_wikis(session)
+            wikis = get_wikis(session, args.new)
             message = 'Starting scraping wikis.'
             log_message(session, message)
             if args.verbose:
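Illustration (not part of the commit): a self-contained sketch of the two query shapes get_wikis() now builds, written against SQLAlchemy 2.0-style declarative mapping. The MiniWiki model is a hypothetical stand-in for the project's Wiki model and carries only the columns the filters touch.

from typing import Optional

from sqlalchemy import Boolean, Integer, and_, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class MiniWiki(Base):
    # Hypothetical stand-in for the Wiki model; not the project's schema.
    __tablename__ = 'mini_wiki'
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    w8y_wi_last_sr_id: Mapped[Optional[int]] = mapped_column(Integer)
    w8y_wi_is_defunct: Mapped[bool] = mapped_column(Boolean, default=False)


# --new branch: both conditions combined with and_().
new_stmt = select(MiniWiki).where(
    and_(
        MiniWiki.w8y_wi_last_sr_id.is_not(None),
        MiniWiki.w8y_wi_is_defunct == False
    )
)

# Default branch: single filter, ordered by the last scrape record id.
all_stmt = select(MiniWiki).where(
    MiniWiki.w8y_wi_is_defunct == False
).order_by(MiniWiki.w8y_wi_last_sr_id)

# Printing a Select shows the SQL SQLAlchemy would emit.
print(new_stmt)
print(all_stmt)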

scripts/sync_pages_to_db.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 
 
 def get_args():
-    parser = ArgumentParser(prog='Create', description='creates pages in wiki corresponding to URLs in file')
+    parser = ArgumentParser(prog='Create', description='creates Wiki records in the database corresponding to the pages that were created in the wiki')
     parser.add_argument("-v", "--verbose", action="count", default=0, help="increase output verbosity")
     return parser.parse_args()
 
0 commit comments