
Commit 7f503b0

Updated scraper to match new data model and added purge
1 parent: 4590b58

3 files changed (+57, -34 lines)

scripts/lib/models.py

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,10 @@ class ScrapeRecord(Base):
     __table__ = Base.metadata.tables['w8y_scrape_records']
 
 
+class VersionRecord(Base):
+    __table__ = Base.metadata.tables['w8y_version_records']
+
+
 class Skin(Base):
     __table__ = Base.metadata.tables['w8y_skins']
 
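Note: VersionRecord, like the other models in this file, is mapped onto an already-reflected table, so its columns are defined in the database schema rather than here. A rough sketch of the underlying table follows; only the auto-incrementing w8y_vr_vr_id primary key is implied by this commit, and anything else would be an assumption:

from sqlalchemy import Table, Column, Integer, MetaData

metadata = MetaData()

# Hypothetical shape of w8y_version_records; the scraper only needs the
# w8y_vr_vr_id primary key, which it reads back after session.commit().
w8y_version_records = Table(
    'w8y_version_records', metadata,
    Column('w8y_vr_vr_id', Integer, primary_key=True, autoincrement=True),
)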

scripts/lib/scraper.py

Lines changed: 34 additions & 33 deletions
@@ -1,4 +1,4 @@
-from models import ScrapeRecord, Skin, Extension
+from models import ScrapeRecord, VersionRecord, Skin, Extension
 from utils import log_message
 import json
 import re
@@ -32,11 +32,12 @@ def get_siteinfo(url, args, session):
         return None
 
 
-def create_version_records(components):
+def create_version_records(session, last_sr_id, components):
     skins = []
     skin_names = []
     extensions = []
     extension_names = []
+
     for comp in components:
         if 'name' in comp and 'type' in comp:
             name = bytes(comp['name'], 'utf-8')
@@ -67,13 +68,36 @@ def create_version_records(components):
                     'url': url
                 })
                 extension_names.append(name)
-    return {
-        'skins': skins,
-        'extensions': extensions
-    }
+
+    version_record = VersionRecord()
+    session.add(version_record)
+    session.commit()
+    vr_id = version_record.w8y_vr_vr_id
+
+    for skin in skins:
+        session.add(
+            Skin(
+                w8y_sk_vr_id=vr_id,
+                w8y_sk_name=skin['name'],
+                w8y_sk_version=skin['version'],
+                w8y_sk_doc_url=skin['url']
+            )
+        )
+
+    for extension in extensions:
+        session.add(
+            Extension(
+                w8y_ex_vr_id=vr_id,
+                w8y_ex_name=extension['name'],
+                w8y_ex_version=extension['version'],
+                w8y_ex_doc_url=extension['url']
+            )
+        )
+
+    return vr_id
 
 
-def scrape_site(url, page_id, args, session):
+def scrape_site(url, page_id, last_sr_id, args, session):
     data = get_siteinfo(url, args, session)
     timestamp = time.time()
 
@@ -113,19 +137,16 @@ def scrape_site(url, page_id, args, session):
     statistics = query['statistics']
 
     if 'extensions' in query:
-        extensions = query['extensions']
-        versions = create_version_records(extensions)
+        vr_id = create_version_records(session, last_sr_id, query['extensions'])
     else:
-        versions = {
-            'skins': [],
-            'extensions': []
-        }
+        vr_id = None
 
     scrape = ScrapeRecord(
         w8y_sr_page_id=page_id,
         w8y_sr_api_url=bytes(url, 'utf-8'),
         w8y_sr_timestamp=timestamp,
         w8y_sr_is_alive=True,
+        w8y_sr_vr_id=vr_id,
         w8y_sr_mw_version=bytes(mw_version, 'utf-8'),
         w8y_sr_db_version=bytes(db_version, 'utf-8'),
         w8y_sr_php_version=bytes(php_version, 'utf-8'),
@@ -138,24 +159,4 @@ def scrape_site(url, page_id, args, session):
     session.add(scrape)
     session.commit()
     sr_id = scrape.w8y_sr_sr_id
-
-    for skin in versions['skins']:
-        session.add(
-            Skin(
-                w8y_sk_sr_id=sr_id,
-                w8y_sk_name=skin['name'],
-                w8y_sk_version=skin['version'],
-                w8y_sk_doc_url=skin['url']
-            )
-        )
-    for extension in versions['extensions']:
-        session.add(
-            Extension(
-                w8y_ex_sr_id=sr_id,
-                w8y_ex_name=extension['name'],
-                w8y_ex_version=extension['version'],
-                w8y_ex_doc_url=extension['url']
-            )
-        )
-
     return sr_id, False
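With this change, Skin and Extension rows hang off a VersionRecord (via w8y_sk_vr_id and w8y_ex_vr_id) rather than a ScrapeRecord, and each ScrapeRecord references its version record through w8y_sr_vr_id. A minimal sketch of reading the skins back for a given scrape under that layout; the skins_for_scrape helper is hypothetical, and it assumes w8y_sr_sr_id is the ScrapeRecord primary key:

from sqlalchemy import select
from sqlalchemy.orm import Session
from models import ScrapeRecord, Skin

def skins_for_scrape(session: Session, sr_id):
    # Look up the scrape record, then the skins attached to its version record.
    scrape = session.get(ScrapeRecord, sr_id)  # assumes w8y_sr_sr_id is the primary key
    if scrape is None or scrape.w8y_sr_vr_id is None:
        return []
    stmt = select(Skin).where(Skin.w8y_sk_vr_id == scrape.w8y_sr_vr_id)
    return list(session.scalars(stmt))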

scripts/scrape.py

Lines changed: 19 additions & 1 deletion
@@ -1,6 +1,7 @@
 from argparse import ArgumentParser
 from sqlalchemy.orm import Session
 from sqlalchemy import select, and_
+import pywikibot
 import sys
 import time
 sys.path.append('./lib')
@@ -33,11 +34,27 @@ def get_wikis(session, new_wikis):
     return session.scalars(stmt)
 
 
+def purge(site, session, args, pageids):
+    params = {
+        'action': 'purge',
+        'pageids': pageids
+    }
+    request = pywikibot.data.api.Request(site=site, parameters=params)
+    result = request.submit()
+    if 'purge' in result:
+        for page in result['purge']:
+            if 'title' in page:
+                message = 'Purged page %s' % page['title']
+                log_message(session, message)
+                if args.verbose > 1:
+                    print(message)
+
 def run():
     args = get_args()
     start_time = time.time()
     good_count = 0
     error_count = 0
+    site = pywikibot.Site()
     with Session(engine) as session:
         try:
             wikis = get_wikis(session, args.new)
@@ -59,14 +76,15 @@ def run():
                 if args.verbose > 1:
                     message = f'Scraping {url}'
                     print(message)
-                (sr_id, error) = scrape_site(url, page_id, args, session)
+                (sr_id, error) = scrape_site(url, page_id, wiki.w8y_wi_last_sr_id, args, session)
                 wiki.w8y_wi_last_sr_id = sr_id
                 session.add(wiki)
                 session.commit()
                 if error:
                     error_count += 1
                 else:
                     good_count += 1
+                purge(site, session, args, page_id)
         except KeyboardInterrupt:
             session.rollback()
         finally:
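The new purge() helper sends a raw action=purge API request through pywikibot, and pywikibot.Site() picks up the target wiki from the local user-config.py. For comparison, roughly the same effect can be had through pywikibot's Page.purge(); purge_by_title below is a hypothetical illustration, not part of this commit:

import pywikibot

def purge_by_title(site, titles):
    # Same effect as the raw purge request, but via Page objects and titles
    # instead of page ids.
    for title in titles:
        pywikibot.Page(site, title).purge()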
