Skip to content

Commit 4be0bf2

Browse files
committed
Update libraries.py
1 parent b89dea5 commit 4be0bf2

File tree

1 file changed

+157
-1
lines changed

1 file changed

+157
-1
lines changed

apiary/tasks/website/libraries.py

Lines changed: 157 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,157 @@
1-
#
1+
"""Record librarie data."""
2+
# pylint: disable=C0301,W1201
3+
4+
from apiary.tasks import BaseApiaryTask
5+
import requests
6+
import logging
7+
import HTMLParser
8+
import re
9+
import semantic_version
10+
11+
12+
LOGGER = logging.getLogger()
13+
14+
class RecordLibrariesTask(BaseApiaryTask):
    """Fetch a wiki's library list and write it to "<sitename>/Libraries"."""

    # Wikitext external link at the start of a name: "[http://host Label]".
    _NAME_LINK_RE = re.compile(r'\[(http[^\s]+)\s+([^\]]+)\]')
    # "[[Target|Label]]". Negated classes keep each match inside a single
    # link, so several links in one author string are handled one by one
    # instead of being swallowed by a greedy ".*".
    _PIPED_WIKILINK_RE = re.compile(r'\[\[[^\]|]*\|([^\]]*)\]\]')
    # "[[Target]]"
    _PLAIN_WIKILINK_RE = re.compile(r'\[\[([^\]]*)\]\]')
    # "[https://example.org Label]"
    _EXTERNAL_LINK_RE = re.compile(r'\[\S+\s+([^\]]+)\]')
    # The word "and" used as a separator between author names.
    _AND_SEPARATOR_RE = re.compile(r'\sand\s')

    def run(self, site_id, sitename, api_url):
        """Get libraries from the website and write them to WikiApiary.

        Args:
            site_id: Numeric site identifier (logged with ``%d``).
            sitename: Site name; the target page is "<sitename>/Libraries".
            api_url: URL of the site's api.php; the siteinfo query string is
                appended to it as-is.

        Returns:
            The response of the edit call on success. ``None`` when the site
            answered with a status code other than 200 (no error is recorded
            in that case).

        Raises:
            Exception: if the request or JSON decoding fails, if the response
                lacks the expected ``query.libraries`` data, or if the edit
                response contains an ``error`` key.
        """
        LOGGER.info("Retrieve record_libraries for %d", site_id)

        data_url = api_url + '?action=query&meta=siteinfo&siprop=libraries&format=json'

        LOGGER.debug("Requesting from %s", data_url)
        try:
            # verify=False turns off TLS certificate verification for this
            # request.
            req = requests.get(data_url, timeout=15, verify=False)
            data = req.json()
        except Exception as e:
            LOGGER.error(e)
            raise Exception(e)

        if req.status_code != 200:
            # Same as the original: non-200 answers are silently ignored.
            return None

        # A usable response needs both the 'query' envelope and the
        # 'libraries' list; without the second check a missing list would
        # surface as a bare KeyError and never reach record_error().
        if 'query' in data and 'libraries' in data['query']:
            template_block = self.generate_template(data['query']['libraries'])
            wiki_return = self.bumble_bee.call({
                'action': 'edit',
                'title': "%s/Libraries" % sitename,
                'text': template_block,
                'token': self.bumble_bee_token,
                'bot': 'true'
            })
            LOGGER.debug(wiki_return)

            if 'error' in wiki_return:
                raise Exception(wiki_return)

            return wiki_return

        self.record_error(
            site_id=site_id,
            sitename=sitename,
            log_message='Returned unexpected JSON when requesting librarie data.',
            log_type='warn',
            log_severity='normal',
            log_bot='Bumble Bee',
            log_url=data_url
        )
        raise Exception('Returned unexpected JSON when requesting librarie data.')

    def generate_template(self, ext_obj):
        """Build the wikitext for the libraries subpage.

        Args:
            ext_obj: Iterable of dicts, one per library, as found under
                ``query.libraries`` in the API response. Entries without a
                'name' key are skipped.

        Returns:
            The wikitext: a ``<noinclude>`` header followed by one
            ``{{Librarie in use ...}}`` template call per library, wrapped in
            ``<includeonly>``.
        """
        h = HTMLParser.HTMLParser()

        # Some keys we do not want to store in WikiApiary
        ignore_keys = ('descriptionmsg',)
        # Some keys we turn into more readable names for using inside of WikiApiary
        key_names = {
            'author': 'Libraries author',
            'name': 'Libraries name',
            'version': 'Libraries version',
            'type': 'Libraries type',
            'url': 'Libraries URL'
        }

        # Collect fragments and join once instead of growing a string.
        parts = ["<noinclude>{{Libraries subpage}}</noinclude><includeonly>"]

        for librarie in ext_obj:
            if 'name' not in librarie:
                continue

            parts.append("{{Librarie in use\n")

            for item in librarie:
                if item in ignore_keys:
                    continue

                name = key_names.get(item, item)
                value = librarie[item]

                if item == 'name':
                    # Sometimes the name is written as a wikitext hyperlink.
                    # Keep only the label as the name and hand the URL to the
                    # template as a separate parameter; it is up to the
                    # template to decide what to do with it.
                    link = self._NAME_LINK_RE.match(value)
                    if link:
                        possible_url, value = link.groups()
                        # Newline-terminated like every other parameter.
                        parts.append("|URL Embedded in name=%s\n" % possible_url)

                value = self.filter_illegal_chars(value)
                # Before unescaping 'regular' entities, first deal with spaces
                # because they cause problems when converted to unicode
                # non-breaking spaces.
                value = value.replace('&nbsp;', ' ').replace('&#160;', ' ').replace('&160;', ' ')
                value = h.unescape(value)

                if item == 'version':
                    parts.extend(self._version_lines(value))

                if item == 'author':
                    value = self._clean_author(value, h.unescape)

                if item == 'url' and value.startswith('//'):
                    # Protocol-relative URL: add a generic http: protocol.
                    value = 'http:' + value

                parts.append("|%s=%s\n" % (name, value))

            parts.append("}}\n")

        parts.append("</includeonly>")

        return ''.join(parts)

    def _version_lines(self, value):
        """Return "|Version ...=" template lines for a version string."""
        lines = []
        try:
            # Break down the version information for more detailed analysis.
            details = semantic_version.Version(value, partial=True)
            if details.major is not None:
                lines.append("|Version major=%s\n" % details.major)
            if details.minor is not None:
                lines.append("|Version minor=%s\n" % details.minor)
            if details.patch is not None:
                lines.append("|Version patch=%s\n" % details.patch)
            if details.prerelease is not None:
                lines.append("|Version prerelease=%s\n" % ','.join(details.prerelease))
            if details.build is not None:
                lines.append("|Version build=%s\n" % ','.join(details.build))
        except Exception as e:
            # Best effort: an unparsable version only loses the breakdown;
            # the raw version string is still written by the caller.
            LOGGER.debug("Unable to parse version string %s (%s)", value, e)
        return lines

    def _clean_author(self, value, unescape):
        """Strip wikitext links and filler text from an author string."""
        # Wikilinks with names: "[[Foobar | Foo Bar]]"
        value = self._PIPED_WIKILINK_RE.sub(r'\1', value)
        # Simple wikilinks
        value = self._PLAIN_WIKILINK_RE.sub(r'\1', value)
        # Hyperlinks as wikitext:
        # "[https://www.mediawiki.org/wiki/User:Jeroen_De_Dauw Jeroen De Dauw]"
        value = self._EXTERNAL_LINK_RE.sub(r'\1', value)
        # Misc text
        value = self._AND_SEPARATOR_RE.sub(', ', value)
        value = value.replace('...', '')
        value = value.replace('&nbsp;', ' ')
        # Lastly, there could be HTML encoded stuff in these
        return unescape(value)
157+

0 commit comments

Comments
 (0)