| 1 | +"""Record librarie data.""" |
| 2 | +# pylint: disable=C0301,W1201 |
| 3 | + |
| 4 | +from apiary.tasks import BaseApiaryTask |
| 5 | +import requests |
| 6 | +import logging |
| 7 | +import HTMLParser |
| 8 | +import re |
| 9 | +import semantic_version |
| 10 | + |
| 11 | + |
| 12 | +LOGGER = logging.getLogger() |
| 13 | + |
| 14 | +class RecordLibrariesTask(BaseApiaryTask): |
    """Record the libraries a wiki reports and publish them to WikiApiary."""

    def run(self, site_id, sitename, api_url):
        """Get libraries from the website and write them to WikiApiary."""

        LOGGER.info("Retrieve record_libraries for %d", site_id)

        data_url = api_url + '?action=query&meta=siteinfo&siprop=libraries&format=json'
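        # The siteinfo libraries list is roughly of this shape (illustrative values,
        # not taken from a real wiki):
        #   {"query": {"libraries": [{"name": "psr/log", "version": "1.0.2"}, ...]}}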

        LOGGER.debug("Requesting from %s" % data_url)
        try:
            req = requests.get(data_url, timeout=15, verify=False)
            data = req.json()
        except Exception as e:
            LOGGER.error(e)
            raise Exception(e)

        if req.status_code == 200:
            # Successfully pulled data
            if 'query' in data:
                # Looks like a valid response
                template_block = self.generate_template(data['query']['libraries'])
                wiki_return = self.bumble_bee.call({
                    'action': 'edit',
                    'title': "%s/Libraries" % sitename,
                    'text': template_block,
                    'token': self.bumble_bee_token,
                    'bot': 'true'
                })
                LOGGER.debug(wiki_return)

                if 'error' in wiki_return:
                    raise Exception(wiki_return)

                return wiki_return
            else:
                self.record_error(
                    site_id=site_id,
                    sitename=sitename,
                    log_message='Returned unexpected JSON when requesting library data.',
                    log_type='warn',
                    log_severity='normal',
                    log_bot='Bumble Bee',
                    log_url=data_url
                )
                raise Exception('Returned unexpected JSON when requesting library data.')

    def generate_template(self, ext_obj):
        """Build the wikitext for the libraries subpage."""

        h = HTMLParser.HTMLParser()

        # Some keys we do not want to store in WikiApiary
        ignore_keys = ['descriptionmsg']
        # Some keys we turn into more readable names for use inside WikiApiary
        key_names = {
            'author': 'Libraries author',
            'name': 'Libraries name',
            'version': 'Libraries version',
            'type': 'Libraries type',
            'url': 'Libraries URL'
        }

        template_block = "<noinclude>{{Libraries subpage}}</noinclude><includeonly>"
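        # The finished block looks roughly like this (illustrative values, not from a real wiki):
        #   <noinclude>{{Libraries subpage}}</noinclude><includeonly>{{Library in use
        #   |Libraries name=psr/log
        #   |Libraries version=1.0.2
        #   }}
        #   </includeonly>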

        for library in ext_obj:
            if 'name' in library:
                template_block += "{{Library in use\n"

                for item in library:
                    if item not in ignore_keys:

                        name = key_names.get(item, item)
                        value = library[item]

                        if item == 'name':
                            # Sometimes people make the name of the library a hyperlink using
                            # wikitext links and this makes things ugly. So, let's detect that if present.
                            link_match = re.match(r'\[(http[^\s]+)\s+([^\]]+)\]', value)
                            if link_match:
                                (possible_url, value) = link_match.groups()
                                # If a URL was given in the name, and not given as a formal part of the
                                # library definition (yes, this happens) then add it to the template;
                                # it is up to the template to decide what to do with it.
                                template_block += "|URL Embedded in name=%s\n" % possible_url
                            value = self.filter_illegal_chars(value)
                            # Before unescaping 'regular' HTML entities, first deal with non-breaking
                            # space entities because they cause problems when converted to unicode
                            # non-breaking spaces
                            value = value.replace('&nbsp;', ' ').replace('&#160;', ' ').replace('&160;', ' ')
                            value = h.unescape(value)

                        if item == 'version':
                            try:
                                # Break down the version information for more detailed analysis
                                version_details = semantic_version.Version(value, partial=True)
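                                # For example (illustrative, using the python-semanticversion package):
                                #   Version('1.25.2-alpha+20150713', partial=True) has major=1, minor=25,
                                #   patch=2, prerelease=('alpha',), build=('20150713',)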
                                if version_details.major is not None:
                                    template_block += "|Version major=%s\n" % version_details.major
                                if version_details.minor is not None:
                                    template_block += "|Version minor=%s\n" % version_details.minor
                                if version_details.patch is not None:
                                    template_block += "|Version patch=%s\n" % version_details.patch
                                if version_details.prerelease is not None:
                                    prerelease_string = ','.join(version_details.prerelease)
                                    template_block += "|Version prerelease=%s\n" % prerelease_string
                                if version_details.build is not None:
                                    build_string = ','.join(version_details.build)
                                    template_block += "|Version build=%s\n" % build_string
                            except Exception as e:
                                LOGGER.debug("Unable to parse version string %s (%s)" % (value, e))

                        if item == 'author':
                            # Authors can have a lot of junk in them, wikitext and such.
                            # We'll try to clean that up.

                            # Wikilinks with names
                            # "[[Foobar | Foo Bar]]"
                            value = re.sub(r'\[\[.*\|(.*)\]\]', r'\1', value)
                            # Simple wikilinks
                            value = re.sub(r'\[\[(.*)\]\]', r'\1', value)
                            # Hyperlinks as wikitext
                            # "[https://www.mediawiki.org/wiki/User:Jeroen_De_Dauw Jeroen De Dauw]"
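                            # which the substitution below reduces to "Jeroen De Dauw"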
                            value = re.sub(r'\[\S+\s+([^\]]+)\]', r'\1', value)
                            # Misc text cleanup
                            value = re.sub(r'\sand\s', r', ', value)
                            value = re.sub(r'\.\.\.', r'', value)
                            value = re.sub(r'&nbsp;', r' ', value)
                            # Lastly, there could be HTML encoded stuff in these
                            value = h.unescape(value)

                        if item == 'url':
                            # Seems some people really really love protocol-agnostic URLs.
                            # We detect them and add a generic http: protocol to them.
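                            # e.g. '//www.mediawiki.org/' becomes 'http://www.mediawiki.org/'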
                            if re.match(r'^\/\/', value):
                                value = 'http:' + value

                        template_block += "|%s=%s\n" % (name, value)

                template_block += "}}\n"

        template_block += "</includeonly>"

        return template_block