import json
import logging
import os
import re
from collections import defaultdict
from dataclasses import dataclass
from tempfile import NamedTemporaryFile
from typing import Any, Dict, List, Optional, Tuple

import pypdf
import requests

from majavahbot.tasks import Task, task_registry

LOGGER = logging.getLogger(__name__)

SINGLES_PDF_URL = (
    "https://www.protennislive.com/posting/ramr/singles_entry_numerical.pdf"
)

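# One ranking row in the extracted PDF text: rank, an optional "T" marker for
# tied positions, the player name, an optional "(XXX)" country code, and points.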
ROW_RE = re.compile(
    r"^(?P<rank>\d+)(?P<tied> *T?) +(?P<name>[^(0-9]+)(?P<country> \([A-Z]{3}\))? (?P<points>\d+)"
)


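# A single parsed row of the rankings report.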
@dataclass(frozen=True)
class Ranking:
    name: str
    country: str
    points: int
    rank: int
    tied: bool


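# Downloads the official ATP singles rankings PDF and syncs the parsed data to
# an on-wiki JSON data page.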
class SyncTennisStatsTask(Task):
    def __init__(self, number, name, site, family):
        super().__init__(number, name, site, family)
        self.register_task_configuration("User:MajavahBot/ATP rankings updater")
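        # Default values for the task configuration keys used below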
        self.merge_task_configuration(
            enable=True,
            summary="Bot: Updating rankings data",
            singles_result="Module:ATP rankings/data/singles.json",
        )

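    # Turns the "(XXX)" suffix captured by ROW_RE into a bare country code;
    # players listed without a country map to an empty string.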
    def remap_country(self, name: str, country: Optional[str]) -> str:
        # TODO: do something for Russia etc where the official stats don't have a country

        if not country:
            return ""

        return country.lstrip(" (").rstrip(")")

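    # Fetches the rankings PDF from `url` and returns the parsed rows together
    # with the "Report as of" date string.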
    def download_and_parse(self, url: str) -> Tuple[List[Ranking], str]:
        f = None
        try:
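            # Download the PDF into a named temporary file so pypdf can open it
            # by path; the file is removed again in the finally block.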
            with NamedTemporaryFile(delete=False) as f:
                LOGGER.info("Downloading stats PDF from %s", url)
                response = requests.get(url)
                response.raise_for_status()
                f.write(response.content)
                # Flush so the whole download is on disk before pypdf opens the
                # file by name below
                f.flush()

                LOGGER.info("Parsing the downloaded PDF")
                parser = pypdf.PdfReader(f.name)
        finally:
            if f:
                os.unlink(f.name)

        LOGGER.info("Extracting ranking data")
        update_date = ""
        players: List[Ranking] = []

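        # The report date appears once as "Report as of <date>"; every other
        # line that matches ROW_RE is a ranking row.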
        for page in parser.pages:
            text = page.extract_text()

            if update_date == "" and "Report as of" in text:
                update_date = text[
                    text.find("Report as of") + len("Report as of") :
                ].strip()

            for row in text.split("\n"):
                match = ROW_RE.match(row)
                if not match:
                    continue
                players.append(
                    Ranking(
                        name=match.group("name"),
                        country=self.remap_country(
                            match.group("name"), match.group("country")
                        ),
                        rank=int(match.group("rank")),
                        tied=match.group("tied").strip() != "",
                        points=int(match.group("points")),
                    )
                )

        return players, update_date

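    # Parses the PDF at `url` and writes the formatted data to the wiki page
    # named by `target_page`.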
    def process_pdf(self, url: str, target_page: str):
        players, update_date = self.download_and_parse(url)
        LOGGER.info("Formatting rankings for the required on-wiki format")

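        # Bucket the top players by country code for the per-country data page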
        per_country: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        for player in sorted(players, key=lambda p: p.rank):
            if player.country == "":
                continue
            if len(per_country[player.country]) >= 15:
                # Template shows top 10, but sync top 15 to provide enough change information
                continue

            per_country[player.country].append(
                {
                    "name": player.name,
                    "rank": player.rank,
                    "tied": player.tied,
                    "points": player.points,
                }
            )

        new_data = {
            "per-country": per_country,
            "as-of": update_date,
        }

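        # Keep the previously published data set alongside the new one so that
        # ranking changes can be shown. If the page does not exist yet, both
        # copies are the new data; if the report date has not changed, the
        # stored "previous" data set is carried over instead of being
        # overwritten with the current one.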
        previous_data = new_data
        page = self.get_mediawiki_api().get_page(target_page)
        if page.exists():
            current_version = json.loads(page.get())
            if current_version["current"]["as-of"] != new_data["as-of"]:
                previous_data = current_version["current"]
            else:
                previous_data = current_version["previous"]

        page.text = json.dumps(
            {
                "current": new_data,
                "previous": previous_data,
            },
            sort_keys=True,
        )

        LOGGER.info("Updating on-wiki JSON page")

        page.save(self.get_task_configuration("summary"))

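    # Task entry point; bails out unless the task is enabled in the on-wiki
    # configuration.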
    def run(self):
        if self.get_task_configuration("enable") is not True:
            LOGGER.error("Disabled in configuration")
            return

        self.process_pdf(SINGLES_PDF_URL, self.get_task_configuration("singles_result"))


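# Register the task with the bot's global task registry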
task_registry.add_task(
    SyncTennisStatsTask(
        "sync-tennis-stats", "Tennis statistics sync", "en", "wikipedia"
    )
)