Skip to content

Commit 61695f9

Browse files
committed
tasks: new task to sync tennis stats
Change-Id: Ia7ee82eda72395f39c0192e24e6a3c70453fc84b
1 parent 9c7db79 commit 61695f9

File tree

3 files changed

+160
-0
lines changed

3 files changed

+160
-0
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import json
2+
import logging
3+
import os
4+
import re
5+
from collections import defaultdict
6+
from dataclasses import dataclass
7+
from tempfile import NamedTemporaryFile
8+
from typing import Any, Dict, List, Optional, Tuple
9+
10+
import pypdf
11+
import requests
12+
13+
from majavahbot.tasks import Task, task_registry
14+
15+
LOGGER = logging.getLogger(__name__)

# Location of the PDF report that the singles rankings are read from.
SINGLES_PDF_URL = (
    "https://www.protennislive.com/posting/ramr/singles_entry_numerical.pdf"
)

# Matches one ranking row in the text extracted from the PDF. Groups:
#   rank    - leading digits
#   tied    - optional "T" marker (possibly preceded by spaces) after the rank
#   name    - everything up to the country code or the points; cannot contain
#             digits or an opening parenthesis
#   country - optional " (XXX)" three-capital-letter code, captured together
#             with its leading space and the parentheses
#   points  - digits following the name/country
ROW_RE = re.compile(
    r"^(?P<rank>\d+)(?P<tied> *T?) +(?P<name>[^(0-9]+)(?P<country> \([A-Z]{3}\))? (?P<points>\d+)"
)
24+
25+
26+
@dataclass(frozen=True)
class Ranking:
    """A single player's row as parsed from the rankings report."""

    # Player name exactly as it appears in the report row.
    name: str
    # Country code taken from the row, or "" when the row carries none.
    country: str
    # Ranking points of the player.
    points: int
    # Numerical position in the ranking.
    rank: int
    # True when the rank in the report is flagged with a "T" marker.
    tied: bool
33+
34+
35+
class SyncTennisStatsTask(Task):
36+
def __init__(self, number, name, site, family):
37+
super().__init__(number, name, site, family)
38+
self.register_task_configuration("User:MajavahBot/ATP rankings updater")
39+
self.merge_task_configuration(
40+
enable=True,
41+
summary="Bot: Updating rankings data",
42+
singles_result="Module:ATP rankings/data/singles.json",
43+
)
44+
45+
def remap_country(self, name: str, country: Optional[str]) -> str:
46+
# TODO: do something for Russia etc where the official stats don't have a country
47+
48+
if not country:
49+
return ""
50+
51+
return country.lstrip(" (").rstrip(")")
52+
53+
def download_and_parse(self, url: str) -> Tuple[List[Ranking], Optional[str]]:
54+
f = None
55+
try:
56+
with NamedTemporaryFile(delete=False) as f:
57+
LOGGER.info("Downloading stats PDF from %s", url)
58+
response = requests.get(url)
59+
response.raise_for_status()
60+
f.write(response.content)
61+
62+
LOGGER.info("Parsing the downloaded PDF")
63+
parser = pypdf.PdfReader(f.name)
64+
finally:
65+
if f:
66+
os.unlink(f.name)
67+
68+
LOGGER.info("Extracting ranking data")
69+
update_date = ""
70+
players: List[Ranking] = []
71+
72+
for page in parser.pages:
73+
text = page.extract_text()
74+
75+
if update_date == "" and "Report as of" in text:
76+
update_date = text[
77+
text.find("Report as of") + len("Report as of") :
78+
].strip()
79+
80+
for row in text.split("\n"):
81+
match = ROW_RE.match(row)
82+
if not match:
83+
continue
84+
players.append(
85+
Ranking(
86+
name=match.group("name"),
87+
country=self.remap_country(
88+
match.group("name"), match.group("country")
89+
),
90+
rank=int(match.group("rank")),
91+
tied=match.group("tied").strip() != "",
92+
points=int(match.group("points")),
93+
)
94+
)
95+
96+
return players, update_date
97+
98+
def process_pdf(self, url: str, target_page: str):
99+
players, update_date = self.download_and_parse(url)
100+
LOGGER.info("Formatting rankings for the required on-wiki format")
101+
102+
per_country: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
103+
104+
for player in sorted(players, key=lambda p: p.rank):
105+
if player.country == "":
106+
continue
107+
if len(per_country[player.country]) >= 15:
108+
# Template shows top 10, but sync top 15 to provide enough change information
109+
continue
110+
111+
per_country[player.country].append(
112+
{
113+
"name": player.name,
114+
"rank": player.rank,
115+
"tied": player.tied,
116+
"points": player.points,
117+
}
118+
)
119+
120+
new_data = {
121+
"per-country": per_country,
122+
"as-of": update_date,
123+
}
124+
125+
previous_data = new_data
126+
page = self.get_mediawiki_api().get_page(target_page)
127+
if page.exists():
128+
current_version = json.loads(page.get())
129+
if current_version["current"]["as-of"] != new_data["as-of"]:
130+
previous_data = current_version["current"]
131+
else:
132+
previous_data = current_version["previous"]
133+
134+
page.text = json.dumps(
135+
{
136+
"current": new_data,
137+
"previous": previous_data,
138+
},
139+
sort_keys=True,
140+
)
141+
142+
LOGGER.info("Updating on-wiki JSON page")
143+
144+
page.save(self.get_task_configuration("summary"))
145+
146+
def run(self):
147+
if self.get_task_configuration("enable") is not True:
148+
LOGGER.error("Disabled in configuration")
149+
return
150+
151+
self.process_pdf(SINGLES_PDF_URL, self.get_task_configuration("singles_result"))
152+
153+
154+
# Register the task; the positional arguments are passed to
# SyncTennisStatsTask.__init__ as (number, name, site, family).
task_registry.add_task(
    SyncTennisStatsTask(
        "sync-tennis-stats", "Tennis statistics sync", "en", "wikipedia"
    )
)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"dateparser",
1414
"mwparserfromhell",
1515
"pymysql",
16+
"pypdf",
1617
"pywikibot",
1718
"sseclient",
1819
"requests",

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ deps =
2222
types-dateparser
2323
types-PyMySQL
2424
types-python-dateutil
25+
types-requests

0 commit comments

Comments
 (0)