Skip to content

Commit 9edb72a

Browse files
RickiJay-WMDErti
andauthored
feat: Export CSV (#110)
* Proof of Concept * Split * Split * Pylint * Quantity * Checked * index_col * Quantity II * Software Version * Recent Changes * Rename * Util * Export Directory * Specify * Test * authenticate * Test Output * Multiple Records Error * Fix Tests * Unused * Cleanup * Doc * Auth II * Mixed * fix: csv export without tmp file (#115) * fix: csv export without tmp file * fix: do not use StreamingResponse * Omit AssertionError Co-authored-by: Robert Timm <rti@users.noreply.github.com> --------- Co-authored-by: Robert Timm <rti@users.noreply.github.com>
1 parent c2080a5 commit 9edb72a

File tree

10 files changed

+454
-10
lines changed

10 files changed

+454
-10
lines changed

app.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
"""Main Application"""
22

33
from contextlib import asynccontextmanager
4+
from typing import Optional
45
from fastapi import FastAPI
56
from fastapi.middleware.cors import CORSMiddleware
7+
from fastapi.responses import PlainTextResponse
68
from strawberry.fastapi import GraphQLRouter
79

10+
from export_csv import export_metric_csv
811
from model.strawberry import schema
12+
from resolvers.authentication import authenticate_token
913
from schedule import scheduler
1014

1115

@@ -41,3 +45,18 @@ def read_root():
4145

4246

4347
app.include_router(GraphQLRouter(schema=schema), prefix="/graphql")
48+
49+
50+
@app.get("/csv/metrics")
51+
async def metric_csv(authorization: Optional[str] = None):
52+
"""Quantity CSV"""
53+
54+
if authorization is None:
55+
return PlainTextResponse("Authorization Missing", 403)
56+
57+
try:
58+
authenticate_token(authorization)
59+
except AssertionError:
60+
return PlainTextResponse("Authorization Failed", 403)
61+
62+
return await export_metric_csv()

docs/metrics_csv.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Metrics CSV
2+
3+
1 row per Wikibase, with the following columns:
4+
5+
- `wikibase_id`
6+
- internal identifier
7+
- automatically assigned upon adding to database
8+
- `wikibase_type`
9+
- manually set
10+
- CSV excludes TEST wikibases
11+
- `base_url`
12+
- manually set
13+
14+
From latest successful Quantity observation:
15+
16+
- `quantity_observation_date`
17+
- date of quantity observation
18+
- automatically set when Scraper attempts to fetch data
19+
- `total_items`
20+
- SPARQL query
21+
- Number of Wikibase items in database
22+
- `total_lexemes`
23+
- SPARQL query
24+
- Number of Wikibase lexemes in database
25+
- `total_properties`
26+
- SPARQL query
27+
- Number of properties in database
28+
- `total_triples`
29+
- SPARQL query
30+
- Number of triples in database
31+
- `total_ei_properties`
32+
- SPARQL query
33+
- Number of external identifier properties in database
34+
- `total_ei_statements`
35+
- SPARQL query
36+
- Number of external identifier statements in database
37+
- `total_url_properties`
38+
- SPARQL query
39+
- Number of url properties in database
40+
- `total_url_statements`
41+
- SPARQL query
42+
- Number of url statements in database
43+
44+
From latest successful Recent Changes observation:
45+
46+
- `recent_changes_observation_date`
47+
- date of recent changes observation
48+
- automatically set when Scraper attempts to fetch data
49+
- `first_change_date`
50+
- oldest record in recent changes list (limited to 30 days before observation date)
51+
- pulled from action api
52+
- `last_change_date`
53+
- newest record in recent changes list (limited as above)
54+
- pulled from action api
55+
- `human_change_count`
56+
- number of records in recent changes list (limited as above), excluding bot contributions
57+
- pulled from action api
58+
- `human_change_user_count`
59+
- number of distinct users with at least one record in recent changes list (limited as above), excluding bot contributions
60+
- pulled from action api
61+
- `bot_change_count`
62+
- number of records in recent changes list (limited as above), limited to bot contributions
63+
- pulled from action api
64+
- `bot_change_user_count`
65+
- number of distinct users with at least one record in recent changes list (limited as above), limited to bot contributions
66+
- pulled from action api
67+
68+
From latest successful Software Version observation:
69+
70+
- `software_version_observation_date`
71+
- date of software version observation
72+
- automatically set when Scraper attempts to fetch data
73+
- `software_name`
74+
- ONLY MediaWiki
75+
- `version`
76+
- MediaWiki version
77+
- Scraped from Special:Version page

export_csv/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""Export CSVs"""
2+
3+
from export_csv.metric import export_metric_csv

export_csv/metric.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
"""Quantity CSV"""
2+
3+
from fastapi.responses import StreamingResponse
4+
from sqlalchemy import Select, and_, func, or_, select
5+
6+
from export_csv.util import export_csv
7+
from model.database import (
8+
WikibaseModel,
9+
WikibaseQuantityObservationModel,
10+
WikibaseRecentChangesObservationModel,
11+
WikibaseSoftwareVersionModel,
12+
WikibaseSoftwareVersionObservationModel,
13+
WikibaseSoftwareModel,
14+
WikibaseURLModel,
15+
)
16+
from model.enum import WikibaseType, WikibaseURLType
17+
18+
19+
async def export_metric_csv() -> StreamingResponse:
20+
"""CSV with Requested Metrics"""
21+
22+
query = get_metrics_query()
23+
24+
return await export_csv(
25+
query=query,
26+
export_filename="metrics",
27+
index_col="wikibase_id",
28+
)
29+
30+
31+
def get_metrics_query() -> Select:
32+
"""
33+
Filter Out Offline and Test Wikis
34+
35+
Pull Quantity, Recent Changes, and Software Version Metrics
36+
"""
37+
38+
filtered_wikibase_subquery = (
39+
select(WikibaseModel)
40+
.where(
41+
and_(
42+
WikibaseModel.checked,
43+
or_(
44+
# pylint: disable-next=singleton-comparison
45+
WikibaseModel.wikibase_type == None,
46+
and_(
47+
# WikibaseModel.wikibase_type != WikibaseType.CLOUD,
48+
WikibaseModel.wikibase_type
49+
!= WikibaseType.TEST,
50+
),
51+
),
52+
)
53+
)
54+
.cte(name="filtered_wikibases")
55+
)
56+
57+
quantity_rank_subquery = (
58+
select(
59+
WikibaseQuantityObservationModel.id,
60+
# pylint: disable-next=not-callable
61+
func.rank()
62+
.over(
63+
partition_by=WikibaseQuantityObservationModel.wikibase_id,
64+
order_by=[
65+
WikibaseQuantityObservationModel.observation_date.desc(),
66+
WikibaseQuantityObservationModel.id,
67+
],
68+
)
69+
.label("rank"),
70+
)
71+
.where((WikibaseQuantityObservationModel.returned_data))
72+
.subquery()
73+
)
74+
most_recent_successful_quantity_obs = (
75+
select(WikibaseQuantityObservationModel)
76+
.join(
77+
quantity_rank_subquery,
78+
onclause=and_(
79+
WikibaseQuantityObservationModel.id == quantity_rank_subquery.c.id,
80+
quantity_rank_subquery.c.rank == 1,
81+
),
82+
)
83+
.cte(name="filtered_quantity_observations")
84+
)
85+
86+
rc_rank_subquery = (
87+
select(
88+
WikibaseRecentChangesObservationModel.id,
89+
# pylint: disable-next=not-callable
90+
func.rank()
91+
.over(
92+
partition_by=WikibaseRecentChangesObservationModel.wikibase_id,
93+
order_by=[
94+
WikibaseRecentChangesObservationModel.observation_date.desc(),
95+
WikibaseRecentChangesObservationModel.id,
96+
],
97+
)
98+
.label("rank"),
99+
)
100+
.where((WikibaseRecentChangesObservationModel.returned_data))
101+
.subquery()
102+
)
103+
most_recent_successful_rc_obs = (
104+
select(WikibaseRecentChangesObservationModel)
105+
.join(
106+
rc_rank_subquery,
107+
onclause=and_(
108+
WikibaseRecentChangesObservationModel.id == rc_rank_subquery.c.id,
109+
rc_rank_subquery.c.rank == 1,
110+
),
111+
)
112+
.cte(name="filtered_recent_changes_observations")
113+
)
114+
115+
sv_rank_subquery = (
116+
select(
117+
WikibaseSoftwareVersionObservationModel.id,
118+
# pylint: disable-next=not-callable
119+
func.rank()
120+
.over(
121+
partition_by=WikibaseSoftwareVersionObservationModel.wikibase_id,
122+
order_by=[
123+
WikibaseSoftwareVersionObservationModel.observation_date.desc(),
124+
WikibaseSoftwareVersionObservationModel.id,
125+
],
126+
)
127+
.label("rank"),
128+
)
129+
.where((WikibaseSoftwareVersionObservationModel.returned_data))
130+
.subquery()
131+
)
132+
most_recent_successful_sv_obs = (
133+
select(
134+
WikibaseSoftwareVersionObservationModel.wikibase_id,
135+
WikibaseSoftwareVersionObservationModel.observation_date,
136+
WikibaseSoftwareVersionModel.version,
137+
WikibaseSoftwareModel.software_name,
138+
)
139+
.select_from(WikibaseSoftwareVersionObservationModel)
140+
.join(WikibaseSoftwareVersionModel)
141+
.join(WikibaseSoftwareModel)
142+
.join(
143+
sv_rank_subquery,
144+
onclause=and_(
145+
WikibaseSoftwareVersionObservationModel.id == sv_rank_subquery.c.id,
146+
sv_rank_subquery.c.rank == 1,
147+
),
148+
)
149+
.where(WikibaseSoftwareModel.software_name == "MediaWiki")
150+
.cte(name="filtered_software_version_observations")
151+
)
152+
153+
query = (
154+
select(
155+
filtered_wikibase_subquery.c.id.label("wikibase_id"),
156+
filtered_wikibase_subquery.c.wb_type.label("wikibase_type"),
157+
WikibaseURLModel.url.label("base_url"),
158+
most_recent_successful_quantity_obs.c.date.label(
159+
"quantity_observation_date"
160+
),
161+
most_recent_successful_quantity_obs.c.total_items,
162+
most_recent_successful_quantity_obs.c.total_lexemes,
163+
most_recent_successful_quantity_obs.c.total_properties,
164+
most_recent_successful_quantity_obs.c.total_triples,
165+
most_recent_successful_quantity_obs.c.total_external_identifier_properties.label(
166+
"total_ei_properties"
167+
),
168+
most_recent_successful_quantity_obs.c.total_external_identifier_statements.label(
169+
"total_ei_statements"
170+
),
171+
most_recent_successful_quantity_obs.c.total_url_properties,
172+
most_recent_successful_quantity_obs.c.total_url_statements,
173+
most_recent_successful_rc_obs.c.date.label(
174+
"recent_changes_observation_date"
175+
),
176+
most_recent_successful_rc_obs.c.first_change_date,
177+
most_recent_successful_rc_obs.c.last_change_date,
178+
most_recent_successful_rc_obs.c.human_change_count,
179+
most_recent_successful_rc_obs.c.human_change_user_count,
180+
most_recent_successful_rc_obs.c.bot_change_count,
181+
most_recent_successful_rc_obs.c.bot_change_user_count,
182+
most_recent_successful_sv_obs.c.observation_date.label(
183+
"software_version_observation_date"
184+
),
185+
most_recent_successful_sv_obs.c.software_name,
186+
most_recent_successful_sv_obs.c.version,
187+
)
188+
.join(
189+
WikibaseURLModel,
190+
onclause=and_(
191+
filtered_wikibase_subquery.c.id == WikibaseURLModel.wikibase_id,
192+
WikibaseURLModel.url_type == WikibaseURLType.BASE_URL,
193+
),
194+
)
195+
.join(
196+
most_recent_successful_quantity_obs,
197+
onclause=filtered_wikibase_subquery.c.id
198+
== most_recent_successful_quantity_obs.c.wikibase_id,
199+
isouter=True,
200+
)
201+
.join(
202+
most_recent_successful_rc_obs,
203+
onclause=filtered_wikibase_subquery.c.id
204+
== most_recent_successful_rc_obs.c.wikibase_id,
205+
isouter=True,
206+
)
207+
.join(
208+
most_recent_successful_sv_obs,
209+
onclause=filtered_wikibase_subquery.c.id
210+
== most_recent_successful_sv_obs.c.wikibase_id,
211+
isouter=True,
212+
)
213+
)
214+
215+
return query

export_csv/util.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Utilities"""
2+
3+
from typing import Optional
4+
from fastapi.responses import Response
5+
import pandas
6+
from sqlalchemy import Connection, Select
7+
8+
from data.database_connection import async_engine
9+
10+
11+
def _read_sql_query(con: Connection, stmt: Select, index_col: Optional[str] = None):
12+
return pandas.read_sql_query(stmt, con, index_col=index_col)
13+
14+
15+
async def read_sql_query(
16+
stmt: Select, index_col: Optional[str] = None
17+
) -> pandas.DataFrame:
18+
"""Read SQL to DataFrame"""
19+
20+
async with async_engine.begin() as conn:
21+
df = await conn.run_sync(_read_sql_query, stmt, index_col=index_col)
22+
return df
23+
24+
25+
async def export_csv(
26+
query: Select,
27+
export_filename: str,
28+
index_col: Optional[str] = None,
29+
):
30+
"""Export CSV"""
31+
32+
df = await read_sql_query(query, index_col=index_col)
33+
if index_col == "wikibase_id":
34+
assert len(set(df.index)) == len(df), "Returned Multiple Rows per Wikibase"
35+
36+
csv = df.to_csv()
37+
del df
38+
39+
headers = {"Content-Disposition": f'attachment; filename="{export_filename}.csv"'}
40+
return Response(csv, headers=headers, media_type="text/csv")

fetch_data/api_data/recent_changes_data/create_recent_changes_observation.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@
2525
)
2626
from fetch_data.utils import counts, get_wikibase_from_database
2727
from logger import logger
28-
from model.database import (
29-
WikibaseModel,
30-
WikibaseRecentChangesObservationModel,
31-
)
28+
from model.database import WikibaseModel, WikibaseRecentChangesObservationModel
3229

3330

3431
async def create_recent_changes_observation(wikibase_id: int) -> bool:
@@ -65,9 +62,7 @@ async def create_recent_changes_observation(wikibase_id: int) -> bool:
6562
)
6663

6764
observation = create_recent_changes(
68-
recent_changes_list_humans,
69-
recent_changes_list_bots,
70-
observation,
65+
recent_changes_list_humans, recent_changes_list_bots, observation
7166
)
7267
observation.returned_data = True
7368
except (

0 commit comments

Comments
 (0)