Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
"""Main Application"""

from contextlib import asynccontextmanager
from typing import Optional
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
from strawberry.fastapi import GraphQLRouter

from export_csv import export_metric_csv
from model.strawberry import schema
from resolvers.authentication import authenticate_token
from schedule import scheduler


Expand Down Expand Up @@ -41,3 +45,18 @@ def read_root():


app.include_router(GraphQLRouter(schema=schema), prefix="/graphql")


@app.get("/csv/metrics")
async def metric_csv(authorization: Optional[str] = None):
"""Quantity CSV"""

if authorization is None:
return PlainTextResponse("Authorization Missing", 403)

try:
authenticate_token(authorization)
except AssertionError:
return PlainTextResponse("Authorization Failed", 403)

return await export_metric_csv()
77 changes: 77 additions & 0 deletions docs/metrics_csv.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Metrics CSV

1 row per Wikibase, with the following columns:

- `wikibase_id`
- internal identifier
- automatically assigned upon adding to database
- `wikibase_type`
- manually set
- CSV excludes TEST wikibases
- `base_url`
- manually set

From latest successful Quantity observation:

- `quantity_observation_date`
- date of quantity observation
- automatically set when Scraper attempts to fetch data
- `total_items`
- SPARQL query
- Number of Wikibase items in database
- `total_lexemes`
- SPARQL query
- Number of Wikibase lexemes in database
- `total_properties`
- SPARQL query
- Number of properties in database
- `total_triples`
- SPARQL query
- Number of triples in database
- `total_ei_properties`
- SPARQL query
- Number of external identifier properties in database
- `total_ei_statements`
- SPARQL query
- Number of external identifier statements in database
- `total_url_properties`
  - SPARQL query
  - Number of URL properties in database
- `total_url_statements`
  - SPARQL query
  - Number of URL statements in database

From latest successful Recent Changes observation:

- `recent_changes_observation_date`
- date of recent changes observation
- automatically set when Scraper attempts to fetch data
- `first_change_date`
- oldest record in recent changes list (limited to 30 days before observation date)
- pulled from action api
- `last_change_date`
- newest record in recent changes list (limited as above)
- pulled from action api
- `human_change_count`
- number of records in recent changes list (limited as above), excluding bot contributions
- pulled from action api
- `human_change_user_count`
- number of distinct users with at least one record in recent changes list (limited as above), excluding bot contributions
- pulled from action api
- `bot_change_count`
- number of records in recent changes list (limited as above), limited to bot contributions
- pulled from action api
- `bot_change_user_count`
- number of distinct users with at least one record in recent changes list (limited as above), limited to bot contributions
- pulled from action api

From latest successful Software Version observation:

- `software_version_observation_date`
- date of software version observation
- automatically set when Scraper attempts to fetch data
- `software_name`
- ONLY MediaWiki
- `version`
- MediaWiki version
- Scraped from Special:Version page
3 changes: 3 additions & 0 deletions export_csv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Export CSVs"""

from export_csv.metric import export_metric_csv
215 changes: 215 additions & 0 deletions export_csv/metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""Quantity CSV"""

from fastapi.responses import StreamingResponse
from sqlalchemy import Select, and_, func, or_, select

from export_csv.util import export_csv
from model.database import (
WikibaseModel,
WikibaseQuantityObservationModel,
WikibaseRecentChangesObservationModel,
WikibaseSoftwareVersionModel,
WikibaseSoftwareVersionObservationModel,
WikibaseSoftwareModel,
WikibaseURLModel,
)
from model.enum import WikibaseType, WikibaseURLType


async def export_metric_csv() -> StreamingResponse:
    """CSV with Requested Metrics

    Builds the combined metrics query and streams the result out as
    ``metrics.csv``, one row per wikibase (indexed by ``wikibase_id``).
    """

    return await export_csv(
        query=get_metrics_query(),
        export_filename="metrics",
        index_col="wikibase_id",
    )


def get_metrics_query() -> Select:
    """
    Filter Out Offline and Test Wikis

    Pull Quantity, Recent Changes, and Software Version Metrics

    Returns a single SELECT yielding one row per reported wikibase:
    identity columns plus the latest *successful* quantity,
    recent-changes, and MediaWiki software-version observations.
    The observation CTEs are LEFT OUTER joined, so a wikibase lacking
    a given observation type still appears with NULLs in those columns.
    """

    # Wikibases to report: must be `checked`; TEST-typed wikis are
    # excluded, while untyped wikis (wikibase_type IS NULL) are kept —
    # hence the explicit `== None` branch of the OR (a plain
    # `!= TEST` alone would drop NULL-typed rows in SQL).
    filtered_wikibase_subquery = (
        select(WikibaseModel)
        .where(
            and_(
                WikibaseModel.checked,
                or_(
                    # pylint: disable-next=singleton-comparison
                    WikibaseModel.wikibase_type == None,
                    and_(
                        # WikibaseModel.wikibase_type != WikibaseType.CLOUD,
                        WikibaseModel.wikibase_type
                        != WikibaseType.TEST,
                    ),
                ),
            )
        )
        .cte(name="filtered_wikibases")
    )

    # Rank each wikibase's *successful* quantity observations, newest
    # first (ties broken by id); rank 1 marks the latest success.
    quantity_rank_subquery = (
        select(
            WikibaseQuantityObservationModel.id,
            # pylint: disable-next=not-callable
            func.rank()
            .over(
                partition_by=WikibaseQuantityObservationModel.wikibase_id,
                order_by=[
                    WikibaseQuantityObservationModel.observation_date.desc(),
                    WikibaseQuantityObservationModel.id,
                ],
            )
            .label("rank"),
        )
        .where((WikibaseQuantityObservationModel.returned_data))
        .subquery()
    )
    # Latest successful quantity observation per wikibase.
    most_recent_successful_quantity_obs = (
        select(WikibaseQuantityObservationModel)
        .join(
            quantity_rank_subquery,
            onclause=and_(
                WikibaseQuantityObservationModel.id == quantity_rank_subquery.c.id,
                quantity_rank_subquery.c.rank == 1,
            ),
        )
        .cte(name="filtered_quantity_observations")
    )

    # Same latest-successful pattern for recent-changes observations.
    rc_rank_subquery = (
        select(
            WikibaseRecentChangesObservationModel.id,
            # pylint: disable-next=not-callable
            func.rank()
            .over(
                partition_by=WikibaseRecentChangesObservationModel.wikibase_id,
                order_by=[
                    WikibaseRecentChangesObservationModel.observation_date.desc(),
                    WikibaseRecentChangesObservationModel.id,
                ],
            )
            .label("rank"),
        )
        .where((WikibaseRecentChangesObservationModel.returned_data))
        .subquery()
    )
    most_recent_successful_rc_obs = (
        select(WikibaseRecentChangesObservationModel)
        .join(
            rc_rank_subquery,
            onclause=and_(
                WikibaseRecentChangesObservationModel.id == rc_rank_subquery.c.id,
                rc_rank_subquery.c.rank == 1,
            ),
        )
        .cte(name="filtered_recent_changes_observations")
    )

    # Same pattern for software-version observations, additionally
    # joined through version/software tables and narrowed to the
    # "MediaWiki" software entry only.
    sv_rank_subquery = (
        select(
            WikibaseSoftwareVersionObservationModel.id,
            # pylint: disable-next=not-callable
            func.rank()
            .over(
                partition_by=WikibaseSoftwareVersionObservationModel.wikibase_id,
                order_by=[
                    WikibaseSoftwareVersionObservationModel.observation_date.desc(),
                    WikibaseSoftwareVersionObservationModel.id,
                ],
            )
            .label("rank"),
        )
        .where((WikibaseSoftwareVersionObservationModel.returned_data))
        .subquery()
    )
    most_recent_successful_sv_obs = (
        select(
            WikibaseSoftwareVersionObservationModel.wikibase_id,
            WikibaseSoftwareVersionObservationModel.observation_date,
            WikibaseSoftwareVersionModel.version,
            WikibaseSoftwareModel.software_name,
        )
        .select_from(WikibaseSoftwareVersionObservationModel)
        .join(WikibaseSoftwareVersionModel)
        .join(WikibaseSoftwareModel)
        .join(
            sv_rank_subquery,
            onclause=and_(
                WikibaseSoftwareVersionObservationModel.id == sv_rank_subquery.c.id,
                sv_rank_subquery.c.rank == 1,
            ),
        )
        .where(WikibaseSoftwareModel.software_name == "MediaWiki")
        .cte(name="filtered_software_version_observations")
    )

    # Final projection: one row per wikibase. The BASE_URL join is an
    # INNER join, so a wikibase without a base-URL record is dropped
    # entirely; the three observation joins are LEFT OUTER.
    query = (
        select(
            filtered_wikibase_subquery.c.id.label("wikibase_id"),
            # NOTE(review): `.c` keys are the underlying table column
            # names, not the mapped attribute names — this assumes the
            # column behind `WikibaseModel.wikibase_type` is literally
            # named `wb_type`; confirm against the model definition.
            filtered_wikibase_subquery.c.wb_type.label("wikibase_type"),
            WikibaseURLModel.url.label("base_url"),
            # NOTE(review): likewise assumes the observation-date
            # column is named `date` on the quantity/rc tables (the
            # attribute is `observation_date`) — confirm.
            most_recent_successful_quantity_obs.c.date.label(
                "quantity_observation_date"
            ),
            most_recent_successful_quantity_obs.c.total_items,
            most_recent_successful_quantity_obs.c.total_lexemes,
            most_recent_successful_quantity_obs.c.total_properties,
            most_recent_successful_quantity_obs.c.total_triples,
            most_recent_successful_quantity_obs.c.total_external_identifier_properties.label(
                "total_ei_properties"
            ),
            most_recent_successful_quantity_obs.c.total_external_identifier_statements.label(
                "total_ei_statements"
            ),
            most_recent_successful_quantity_obs.c.total_url_properties,
            most_recent_successful_quantity_obs.c.total_url_statements,
            most_recent_successful_rc_obs.c.date.label(
                "recent_changes_observation_date"
            ),
            most_recent_successful_rc_obs.c.first_change_date,
            most_recent_successful_rc_obs.c.last_change_date,
            most_recent_successful_rc_obs.c.human_change_count,
            most_recent_successful_rc_obs.c.human_change_user_count,
            most_recent_successful_rc_obs.c.bot_change_count,
            most_recent_successful_rc_obs.c.bot_change_user_count,
            most_recent_successful_sv_obs.c.observation_date.label(
                "software_version_observation_date"
            ),
            most_recent_successful_sv_obs.c.software_name,
            most_recent_successful_sv_obs.c.version,
        )
        .join(
            WikibaseURLModel,
            onclause=and_(
                filtered_wikibase_subquery.c.id == WikibaseURLModel.wikibase_id,
                WikibaseURLModel.url_type == WikibaseURLType.BASE_URL,
            ),
        )
        .join(
            most_recent_successful_quantity_obs,
            onclause=filtered_wikibase_subquery.c.id
            == most_recent_successful_quantity_obs.c.wikibase_id,
            isouter=True,
        )
        .join(
            most_recent_successful_rc_obs,
            onclause=filtered_wikibase_subquery.c.id
            == most_recent_successful_rc_obs.c.wikibase_id,
            isouter=True,
        )
        .join(
            most_recent_successful_sv_obs,
            onclause=filtered_wikibase_subquery.c.id
            == most_recent_successful_sv_obs.c.wikibase_id,
            isouter=True,
        )
    )

    return query
40 changes: 40 additions & 0 deletions export_csv/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Utilities"""

from typing import Optional
from fastapi.responses import Response
import pandas
from sqlalchemy import Connection, Select

from data.database_connection import async_engine


def _read_sql_query(con: Connection, stmt: Select, index_col: Optional[str] = None):
    """Synchronous shim: run *stmt* on *con* and return it as a DataFrame."""
    frame = pandas.read_sql_query(stmt, con, index_col=index_col)
    return frame


async def read_sql_query(
    stmt: Select, index_col: Optional[str] = None
) -> pandas.DataFrame:
    """Read SQL to DataFrame

    pandas is synchronous, so the actual read is delegated to the
    sync shim via ``run_sync`` on the async engine's connection.
    """

    async with async_engine.begin() as conn:
        return await conn.run_sync(_read_sql_query, stmt, index_col=index_col)


async def export_csv(
    query: Select,
    export_filename: str,
    index_col: Optional[str] = None,
):
    """Export CSV

    Run *query*, render the result as CSV, and return it as a
    download response named ``<export_filename>.csv``.

    :param query: SELECT statement to execute.
    :param export_filename: filename (without extension) offered to the client.
    :param index_col: column to use as the DataFrame index; when it is
        ``"wikibase_id"`` the result is additionally checked for
        one-row-per-wikibase uniqueness.
    :raises AssertionError: if the wikibase-indexed result contains
        multiple rows for the same wikibase.
    """

    df = await read_sql_query(query, index_col=index_col)
    # Raise explicitly rather than via the `assert` statement: asserts
    # are stripped under `python -O`, which would silently disable this
    # data-integrity guard. `Index.is_unique` replaces the set/len
    # comparison with pandas' built-in uniqueness check.
    if index_col == "wikibase_id" and not df.index.is_unique:
        raise AssertionError("Returned Multiple Rows per Wikibase")

    csv = df.to_csv()
    del df  # release the frame before building the response

    headers = {"Content-Disposition": f'attachment; filename="{export_filename}.csv"'}
    return Response(csv, headers=headers, media_type="text/csv")
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
)
from fetch_data.utils import get_wikibase_from_database
from logger import logger
from model.database import (
WikibaseModel,
WikibaseRecentChangesObservationModel,
)
from model.database import WikibaseModel, WikibaseRecentChangesObservationModel


async def create_recent_changes_observation(wikibase_id: int) -> bool:
Expand Down Expand Up @@ -65,9 +62,7 @@ async def create_recent_changes_observation(wikibase_id: int) -> bool:
)

observation = create_recent_changes(
recent_changes_list_humans,
recent_changes_list_bots,
observation,
recent_changes_list_humans, recent_changes_list_bots, observation
)
observation.returned_data = True
except (
Expand Down
Loading