forked from MeltanoLabs/tap-github
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping.py
More file actions
173 lines (136 loc) · 5.63 KB
/
scraping.py
File metadata and controls
173 lines (136 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""Utility functions for scraping https://github.com
Inspired by https://github.com/dogsheep/github-to-sqlite/pull/70
"""
from __future__ import annotations
import logging
import re
import time
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse
import requests
if TYPE_CHECKING:
from collections.abc import Iterable
from bs4 import NavigableString, Tag
# Patterns passed to ``soup.find(string=...)`` in ``scrape_metrics`` to locate
# the "Used by" and "Contributors" labels. Each one requires three spaces
# immediately before the label and a single space right after it.
used_by_regex = re.compile(" {3}Used by ")
contributors_regex = re.compile(" {3}Contributors ")
def parse_int(s: str) -> int:
    """Turn a human-formatted count into an int, e.g. '1,808' -> 1808.

    Leading and trailing whitespace is ignored, and every ',' and '+'
    character is dropped before the conversion (so '5,000+' -> 5000).

    Raises:
        ValueError: If what remains is not a valid integer literal.
    """
    kept = "".join(ch for ch in s.strip() if ch not in ",+")
    return int(kept)
def scrape_dependents(
    response: requests.Response, logger: logging.Logger | None = None
) -> Iterable[dict[str, Any]]:
    """Yield the dependents found from an already-fetched "dependents" page.

    The page in ``response`` is only used to discover which listings to
    crawl: if it contains a package toggle (``a.select-menu-item`` links),
    every entry of the toggle is crawled, otherwise the page's own URL is.
    The actual records come from ``_scrape_dependents``.

    Args:
        response: HTTP response holding the HTML of the dependents page.
        logger: Logger for debug output; defaults to the "scraping" logger.

    Yields:
        One dict per dependent repository, as produced by
        ``_scrape_dependents``.
    """
    # Optional dependency:
    from bs4 import BeautifulSoup

    from urllib.parse import urljoin

    logger = logger or logging.getLogger("scraping")
    soup = BeautifulSoup(response.content, "html.parser")

    # Hrefs are resolved against the page they came from. Fall back to the
    # public site if the response URL carries no host.
    base = response.url if urlparse(response.url).hostname else "https://github.com/"

    # Navigate through Package toggle if present
    options = soup.find_all("a", class_="select-menu-item")
    if options:
        hrefs = [option["href"] for option in options]
    else:
        hrefs = [response.url]

    # urljoin leaves absolute URLs untouched and resolves "/owner/repo/..."
    # paths against the host, so neither form ends up with a doubled prefix.
    links = [
        urljoin(base, href if isinstance(href, str) else href[0]) for href in hrefs
    ]
    logger.debug(links)

    for link in links:
        yield from _scrape_dependents(link, logger)
def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[dict[str, Any]]:
    """Yield dependents listed at ``url`` and on every following "Next" page.

    Args:
        url: Address of the first dependents listing page to fetch.
        logger: Logger receiving the visited URLs and scraped rows at debug
            level.

    Yields:
        Dicts with the keys ``name_with_owner``, ``stars`` and ``forks``.

    Raises:
        IndexError: If a page does not contain the same number of repository
            links, star counts and fork counts.
    """
    # Optional dependency:
    from bs4 import BeautifulSoup

    # The context manager closes the session (and its pooled connections) when
    # pagination ends, on error, or when the generator is closed early.
    with requests.Session() as session:
        while url:
            logger.debug(url)
            response = session.get(url)
            soup = BeautifulSoup(response.content, "html.parser")

            repo_names = [
                (
                    a["href"] if not isinstance(a["href"], list) else a["href"][0]
                ).lstrip("/")
                for a in soup.select("a[data-hovercard-type=repository]")
            ]
            # The count is the text node that directly follows each icon.
            stars = [
                parse_int(icon.next_sibling)
                for icon in soup.find_all("svg", {"class": "octicon octicon-star"})
            ]
            forks = [
                parse_int(icon.next_sibling)
                for icon in soup.find_all(
                    "svg", {"class": "octicon octicon-repo-forked"}
                )
            ]

            if not len(repo_names) == len(stars) == len(forks):
                raise IndexError(
                    "Could not find star and fork info. Maybe the GitHub page format has changed?"  # noqa: E501
                )

            repos = [
                {"name_with_owner": name, "stars": star_count, "forks": fork_count}
                for name, star_count, fork_count in zip(
                    repo_names, stars, forks, strict=True
                )
            ]
            logger.debug(repos)
            yield from repos

            # next page?
            pagination = soup.select(".paginate-container")
            next_links = (
                pagination[0].find_all("a", string="Next") if pagination else []
            )
            if not next_links:
                break

            href = next_links[0]["href"]
            url = str(href if not isinstance(href, list) else href[0])
            # Pause between page fetches.
            time.sleep(1)
def parse_counter(tag: Tag | NavigableString | None) -> int:
    """
    Extract a count of [issues|PR|contributors...] from an HTML tag.

    The number is read from the tag's ``title`` attribute.
    For very high numbers, we only get an approximate value as github
    does not provide the actual number.

    Args:
        tag: The counter element, or ``None`` / an empty or bare-newline
            string when no counter was found.

    Returns:
        The parsed count, or 0 when ``tag`` is falsy or equal to ``"\\n"``.

    Raises:
        IndexError: If ``tag`` has no usable ``title`` attribute (including
            when it is a plain text node) or the title is not a number.
    """
    if not tag or tag == "\n":
        return 0
    try:
        title = tag["title"]  # type: ignore[index] # ty:ignore[invalid-argument-type]
        title_string = title if isinstance(title, str) else title[0]
        return parse_int(title_string)
    # TypeError: text nodes cannot be indexed by attribute name. Report that
    # the same way as a missing or malformed title.
    except (KeyError, TypeError, ValueError) as e:
        raise IndexError(
            f"Could not parse counter {tag}. Maybe the GitHub page format has changed?"
        ) from e
def scrape_metrics(
    response: requests.Response, logger: logging.Logger | None = None
) -> Iterable[dict[str, Any]]:
    """Scrape repository counters from the HTML page held in ``response``.

    Args:
        response: HTTP response whose content is the page to parse.
        logger: Logger for debug output; defaults to the "scraping" logger.

    Returns:
        A one-element list holding a dict with the keys ``open_issues``,
        ``open_prs``, ``dependents``, ``contributors`` and ``fetched_at``
        (a timezone-aware UTC datetime). Counters whose element is missing
        from the page are reported as 0.

    Raises:
        IndexError: If a counter element is present but cannot be parsed.
    """
    # Optional dependency:
    from bs4 import BeautifulSoup

    logger = logger or logging.getLogger("scraping")
    soup = BeautifulSoup(response.content, "html.parser")

    try:
        issues = parse_counter(soup.find("span", id="issues-repo-tab-count"))
        prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count"))
    except IndexError as e:
        # These two items should exist. We raise an error if we could not
        # parse them.
        raise IndexError(
            "Could not find issues or prs info. Maybe the GitHub page format has changed?"  # noqa: E501
        ) from e

    # These sections are not shown on every page, and the label text could
    # also appear elsewhere, so each is only trusted inside the expected link.
    dependents = _scrape_linked_counter(soup, used_by_regex, "/network/dependents")
    contributors = _scrape_linked_counter(
        soup, contributors_regex, "/graphs/contributors"
    )

    fetched_at = datetime.now(tz=timezone.utc)
    metrics = [
        {
            "open_issues": issues,
            "open_prs": prs,
            "dependents": dependents,
            "contributors": contributors,
            "fetched_at": fetched_at,
        }
    ]
    logger.debug(metrics)
    return metrics


def _scrape_linked_counter(
    soup: Tag, label_regex: re.Pattern[str], href_suffix: str
) -> int:
    """Return the counter following a label inside a matching link, else 0."""
    label_node = soup.find(string=label_regex)
    link = getattr(label_node, "parent", None)
    if link is None:
        return 0

    # ``"href" in link`` would test the tag's child nodes, not its attributes,
    # so look the attribute up explicitly.
    href = link.get("href")
    if isinstance(href, list):
        href = href[0] if href else None
    if not href or not href.endswith(href_suffix):
        return 0

    return parse_counter(getattr(label_node, "next_element", None))