Skip to content

Commit 646e1d6

Browse files
authored
Merge pull request mattmakai#198 from huangsam/add-bad-locations
Add support for bad URL locations
2 parents f81cee5 + db5feb1 commit 646e1d6

File tree

1 file changed

+26
-11
lines changed

1 file changed

+26
-11
lines changed

check_urls.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
#!/usr/bin/env python
2-
from concurrent import futures
32
import multiprocessing as mp
43
import os
54
import json
65
import uuid
6+
from concurrent import futures
7+
from collections import defaultdict
78

89
from bs4 import BeautifulSoup
910
from markdown import markdown
@@ -18,20 +19,23 @@
1819
URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
1920

2021

def extract_urls_from_html(content):
    """Return the set of absolute link targets found in *content*.

    Parses the given HTML (string or file-like object) and collects the
    ``href`` of every anchor tag whose target starts with ``http``, so
    only absolute http/https links are kept.
    """
    soup = BeautifulSoup(content, 'html.parser')
    return {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('http')
    }
2730

2831

2932
def extract_urls(discover_path):
3033
exclude = ['.git', '.vscode']
31-
all_urls = set()
34+
all_urls = defaultdict(list)
3235
max_strlen = -1
3336
for root, dirs, files in os.walk(discover_path, topdown=True):
3437
dirs[:] = [d for d in dirs if d not in exclude]
38+
short_root = root.lstrip(discover_path)
3539
for file in files:
3640
output = f'Currently checking: file={file}'
3741
file_path = os.path.join(root, file)
@@ -40,10 +44,14 @@ def extract_urls(discover_path):
4044
print(output.ljust(max_strlen), end='\r')
4145
if file_path.endswith('.html'):
4246
content = open(file_path)
43-
extract_urls_from_html(content, all_urls)
47+
extract_urls_from_html(content)
4448
elif file_path.endswith('.markdown'):
4549
content = markdown(open(file_path).read())
46-
extract_urls_from_html(content, all_urls)
50+
else:
51+
continue
52+
html_urls = extract_urls_from_html(content)
53+
for url in html_urls:
54+
all_urls[url].append(os.path.join(short_root, file))
4755
return all_urls
4856

4957

@@ -87,20 +95,27 @@ def bad_url(url_status):
8795

8896
def main():
    """Check every url referenced under the site's content directory.

    Extracts all urls (with the files they appear in), probes each url
    concurrently, and prints two JSON reports: the HTTP status of each
    bad url and the file locations where each bad url was found.
    """
    print('Extract urls...')
    # os.path.join is the portable way to build the path; manual
    # concatenation with os.path.sep is fragile and non-idiomatic.
    all_urls = extract_urls(os.path.join(os.getcwd(), 'content'))
    print('\nCheck urls...')
    bad_url_status = {}
    url_id = 1
    max_strlen = -1
    for url_path, url_status in run_workers(get_url_status, all_urls.keys()):
        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
        if max_strlen < len(output):
            max_strlen = len(output)
        # Pad to the longest line printed so far so that '\r' fully
        # overwrites the previous progress line in the terminal.
        print(output.ljust(max_strlen), end='\r')
        if bad_url(url_status) is True:
            bad_url_status[url_path] = url_status
        url_id += 1
    # Map each bad url back to the source files it was extracted from.
    bad_url_location = {
        bad_url: all_urls[bad_url]
        for bad_url in bad_url_status
    }
    status_content = json.dumps(bad_url_status, indent=4)
    location_content = json.dumps(bad_url_location, indent=4)
    print(f'\nBad url status: {status_content}')
    print(f'\nBad url locations: {location_content}')
104119

105120

106121
if __name__ == '__main__':

0 commit comments

Comments
 (0)