Skip to content

Commit 9706622

Browse files
committed
Merge branch 'master' of github.com:mattmakai/fullstackpython.com
2 parents 9b7e3fd + 20d6b35 commit 9706622

File tree

1 file changed

+64

-23

lines changed

check_urls.py

Lines changed: 64 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
#!/usr/bin/env python
2-
import multiprocessing as mp
32
import os
4-
import json
5-
import uuid
3+
from argparse import ArgumentParser
64
from concurrent import futures
75
from collections import defaultdict
6+
from functools import partial
7+
from json import dumps
8+
from multiprocessing import cpu_count
9+
from sys import argv
10+
from uuid import uuid4
811

9-
from bs4 import BeautifulSoup
10-
from markdown import markdown
1112
import requests
1213
import urllib3
14+
from bs4 import BeautifulSoup
15+
from markdown import markdown
1316

1417

1518
# Ignore security hazard since certs SHOULD be trusted (https)
1619
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
1720

1821
# Avoid rate limiting (tcp)
19-
URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
22+
URL_BOT_ID = f'Bot {str(uuid4())}'
2023

2124

2225
def extract_urls_from_html(content):
@@ -35,7 +38,7 @@ def extract_urls(discover_path):
3538
max_strlen = -1
3639
for root, dirs, files in os.walk(discover_path, topdown=True):
3740
dirs[:] = [d for d in dirs if d not in exclude]
38-
short_root = root.lstrip(discover_path)
41+
short_root = root.replace(discover_path, '')
3942
for file in files:
4043
output = f'Currently checking: file={file}'
4144
file_path = os.path.join(root, file)
@@ -55,34 +58,41 @@ def extract_urls(discover_path):
5558
return all_urls
5659

5760

58-
def run_workers(work, data, threads, **kwargs):
    """Fan *work* out over *data* on a thread pool, yielding results.

    Any extra keyword arguments are bound onto *work* before submission,
    so every task shares the same configuration. Results are yielded in
    completion order, not submission order.
    """
    task = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as pool:
        # Map each future back to its input so as_completed can drain them.
        pending = {pool.submit(task, item): item for item in data}
        for done in futures.as_completed(pending):
            yield done.result()
6470

6571

66-
def get_url_status(url, timeout, retries):
    """Fetch *url* and report its HTTP status as a ``(url, status)`` tuple.

    Local/dev hosts are not fetched and get a synthetic status of 0.
    Network failures map to sentinel codes: 504 on timeout, -301 on a
    redirect loop, -1 when the connection fails outright.
    """
    skip_hosts = ('localhost', '127.0.0.1', 'app_server')
    if any(url.startswith('http://' + host) for host in skip_hosts):
        return (url, 0)
    clean_url = url.strip('?.')
    try:
        with requests.Session() as session:
            # Retry transient failures at the transport layer for both schemes.
            retry_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            for scheme in ('http://', 'https://'):
                session.mount(scheme, retry_adapter)
            response = session.get(
                clean_url, verify=False, timeout=timeout,
                headers={'User-Agent': URL_BOT_ID})
            return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.TooManyRedirects:
        return (clean_url, -301)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
8292

8393

8494
def bad_url(url_status):
85-
if url_status == -1:
95+
if url_status == -301 or url_status == -1:
8696
return True
8797
elif url_status == 401 or url_status == 403:
8898
return False
@@ -93,15 +103,46 @@ def bad_url(url_status):
93103
return False
94104

95105

106+
def parse_args(argv):
    """Build and apply the CLI parser for the url checker.

    Recognised options (all optional):
      -timeout/--url-timeout   float, seconds to wait per url (default 10.0)
      -retries/--url-retries   int, retry attempts per url (default 5)
      -threads/--num-threads   int, worker threads (default cpu_count()*4)
    """
    parser = ArgumentParser(
        description='Check for bad urls in the HTML content.',
        add_help=True)
    # One row per option keeps flag names, dest, type, default and help
    # text side by side; registration order matches the original parser.
    options = (
        ('-timeout', '--url-timeout', 'timeout', float, 10.0,
         'Timeout in seconds to wait for url'),
        ('-retries', '--url-retries', 'retries', int, 5,
         'Number of url retries'),
        ('-threads', '--num-threads', 'threads', int, cpu_count()*4,
         'Number of threads to run with'),
    )
    for short_flag, long_flag, dest, value_type, default, help_text in options:
        parser.add_argument(
            short_flag, long_flag,
            dest=dest, type=value_type,
            default=default, help=help_text)
    return parser.parse_args(argv)
129+
130+
96131
def main():
132+
args = parse_args(argv[1:])
97133
print('Extract urls...')
98-
all_urls = extract_urls(os.getcwd() + os.path.sep + 'content')
134+
all_urls = extract_urls(os.getcwd())
99135
print('\nCheck urls...')
100136
bad_url_status = {}
101137
url_id = 1
102138
max_strlen = -1
103-
for url_path, url_status in run_workers(get_url_status, all_urls.keys()):
104-
output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
139+
for url_path, url_status in run_workers(
140+
get_url_status, all_urls.keys(),
141+
threads=args.threads, timeout=args.timeout, retries=args.retries):
142+
output = (
143+
f'Currently checking: id={url_id} '
144+
f'host={urllib3.util.parse_url(url_path).host}'
145+
)
105146
if max_strlen < len(output):
106147
max_strlen = len(output)
107148
print(output.ljust(max_strlen), end='\r')
@@ -112,8 +153,8 @@ def main():
112153
bad_url: all_urls[bad_url]
113154
for bad_url in bad_url_status
114155
}
115-
status_content = json.dumps(bad_url_status, indent=4)
116-
location_content = json.dumps(bad_url_location, indent=4)
156+
status_content = dumps(bad_url_status, indent=4)
157+
location_content = dumps(bad_url_location, indent=4)
117158
print(f'\nBad url status: {status_content}')
118159
print(f'\nBad url locations: {location_content}')
119160

0 commit comments

Comments
 (0)