Skip to content

Commit 20d6b35

Browse files
authored
Merge pull request mattmakai#200 from huangsam/add-argparse
Add support for argparse
2 parents afc8459 + af8c5d5 commit 20d6b35

File tree

1 file changed

+58
-21
lines changed

1 file changed

+58
-21
lines changed

check_urls.py

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
#!/usr/bin/env python
2-
import multiprocessing as mp
32
import os
4-
import json
5-
import uuid
3+
from argparse import ArgumentParser
64
from concurrent import futures
75
from collections import defaultdict
6+
from functools import partial
7+
from json import dumps
8+
from multiprocessing import cpu_count
9+
from sys import argv
10+
from uuid import uuid4
811

9-
from bs4 import BeautifulSoup
10-
from markdown import markdown
1112
import requests
1213
import urllib3
14+
from bs4 import BeautifulSoup
15+
from markdown import markdown
1316

1417

1518
# Ignore security hazard since certs SHOULD be trusted (https)
1619
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
1720

1821
# Avoid rate limiting (tcp)
19-
URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
22+
URL_BOT_ID = f'Bot {str(uuid4())}'
2023

2124

2225
def extract_urls_from_html(content):
@@ -35,7 +38,7 @@ def extract_urls(discover_path):
3538
max_strlen = -1
3639
for root, dirs, files in os.walk(discover_path, topdown=True):
3740
dirs[:] = [d for d in dirs if d not in exclude]
38-
short_root = root.lstrip(discover_path)
41+
short_root = root.replace(discover_path, '')
3942
for file in files:
4043
output = f'Currently checking: file={file}'
4144
file_path = os.path.join(root, file)
@@ -55,38 +58,41 @@ def extract_urls(discover_path):
5558
return all_urls
5659

5760

def run_workers(work, data, threads, **kwargs):
    """Fan *work* out over a thread pool and yield results as they finish.

    Args:
        work: callable invoked once per item in *data*.
        data: iterable of single arguments handed to *work*.
        threads: number of worker threads in the pool.
        **kwargs: extra keyword arguments bound onto every *work* call.

    Yields:
        Each call's return value, in completion order (not input order).
    """
    job = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as pool:
        pending = [pool.submit(job, item) for item in data]
        for done in futures.as_completed(pending):
            yield done.result()
6470

6571

def get_url_status(url, timeout, retries):
    """Fetch *url* and report its HTTP status code.

    Local/dev hosts are skipped (status 0). Network failures map to
    sentinel codes: timeout -> 504, redirect loop -> -301, connection
    failure -> -1.

    Args:
        url: the url to probe.
        timeout: per-request timeout in seconds.
        retries: max retry count mounted on the HTTP adapter.

    Returns:
        Tuple of (cleaned url, status code).
    """
    # Never hit servers that only exist in a local/dev environment.
    for host in ('localhost', '127.0.0.1', 'app_server'):
        if url.startswith('http://' + host):
            return (url, 0)
    # Drop stray trailing punctuation picked up during extraction.
    clean_url = url.strip('?.')
    try:
        with requests.Session() as session:
            retry_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            for scheme in ('http://', 'https://'):
                session.mount(scheme, retry_adapter)
            # verify=False is deliberate here; warnings are silenced above.
            response = session.get(
                clean_url, verify=False, timeout=timeout,
                headers={'User-Agent': URL_BOT_ID})
            return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.TooManyRedirects:
        return (clean_url, -301)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
8692

8793

8894
def bad_url(url_status):
89-
if url_status == -1:
95+
if url_status == -301 or url_status == -1:
9096
return True
9197
elif url_status == 401 or url_status == 403:
9298
return False
@@ -97,15 +103,46 @@ def bad_url(url_status):
97103
return False
98104

99105

def parse_args(argv):
    """Build the command-line parser for the url checker and apply it.

    Args:
        argv: argument list to parse (typically ``sys.argv[1:]``).

    Returns:
        Parsed namespace with ``timeout``, ``retries`` and ``threads``.
    """
    parser = ArgumentParser(
        description='Check for bad urls in the HTML content.',
        add_help=True)
    # (short flag, long flag, type, default, dest, help)
    option_table = (
        ('-timeout', '--url-timeout', float, 10.0, 'timeout',
         'Timeout in seconds to wait for url'),
        ('-retries', '--url-retries', int, 5, 'retries',
         'Number of url retries'),
        ('-threads', '--num-threads', int, cpu_count() * 4, 'threads',
         'Number of threads to run with'),
    )
    for short_flag, long_flag, cast, fallback, name, text in option_table:
        parser.add_argument(
            short_flag, long_flag,
            type=cast, default=fallback, dest=name, help=text)
    return parser.parse_args(argv)
129+
130+
100131
def main():
132+
args = parse_args(argv[1:])
101133
print('Extract urls...')
102-
all_urls = extract_urls(os.getcwd() + os.path.sep + 'content')
134+
all_urls = extract_urls(os.getcwd())
103135
print('\nCheck urls...')
104136
bad_url_status = {}
105137
url_id = 1
106138
max_strlen = -1
107-
for url_path, url_status in run_workers(get_url_status, all_urls.keys()):
108-
output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
139+
for url_path, url_status in run_workers(
140+
get_url_status, all_urls.keys(),
141+
threads=args.threads, timeout=args.timeout, retries=args.retries):
142+
output = (
143+
f'Currently checking: id={url_id} '
144+
f'host={urllib3.util.parse_url(url_path).host}'
145+
)
109146
if max_strlen < len(output):
110147
max_strlen = len(output)
111148
print(output.ljust(max_strlen), end='\r')
@@ -116,8 +153,8 @@ def main():
116153
bad_url: all_urls[bad_url]
117154
for bad_url in bad_url_status
118155
}
119-
status_content = json.dumps(bad_url_status, indent=4)
120-
location_content = json.dumps(bad_url_location, indent=4)
156+
status_content = dumps(bad_url_status, indent=4)
157+
location_content = dumps(bad_url_location, indent=4)
121158
print(f'\nBad url status: {status_content}')
122159
print(f'\nBad url locations: {location_content}')
123160

0 commit comments

Comments
 (0)