Skip to content

Commit c5e5eac

Browse files
committed
Embed argparse params into program
1 parent 080319c commit c5e5eac

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

check_urls.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
import os
55
import json
66
import uuid
7+
import sys
78
from concurrent import futures
89
from collections import defaultdict
10+
from functools import partial
911

1012
from bs4 import BeautifulSoup
1113
from markdown import markdown
@@ -56,26 +58,29 @@ def extract_urls(discover_path):
5658
return all_urls
5759

5860

59-
def run_workers(work, data, threads, **kwargs):
    """Run `work` over every item in `data` on a thread pool.

    Extra keyword arguments are bound onto `work` via functools.partial
    before submission. Results are yielded lazily, in completion order
    (not submission order).

    Args:
        work: callable invoked as work(item, **kwargs) for each item.
        data: iterable of items to process.
        threads: number of worker threads for the pool.
        **kwargs: fixed keyword arguments forwarded to every call.

    Yields:
        The return value of each completed call.
    """
    bound_work = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as pool:
        # as_completed only needs the futures themselves; the submitted
        # argument is never consulted afterwards, so a plain list suffices.
        pending = [pool.submit(bound_work, item) for item in data]
        for done in futures.as_completed(pending):
            yield done.result()
6570

6671

67-
def get_url_status(url):
72+
def get_url_status(url, timeout, retries):
6873
for local in ('localhost', '127.0.0.1', 'app_server'):
6974
if url.startswith('http://' + local):
7075
return (url, 0)
7176
clean_url = url.strip('?.')
7277
try:
7378
with requests.Session() as session:
74-
adapter = requests.adapters.HTTPAdapter(max_retries=10)
79+
adapter = requests.adapters.HTTPAdapter(max_retries=retries)
7580
session.mount('http://', adapter)
7681
session.mount('https://', adapter)
7782
response = session.get(
78-
clean_url, verify=False, timeout=10.0,
83+
clean_url, verify=False, timeout=timeout,
7984
headers={'User-Agent': URL_BOT_ID})
8085
return (clean_url, response.status_code)
8186
except requests.exceptions.Timeout:
@@ -98,22 +103,24 @@ def bad_url(url_status):
98103
return False
99104

100105

101-
102106
def parse_args(argv):
    """Parse command-line options for the url checker.

    Args:
        argv: list of argument strings (e.g. sys.argv[1:]).

    Returns:
        argparse.Namespace with attributes `timeout` (float),
        `retries` (int) and `threads` (int).
    """
    parser = argparse.ArgumentParser(
        description='Check correctness of url links.',
        add_help=True)
    parser.add_argument(
        '--url-timeout',
        default=10.0,
        # Without type=, CLI-supplied values stay strings and break
        # requests' timeout handling downstream.
        type=float,
        dest='timeout',
        help='Timeout in seconds to wait for url')
    parser.add_argument(
        '--url-retries',
        default=10,
        # HTTPAdapter(max_retries=...) expects an int, not a string.
        type=int,
        dest='retries',
        help='Number of url retries')
    parser.add_argument(
        '--num-threads',
        default=mp.cpu_count()*4,
        # ThreadPoolExecutor(max_workers=...) expects an int.
        type=int,
        dest='threads',
        help='Number of threads to run with')
    return parser.parse_args(argv)
119126

@@ -126,7 +133,9 @@ def main():
126133
bad_url_status = {}
127134
url_id = 1
128135
max_strlen = -1
129-
for url_path, url_status in run_workers(get_url_status, all_urls.keys()):
136+
for url_path, url_status in run_workers(
137+
get_url_status, all_urls.keys(),
138+
threads=args.threads, timeout=args.timeout, retries=args.retries):
130139
output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
131140
if max_strlen < len(output):
132141
max_strlen = len(output)

0 commit comments

Comments
 (0)