@@ -4,8 +4,10 @@
 import os
 import json
 import uuid
+import sys
 from concurrent import futures
 from collections import defaultdict
+from functools import partial
 
 from bs4 import BeautifulSoup
 from markdown import markdown
@@ -56,26 +58,29 @@ def extract_urls(discover_path):
     return all_urls
 
 
-def run_workers(work, data, worker_threads=mp.cpu_count()*4):
-    with futures.ThreadPoolExecutor(max_workers=worker_threads) as executor:
+def run_workers(work, data, threads, **kwargs):
+    work_partial = partial(work, **kwargs)
+    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
         future_to_result = {
-            executor.submit(work, arg): arg for arg in data}
+            executor.submit(work_partial, arg): arg
+            for arg in data
+        }
         for future in futures.as_completed(future_to_result):
             yield future.result()
 
 
-def get_url_status(url):
+def get_url_status(url, timeout, retries):
     for local in ('localhost', '127.0.0.1', 'app_server'):
         if url.startswith('http://' + local):
             return (url, 0)
     clean_url = url.strip('?.')
     try:
         with requests.Session() as session:
-            adapter = requests.adapters.HTTPAdapter(max_retries=10)
+            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
             session.mount('http://', adapter)
             session.mount('https://', adapter)
             response = session.get(
-                clean_url, verify=False, timeout=10.0,
+                clean_url, verify=False, timeout=timeout,
                 headers={'User-Agent': URL_BOT_ID})
             return (clean_url, response.status_code)
     except requests.exceptions.Timeout:
@@ -98,22 +103,24 @@ def bad_url(url_status):
     return False
 
 
-
 def parse_args(argv):
     parser = argparse.ArgumentParser(
         description='Check correctness of url links.',
         add_help=True)
     parser.add_argument(
-        'url-timeout',
+        '--url-timeout',
         default=10.0,
-        help='Timeout in seconds to wait for link')
+        dest='timeout',
+        help='Timeout in seconds to wait for url')
     parser.add_argument(
-        'url-retries',
+        '--url-retries',
         default=10,
-        help='Number of link retries')
+        dest='retries',
+        help='Number of url retries')
     parser.add_argument(
-        'num-threads',
+        '--num-threads',
         default=mp.cpu_count()*4,
+        dest='threads',
         help='Number of threads to run with')
     return parser.parse_args(argv)
 
@@ -126,7 +133,9 @@ def main():
     bad_url_status = {}
     url_id = 1
     max_strlen = -1
-    for url_path, url_status in run_workers(get_url_status, all_urls.keys()):
+    for url_path, url_status in run_workers(
+            get_url_status, all_urls.keys(),
+            threads=args.threads, timeout=args.timeout, retries=args.retries):
         output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
         if max_strlen < len(output):
             max_strlen = len(output)
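
For illustration, a minimal sketch of the pattern this commit introduces: run_workers now binds shared keyword arguments once with functools.partial, so every submitted task only receives its per-item positional argument. The fetch stub and sample data below are assumptions for demonstration only, not part of the commit:

from concurrent import futures
from functools import partial


def run_workers(work, data, threads, **kwargs):
    # Bind the shared keyword arguments once; each submitted
    # task then only needs its per-item positional argument.
    work_partial = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_result = {
            executor.submit(work_partial, arg): arg
            for arg in data
        }
        # Yield results as workers finish, not in submission order.
        for future in futures.as_completed(future_to_result):
            yield future.result()


def fetch(url, timeout, retries):
    # Hypothetical worker standing in for get_url_status.
    return (url, timeout, retries)


for result in run_workers(fetch, ['http://example.com'], threads=4,
                          timeout=10.0, retries=3):
    print(result)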