11#!/usr/bin/env python
2- import multiprocessing as mp
32import os
4- import json
5- import uuid
3+ from argparse import ArgumentParser
64from concurrent import futures
75from collections import defaultdict
6+ from functools import partial
7+ from json import dumps
8+ from multiprocessing import cpu_count
9+ from sys import argv
10+ from uuid import uuid4
811
9- from bs4 import BeautifulSoup
10- from markdown import markdown
1112import requests
1213import urllib3
14+ from bs4 import BeautifulSoup
15+ from markdown import markdown
1316
1417
1518# Ignore security hazard since certs SHOULD be trusted (https)
1619urllib3 .disable_warnings (urllib3 .exceptions .InsecureRequestWarning )
1720
1821# Avoid rate limiting (tcp)
19- URL_BOT_ID = f'Bot { str (uuid . uuid4 ())} '
22+ URL_BOT_ID = f'Bot { str (uuid4 ())} '
2023
2124
2225def extract_urls_from_html (content ):
@@ -35,7 +38,7 @@ def extract_urls(discover_path):
3538 max_strlen = - 1
3639 for root , dirs , files in os .walk (discover_path , topdown = True ):
3740 dirs [:] = [d for d in dirs if d not in exclude ]
38- short_root = root .lstrip (discover_path )
41+ short_root = root .replace (discover_path , '' )
3942 for file in files :
4043 output = f'Currently checking: file={ file } '
4144 file_path = os .path .join (root , file )
@@ -55,38 +58,41 @@ def extract_urls(discover_path):
5558 return all_urls
5659
5760
def run_workers(work, data, threads, **kwargs):
    """Run ``work`` over every item of ``data`` in a thread pool.

    Args:
        work: Callable invoked as ``work(item, **kwargs)``.
        data: Iterable of items to process, one per task.
        threads: Number of worker threads in the pool.
        **kwargs: Fixed keyword arguments forwarded to every ``work`` call.

    Yields:
        Results of ``work`` in completion order (not submission order).
    """
    work_partial = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # The previous version kept a futures->arg dict, but the arg values
        # were never read back; a plain list of futures is sufficient.
        pending = [executor.submit(work_partial, arg) for arg in data]
        for future in futures.as_completed(pending):
            yield future.result()
6470
6571
def get_url_status(url, timeout, retries):
    """Fetch ``url`` and report its HTTP status.

    Args:
        url: Url to check.
        timeout: Seconds to wait for the server before giving up.
        retries: Max connection retries mounted on the HTTP adapter.

    Returns:
        Tuple ``(url, status)`` where ``status`` is:
        *    0 -- local/dev host, deliberately skipped (url returned as-is);
        *  504 -- request timed out;
        * -301 -- redirect loop (``TooManyRedirects``);
        *   -1 -- connection failure;
        * otherwise the HTTP status code of the response.
    """
    # Local development hosts are not reachable from the checker; skip them.
    for local in ('localhost', '127.0.0.1', 'app_server'):
        if url.startswith('http://' + local):
            return (url, 0)
    # Drop trailing '?' / '.' punctuation picked up by the url extractor.
    clean_url = url.strip('?.')
    try:
        with requests.Session() as session:
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # verify=False: certs are assumed trusted (see module header note).
            response = session.get(
                clean_url, verify=False, timeout=timeout,
                headers={'User-Agent': URL_BOT_ID})
            return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    # TooManyRedirects subclasses are checked before the broader
    # ConnectionError so redirect loops keep their distinct -301 code.
    except requests.exceptions.TooManyRedirects:
        return (clean_url, -301)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
8692
8793
8894def bad_url (url_status ):
89- if url_status == - 1 :
95+ if url_status == - 301 or url_status == - 1 :
9096 return True
9197 elif url_status == 401 or url_status == 403 :
9298 return False
@@ -97,15 +103,46 @@ def bad_url(url_status):
97103 return False
98104
99105
def parse_args(argv):
    """Parse the checker's command line options.

    Args:
        argv: Argument list excluding the program name (``sys.argv[1:]``).

    Returns:
        ``argparse.Namespace`` with ``timeout`` (float), ``retries`` (int)
        and ``threads`` (int) attributes.
    """
    parser = ArgumentParser(
        description='Check for bad urls in the HTML content.',
        add_help=True)
    parser.add_argument(
        '-timeout', '--url-timeout',
        default=10.0,
        type=float,
        dest='timeout',
        help='Timeout in seconds to wait for url')
    parser.add_argument(
        '-retries', '--url-retries',
        default=5,
        type=int,
        dest='retries',
        help='Number of url retries')
    parser.add_argument(
        '-threads', '--num-threads',
        # Workers are I/O-bound, so oversubscribe the cores by default.
        default=cpu_count() * 4,
        type=int,
        dest='threads',
        help='Number of threads to run with')
    return parser.parse_args(argv)
129+
130+
100131def main ():
132+ args = parse_args (argv [1 :])
101133 print ('Extract urls...' )
102- all_urls = extract_urls (os .getcwd () + os . path . sep + 'content' )
134+ all_urls = extract_urls (os .getcwd ())
103135 print ('\n Check urls...' )
104136 bad_url_status = {}
105137 url_id = 1
106138 max_strlen = - 1
107- for url_path , url_status in run_workers (get_url_status , all_urls .keys ()):
108- output = f'Currently checking: id={ url_id } host={ urllib3 .util .parse_url (url_path ).host } '
139+ for url_path , url_status in run_workers (
140+ get_url_status , all_urls .keys (),
141+ threads = args .threads , timeout = args .timeout , retries = args .retries ):
142+ output = (
143+ f'Currently checking: id={ url_id } '
144+ f'host={ urllib3 .util .parse_url (url_path ).host } '
145+ )
109146 if max_strlen < len (output ):
110147 max_strlen = len (output )
111148 print (output .ljust (max_strlen ), end = '\r ' )
@@ -116,8 +153,8 @@ def main():
116153 bad_url : all_urls [bad_url ]
117154 for bad_url in bad_url_status
118155 }
119- status_content = json . dumps (bad_url_status , indent = 4 )
120- location_content = json . dumps (bad_url_location , indent = 4 )
156+ status_content = dumps (bad_url_status , indent = 4 )
157+ location_content = dumps (bad_url_location , indent = 4 )
121158 print (f'\n Bad url status: { status_content } ' )
122159 print (f'\n Bad url locations: { location_content } ' )
123160
0 commit comments