11#!/usr/bin/env python
2- import multiprocessing as mp
32import os
4- import json
5- import uuid
3+ from argparse import ArgumentParser
64from concurrent import futures
75from collections import defaultdict
6+ from functools import partial
7+ from json import dumps
8+ from multiprocessing import cpu_count
9+ from sys import argv
10+ from uuid import uuid4
811
9- from bs4 import BeautifulSoup
10- from markdown import markdown
1112import requests
1213import urllib3
14+ from bs4 import BeautifulSoup
15+ from markdown import markdown
1316
1417
# Certificates are expected to be trusted (https), so silence the
# InsecureRequestWarning noise that unverified requests would emit.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# A unique per-run User-Agent helps avoid rate limiting (tcp).
URL_BOT_ID = f'Bot {str(uuid4())}'
2023
2124
2225def extract_urls_from_html (content ):
@@ -35,7 +38,7 @@ def extract_urls(discover_path):
3538 max_strlen = - 1
3639 for root , dirs , files in os .walk (discover_path , topdown = True ):
3740 dirs [:] = [d for d in dirs if d not in exclude ]
38- short_root = root .lstrip (discover_path )
41+ short_root = root .replace (discover_path , '' )
3942 for file in files :
4043 output = f'Currently checking: file={ file } '
4144 file_path = os .path .join (root , file )
@@ -55,34 +58,41 @@ def extract_urls(discover_path):
5558 return all_urls
5659
5760
def run_workers(work, data, threads, **kwargs):
    """Fan *work* out over *data* using a thread pool.

    Args:
        work: Callable invoked once per item of *data*; any extra
            keyword arguments are forwarded to every invocation.
        data: Iterable of single arguments, one per task.
        threads: Maximum number of worker threads.

    Yields:
        Each task's result as it finishes (completion order, not
        submission order).
    """
    task = partial(work, **kwargs)
    with futures.ThreadPoolExecutor(max_workers=threads) as pool:
        # Submit everything up front; as_completed hands back futures
        # in whatever order they finish.
        pending = [pool.submit(task, item) for item in data]
        for done in futures.as_completed(pending):
            yield done.result()
6470
6571
def get_url_status(url, timeout, retries):
    """Fetch *url* and report its HTTP status code.

    Local/internal hosts are never fetched and report status 0.
    Transport failures map to sentinel codes: 504 for timeouts,
    -301 for redirect loops, -1 for connection errors.

    Args:
        url: URL to check.
        timeout: Seconds to wait for a response.
        retries: Retry count handed to the HTTP adapter.

    Returns:
        Tuple of ``(url, status_code)``.
    """
    internal = ('http://localhost', 'http://127.0.0.1', 'http://app_server')
    if url.startswith(internal):
        return (url, 0)
    clean_url = url.strip('?.')
    try:
        with requests.Session() as session:
            # Mount a retrying adapter for both schemes so transient
            # failures get another chance before we flag the url.
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(
                clean_url, verify=False, timeout=timeout,
                headers={'User-Agent': URL_BOT_ID})
            return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.TooManyRedirects:
        return (clean_url, -301)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
8292
8393
8494def bad_url (url_status ):
85- if url_status == - 1 :
95+ if url_status == - 301 or url_status == - 1 :
8696 return True
8797 elif url_status == 401 or url_status == 403 :
8898 return False
@@ -93,15 +103,46 @@ def bad_url(url_status):
93103 return False
94104
95105
def parse_args(argv):
    """Parse command-line options for the url checker.

    Args:
        argv: Argument list, excluding the program name.

    Returns:
        Namespace with ``timeout`` (float), ``retries`` (int) and
        ``threads`` (int) attributes.
    """
    parser = ArgumentParser(
        description='Check for bad urls in the HTML content.',
        add_help=True)
    # (short flag, long flag, default, type, dest, help)
    options = (
        ('-timeout', '--url-timeout', 10.0, float, 'timeout',
         'Timeout in seconds to wait for url'),
        ('-retries', '--url-retries', 5, int, 'retries',
         'Number of url retries'),
        ('-threads', '--num-threads', cpu_count() * 4, int, 'threads',
         'Number of threads to run with'),
    )
    for short_flag, long_flag, default, value_type, dest, help_text in options:
        parser.add_argument(
            short_flag, long_flag,
            default=default,
            type=value_type,
            dest=dest,
            help=help_text)
    return parser.parse_args(argv)
129+
130+
96131def main ():
132+ args = parse_args (argv [1 :])
97133 print ('Extract urls...' )
98- all_urls = extract_urls (os .getcwd () + os . path . sep + 'content' )
134+ all_urls = extract_urls (os .getcwd ())
99135 print ('\n Check urls...' )
100136 bad_url_status = {}
101137 url_id = 1
102138 max_strlen = - 1
103- for url_path , url_status in run_workers (get_url_status , all_urls .keys ()):
104- output = f'Currently checking: id={ url_id } host={ urllib3 .util .parse_url (url_path ).host } '
139+ for url_path , url_status in run_workers (
140+ get_url_status , all_urls .keys (),
141+ threads = args .threads , timeout = args .timeout , retries = args .retries ):
142+ output = (
143+ f'Currently checking: id={ url_id } '
144+ f'host={ urllib3 .util .parse_url (url_path ).host } '
145+ )
105146 if max_strlen < len (output ):
106147 max_strlen = len (output )
107148 print (output .ljust (max_strlen ), end = '\r ' )
@@ -112,8 +153,8 @@ def main():
112153 bad_url : all_urls [bad_url ]
113154 for bad_url in bad_url_status
114155 }
115- status_content = json . dumps (bad_url_status , indent = 4 )
116- location_content = json . dumps (bad_url_location , indent = 4 )
156+ status_content = dumps (bad_url_status , indent = 4 )
157+ location_content = dumps (bad_url_location , indent = 4 )
117158 print (f'\n Bad url status: { status_content } ' )
118159 print (f'\n Bad url locations: { location_content } ' )
119160
0 commit comments