11#!/usr/bin/env python
2- from concurrent import futures
32import multiprocessing as mp
43import os
54import json
65import uuid
6+ from concurrent import futures
7+ from collections import defaultdict
78
89from bs4 import BeautifulSoup
910from markdown import markdown
# Per-run User-Agent token so the crawler identifies itself in requests.
# (f-string formatting already stringifies the UUID; no explicit str() needed.)
URL_BOT_ID = f'Bot {uuid.uuid4()}'
1920
2021
def extract_urls_from_html(content):
    """Parse *content* as HTML and return the set of absolute links.

    Only anchor hrefs that start with 'http' (i.e. http/https URLs) are
    collected; relative and fragment links are ignored.
    """
    soup = BeautifulSoup(content, 'html.parser')
    return {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('http')
    }
2730
2831
def extract_urls(discover_path):
    """Walk *discover_path* and collect every http(s) URL referenced by
    .html and .markdown files.

    Returns a dict mapping each URL to the list of file paths (relative
    to *discover_path*) that contain it.
    """
    exclude = ['.git', '.vscode']
    all_urls = defaultdict(list)
    max_strlen = -1
    for root, dirs, files in os.walk(discover_path, topdown=True):
        # Prune excluded directories in place so os.walk never descends
        # into them (topdown=True makes this effective).
        dirs[:] = [d for d in dirs if d not in exclude]
        # BUG FIX: str.lstrip(discover_path) strips any leading
        # *characters* found in discover_path, not the prefix string —
        # it silently mangles path components.  Use os.path.relpath.
        short_root = os.path.relpath(root, discover_path)
        if short_root == os.curdir:
            short_root = ''  # keep top-level entries as bare filenames
        for file in files:
            output = f'Currently checking: file={file}'
            file_path = os.path.join(root, file)
            if max_strlen < len(output):
                max_strlen = len(output)
            print(output.ljust(max_strlen), end='\r')
            # BUG FIX: the .html branch previously parsed the open file
            # handle, discarded the result, and the shared call below
            # then re-parsed the exhausted handle (finding nothing).
            # Read the text once, with the handle properly closed.
            if file_path.endswith('.html'):
                with open(file_path) as handle:
                    content = handle.read()
            elif file_path.endswith('.markdown'):
                with open(file_path) as handle:
                    content = markdown(handle.read())
            else:
                continue
            for url in extract_urls_from_html(content):
                all_urls[url].append(os.path.join(short_root, file))
    return all_urls
4856
4957
@@ -87,20 +95,27 @@ def bad_url(url_status):
8795
def main():
    """Discover all URLs under ./content, probe each one concurrently,
    and report the bad URLs with their HTTP status and the files that
    reference them.
    """
    print('Extract urls...')
    # os.path.join instead of manual separator concatenation.
    all_urls = extract_urls(os.path.join(os.getcwd(), 'content'))
    print('\nCheck urls...')
    bad_url_status = {}
    max_strlen = -1
    # enumerate replaces the hand-maintained url_id counter.
    for url_id, (url_path, url_status) in enumerate(
            run_workers(get_url_status, all_urls.keys()), start=1):
        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
        if max_strlen < len(output):
            max_strlen = len(output)
        print(output.ljust(max_strlen), end='\r')
        # Plain truth test instead of the `is True` anti-idiom.
        if bad_url(url_status):
            bad_url_status[url_path] = url_status
    # Map each bad URL back to the files that reference it.  The loop
    # variable is named `url` so it no longer shadows the bad_url()
    # function inside the comprehension.
    bad_url_location = {url: all_urls[url] for url in bad_url_status}
    status_content = json.dumps(bad_url_status, indent=4)
    location_content = json.dumps(bad_url_location, indent=4)
    print(f'\nBad url status: {status_content}')
    print(f'\nBad url locations: {location_content}')
104119
105120
if __name__ == '__main__':
    # Script entry point; the guard keeps imports of this module side-effect free.
    main()