#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import urllib2
import urlparse
import robotparser
import md5

import httplib2

import html5lib
from html5lib.treebuilders import etree

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        """Crawl starting from initialURL until maxURLs documents have been
        parsed or there are no unvisited URLs left"""
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        """Parse the document, recording the current URL as buggy if the
        parser raises an exception"""
        failed = False
        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print "BUGGY:", self.currentURL
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        """Fetch url with a GET request; return None if the content is a
        duplicate of a page already seen or the response status is not 200"""
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = md5.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        #Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urlparse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                    not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        #Remove all non-http URLs and add a suitable base URL where that is
        #missing
        newUrls = set()
        for url in urls:
            splitURL = list(urlparse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urlparse.urlsplit(self.currentURL)[1]
            newUrls.add(urlparse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        #Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except AttributeError:
                #Don't know why this happens
                pass

        #Remove links not of content-type html or pages not found
        #XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url]['content-type'] and
                       responseHeaders[url]['status'] == "200"])

        #Now check we are allowed to spider the page
        #Iterate over a copy so that disallowed URLs can be removed from
        #toVisit without mutating the set being iterated
        for url in list(toVisit):
            robotURL = list(urlparse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urlparse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            #Fetch and parse robots.txt so that can_fetch has rules to check
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
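
#Example entry point (not part of the original module): a minimal sketch of
#how the spider might be run from the command line. The default start URL and
#the maxURLs limit below are illustrative assumptions, not values from the
#source.
if __name__ == "__main__":
    import sys
    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    print "Buggy URLs found:"
    for url in sorted(s.buggyURLs):
        print "  ", url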