#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import urllib2
import urlparse
import robotparser
import md5

import httplib2

import html5lib
from html5lib.treebuilders import etree

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        """Crawl starting from initialURL until maxURLs documents have been
        parsed or there are no unvisited URLs left"""
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        """Parse the document, recording the current URL as buggy if the
        parser raises an exception"""
        failed = False
        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print "BUGGY:", self.currentURL
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        """Fetch url with a GET request; return None if the content is a
        duplicate of a page already seen or the response status is not 200"""
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = md5.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        #Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urlparse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                    not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        #Remove all non-http URLs and add a suitable base URL where that is
        #missing
        newUrls = set()
        for url in urls:
            splitURL = list(urlparse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urlparse.urlsplit(self.currentURL)[1]
            newUrls.add(urlparse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        #Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except AttributeError:
                #Don't know why this happens
                pass

        #Remove links not of content-type html or pages not found
        #XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url]['content-type'] and
                       responseHeaders[url]['status'] == "200"])

        #Now check we are allowed to spider the page
        #Iterate over a copy so that disallowed URLs can be removed from
        #toVisit without mutating the set being iterated
        for url in list(toVisit):
            robotURL = list(urlparse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urlparse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            #Fetch and parse robots.txt so that can_fetch has rules to check
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
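
#Example entry point (not part of the original module): a minimal sketch of
#how the spider might be run from the command line. The default start URL and
#the maxURLs limit below are illustrative assumptions, not values from the
#source.
if __name__ == "__main__":
    import sys
    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    print "Buggy URLs found:"
    for url in sorted(s.buggyURLs):
        print "  ", url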