Skip to content

Commit 883a02a

Browse files
jcharummitechie
authored and committed
Add a regression for a multi-page nytimes article
It does not quite work yet, as we wrongly pull in page 2 at the end of the article due to yet-to-be-implemented duplicate avoidance. Conflicts: src/readability_lxml/readability.py src/tests/gen_test.py src/tests/regression.py
1 parent cfc6f94 commit 883a02a

File tree

10 files changed

+4857
-29
lines changed

10 files changed

+4857
-29
lines changed

src/readability_lxml/readability.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ def eval_href(parsed_urls, url, base_url, link):
658658

659659
# If we've already seen this page, ignore it.
660660
if href == base_url or href == url or href in parsed_urls:
661+
log.debug('rejecting %s: already seen page' % href)
661662
return raw_href, href, False
662663

663664
# If it's on a different domain, skip it.
@@ -736,15 +737,20 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
736737
candidate.score += 25
737738

738739
if REGEXES['firstLast'].search(link_data):
740+
# If we already matched on "next", last is probably fine. If we didn't,
741+
# then it's bad. Penalize.
739742
if not REGEXES['nextLink'].search(candidate.link_text):
743+
log.debug('link_data matched last but not next')
740744
candidate.score -= 65
741745

742746
neg_re = REGEXES['negativeRe']
743747
ext_re = REGEXES['extraneous']
744748
if neg_re.search(link_data) or ext_re.search(link_data):
749+
log.debug('link_data negative/extraneous regex match')
745750
candidate.score -= 50
746751

747752
if REGEXES['prevLink'].search(link_data):
753+
log.debug('link_data prevLink match')
748754
candidate.score -= 200
749755

750756
parent = link.getparent()
@@ -756,11 +762,13 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
756762
parent_class_and_id = ' '.join([parent_class, parent_id])
757763
if not positive_node_match:
758764
if REGEXES['page'].search(parent_class_and_id):
765+
log.debug('positive ancestor match')
759766
positive_node_match = True
760767
candidate.score += 25
761768
if not negative_node_match:
762769
if REGEXES['negativeRe'].search(parent_class_and_id):
763770
if not REGEXES['positiveRe'].search(parent_class_and_id):
771+
log.debug('negative ancestor match')
764772
negative_node_match = True
765773
candidate.score -= 25
766774
parent = parent.getparent()
@@ -770,11 +778,13 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
770778
candidate.score += 25
771779

772780
if REGEXES['extraneous'].search(href):
781+
log.debug('extraneous regex match')
773782
candidate.score -= 15
774783

775784
try:
776785
link_text_as_int = int(link_text)
777786

787+
log.debug('link_text looks like %d' % link_text_as_int)
778788
# Punish 1 since we're either already there, or it's probably before
779789
# what we want anyways.
780790
if link_text_as_int == 1:
@@ -784,6 +794,8 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
784794
except ValueError as exc:
785795
pass
786796

797+
log.debug('final score is %d' % candidate.score)
798+
787799
def find_next_page_url(parsed_urls, url, elem):
788800
links = tags(elem, 'a')
789801
base_url = find_base_url(url)
@@ -830,15 +842,12 @@ def append_next_page(parsed_urls, page_url, doc, options):
830842
if doc.tag == 'html':
831843
children = doc.getchildren()
832844
if children[0].tag == 'head':
833-
import ipdb; ipdb.set_trace()
834845
for elem in page_doc:
835846
doc.getchildren()[1].append(elem)
836847
else:
837-
import ipdb; ipdb.set_trace()
838848
for elem in page_doc:
839849
doc.getchildren()[0].append(elem)
840850
else:
841-
import ipdb; ipdb.set_trace()
842851
for elem in page_doc:
843852
doc.append(elem)
844853
if next_page_url is not None:

src/tests/gen_test.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,24 @@
44
and construct a new test case.
55
66
"""
7+
from regression_test import (
8+
TEST_DATA_PATH,
9+
ORIGINAL_SUFFIX,
10+
READABLE_SUFFIX,
11+
YAML_EXTENSION,
12+
adjust_url_map,
13+
read_yaml
14+
)
715
import argparse
816
import errno
917
import os
1018
import os.path
1119
import sys
12-
import test
1320
import urllib2
1421
import yaml
1522

1623
from readability_lxml import readability
24+
from readability_lxml import urlfetch
1725

1826

1927
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
@@ -27,7 +35,7 @@ def y_or_n(question):
2735

2836

2937
def write_file(test_name, suffix, data):
30-
path = os.path.join(test.TEST_DATA_PATH, test_name + suffix)
38+
path = os.path.join(TEST_DATA_PATH, test_name + suffix)
3139
mode = 0644
3240
try:
3341
fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
@@ -44,21 +52,22 @@ def write_file(test_name, suffix, data):
4452
return True
4553

4654

47-
def write_original(test_name, url):
48-
orig = urllib2.urlopen(url).read()
49-
return write_file(test_name, test.ORIGINAL_SUFFIX, orig)
55+
def write_original(test_name, orig):
56+
return write_file(test_name, ORIGINAL_SUFFIX, orig)
57+
5058

51-
def write_readable(test_name, orig):
52-
rdbl_doc = readability.Document(orig)
59+
def write_readable(test_name, orig, options):
60+
rdbl_doc = readability.Document(orig, **options)
5361
summary = rdbl_doc.summary()
54-
return write_file(test_name, test.READABLE_SUFFIX, summary.html)
62+
return write_file(test_name, READABLE_SUFFIX, summary.html)
63+
5564

5665
def read_spec(test_name):
5766
yaml_path = os.path.join(
58-
test.TEST_DATA_PATH,
59-
test_name + test.YAML_EXTENSION
67+
TEST_DATA_PATH,
68+
test_name + YAML_EXTENSION
6069
)
61-
return test.read_yaml(yaml_path)
70+
return read_yaml(yaml_path)
6271

6372
def read_orig(test_name, url = None):
6473
"""
@@ -69,39 +78,44 @@ def read_orig(test_name, url = None):
6978
"""
7079
if url:
7180
orig = urllib2.urlopen(url).read()
72-
write_result = write_file(test_name, test.ORIGINAL_SUFFIX, orig)
81+
write_result = write_file(test_name, ORIGINAL_SUFFIX, orig)
7382
return orig, write_result
7483
else:
7584
orig_path = os.path.join(
76-
test.TEST_DATA_PATH,
77-
test_name + test.ORIGINAL_SUFFIX
85+
TEST_DATA_PATH,
86+
test_name + ORIGINAL_SUFFIX
7887
)
7988
orig = open(orig_path).read()
8089
return orig, True
8190

8291
def create(args):
92+
# TODO: Make this work for multi-page articles.
8393
spec_dict = {'url': args.url, 'test_description': args.test_description}
8494
spec = yaml.dump(spec_dict, default_flow_style = False)
85-
if not write_file(args.test_name, test.YAML_EXTENSION, spec):
95+
if not write_file(args.test_name, YAML_EXTENSION, spec):
8696
return False
87-
if not write_original(args.test_name, args.url):
97+
orig = urllib2.urlopen(url).read()
98+
if not write_original(args.test_name, orig):
8899
return False
89-
if not write_readable(args.test_name, args.url):
100+
if not write_readable(args.test_name, orig):
90101
return False
91102
return True
92103

93104
def genbench(args):
105+
spec_dict = read_spec(args.test_name)
94106
if args.refetch:
95-
spec_dict = read_spec(args.test_name)
96107
url = spec_dict['url']
97108
else:
98109
url = None
110+
url_map = adjust_url_map(spec_dict.get('url_map', dict()))
111+
fetcher = urlfetch.MockUrlFetch(url_map)
112+
options = {'url': spec_dict['url'], 'urlfetch': fetcher}
99113
orig, success = read_orig(args.test_name, url)
100114
if not success:
101115
return False
102-
rdbl_doc = readability.Document(orig)
116+
rdbl_doc = readability.Document(orig, **options)
103117
summary = rdbl_doc.summary()
104-
if not write_file(args.test_name, test.READABLE_SUFFIX, summary.html):
118+
if not write_file(args.test_name, READABLE_SUFFIX, summary.html):
105119
return False
106120
return True
107121

src/tests/regression.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,17 @@ def make_path(dir_path, name, suffix):
149149
return os.path.join(dir_path, ''.join([name, suffix]))
150150

151151

152+
def adjust_url_map(url_map):
153+
adjusted = dict()
154+
for k, v in url_map.items():
155+
adjusted[k] = os.path.join(TEST_DATA_PATH, v)
156+
return adjusted
157+
158+
152159
def make_readability_test(dir_path, name, spec_dict):
153160
enabled = spec_dict.get('enabled', True)
154161
notes = spec_dict.get('notes', '')
155-
url_map = spec_dict.get('url_map', dict())
162+
url_map = adjust_url_map(spec_dict.get('url_map', dict()))
156163
return ReadabilityTest(
157164
dir_path,
158165
enabled,

0 commit comments

Comments
 (0)