Skip to content

Commit 883a02a

Browse files
jcharummitechie
authored and committed
Add a regression for a multi-page nytimes article
It does not quite work yet, as we wrongly pull in page 2 at the end of the article due to yet-to-be-implemented duplicate avoidance. Conflicts: src/readability_lxml/readability.py src/tests/gen_test.py src/tests/regression.py
1 parent cfc6f94 commit 883a02a

File tree

10 files changed

+4857
-29
lines changed

10 files changed

+4857
-29
lines changed

src/readability_lxml/readability.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ def eval_href(parsed_urls, url, base_url, link):
658658

659659
# If we've already seen this page, ignore it.
660660
if href == base_url or href == url or href in parsed_urls:
661+
log.debug('rejecting %s: already seen page' % href)
661662
return raw_href, href, False
662663

663664
# If it's on a different domain, skip it.
@@ -736,15 +737,20 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
736737
candidate.score += 25
737738

738739
if REGEXES['firstLast'].search(link_data):
740+
# If we already matched on "next", last is probably fine. If we didn't,
741+
# then it's bad. Penalize.
739742
if not REGEXES['nextLink'].search(candidate.link_text):
743+
log.debug('link_data matched last but not next')
740744
candidate.score -= 65
741745

742746
neg_re = REGEXES['negativeRe']
743747
ext_re = REGEXES['extraneous']
744748
if neg_re.search(link_data) or ext_re.search(link_data):
749+
log.debug('link_data negative/extraneous regex match')
745750
candidate.score -= 50
746751

747752
if REGEXES['prevLink'].search(link_data):
753+
log.debug('link_data prevLink match')
748754
candidate.score -= 200
749755

750756
parent = link.getparent()
@@ -756,11 +762,13 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
756762
parent_class_and_id = ' '.join([parent_class, parent_id])
757763
if not positive_node_match:
758764
if REGEXES['page'].search(parent_class_and_id):
765+
log.debug('positive ancestor match')
759766
positive_node_match = True
760767
candidate.score += 25
761768
if not negative_node_match:
762769
if REGEXES['negativeRe'].search(parent_class_and_id):
763770
if not REGEXES['positiveRe'].search(parent_class_and_id):
771+
log.debug('negative ancestor match')
764772
negative_node_match = True
765773
candidate.score -= 25
766774
parent = parent.getparent()
@@ -770,11 +778,13 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
770778
candidate.score += 25
771779

772780
if REGEXES['extraneous'].search(href):
781+
log.debug('extraneous regex match')
773782
candidate.score -= 15
774783

775784
try:
776785
link_text_as_int = int(link_text)
777786

787+
log.debug('link_text looks like %d' % link_text_as_int)
778788
# Punish 1 since we're either already there, or it's probably before
779789
# what we want anyways.
780790
if link_text_as_int == 1:
@@ -784,6 +794,8 @@ def eval_possible_next_page_link(parsed_urls, url, base_url, candidates, link):
784794
except ValueError as exc:
785795
pass
786796

797+
log.debug('final score is %d' % candidate.score)
798+
787799
def find_next_page_url(parsed_urls, url, elem):
788800
links = tags(elem, 'a')
789801
base_url = find_base_url(url)
@@ -830,15 +842,12 @@ def append_next_page(parsed_urls, page_url, doc, options):
830842
if doc.tag == 'html':
831843
children = doc.getchildren()
832844
if children[0].tag == 'head':
833-
import ipdb; ipdb.set_trace()
834845
for elem in page_doc:
835846
doc.getchildren()[1].append(elem)
836847
else:
837-
import ipdb; ipdb.set_trace()
838848
for elem in page_doc:
839849
doc.getchildren()[0].append(elem)
840850
else:
841-
import ipdb; ipdb.set_trace()
842851
for elem in page_doc:
843852
doc.append(elem)
844853
if next_page_url is not None:

src/tests/gen_test.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,24 @@
44
and construct a new test case.
55
66
"""
7+
from regression_test import (
8+
TEST_DATA_PATH,
9+
ORIGINAL_SUFFIX,
10+
READABLE_SUFFIX,
11+
YAML_EXTENSION,
12+
adjust_url_map,
13+
read_yaml
14+
)
715
import argparse
816
import errno
917
import os
1018
import os.path
1119
import sys
12-
import test
1320
import urllib2
1421
import yaml
1522

1623
from readability_lxml import readability
24+
from readability_lxml import urlfetch
1725

1826

1927
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
@@ -27,7 +35,7 @@ def y_or_n(question):
2735

2836

2937
def write_file(test_name, suffix, data):
30-
path = os.path.join(test.TEST_DATA_PATH, test_name + suffix)
38+
path = os.path.join(TEST_DATA_PATH, test_name + suffix)
3139
mode = 0644
3240
try:
3341
fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
@@ -44,21 +52,22 @@ def write_file(test_name, suffix, data):
4452
return True
4553

4654

47-
def write_original(test_name, url):
48-
orig = urllib2.urlopen(url).read()
49-
return write_file(test_name, test.ORIGINAL_SUFFIX, orig)
55+
def write_original(test_name, orig):
56+
return write_file(test_name, ORIGINAL_SUFFIX, orig)
57+
5058

51-
def write_readable(test_name, orig):
52-
rdbl_doc = readability.Document(orig)
59+
def write_readable(test_name, orig, options):
60+
rdbl_doc = readability.Document(orig, **options)
5361
summary = rdbl_doc.summary()
54-
return write_file(test_name, test.READABLE_SUFFIX, summary.html)
62+
return write_file(test_name, READABLE_SUFFIX, summary.html)
63+
5564

5665
def read_spec(test_name):
5766
yaml_path = os.path.join(
58-
test.TEST_DATA_PATH,
59-
test_name + test.YAML_EXTENSION
67+
TEST_DATA_PATH,
68+
test_name + YAML_EXTENSION
6069
)
61-
return test.read_yaml(yaml_path)
70+
return read_yaml(yaml_path)
6271

6372
def read_orig(test_name, url = None):
6473
"""
@@ -69,39 +78,44 @@ def read_orig(test_name, url = None):
6978
"""
7079
if url:
7180
orig = urllib2.urlopen(url).read()
72-
write_result = write_file(test_name, test.ORIGINAL_SUFFIX, orig)
81+
write_result = write_file(test_name, ORIGINAL_SUFFIX, orig)
7382
return orig, write_result
7483
else:
7584
orig_path = os.path.join(
76-
test.TEST_DATA_PATH,
77-
test_name + test.ORIGINAL_SUFFIX
85+
TEST_DATA_PATH,
86+
test_name + ORIGINAL_SUFFIX
7887
)
7988
orig = open(orig_path).read()
8089
return orig, True
8190

8291
def create(args):
92+
# TODO: Make this work for multi-page articles.
8393
spec_dict = {'url': args.url, 'test_description': args.test_description}
8494
spec = yaml.dump(spec_dict, default_flow_style = False)
85-
if not write_file(args.test_name, test.YAML_EXTENSION, spec):
95+
if not write_file(args.test_name, YAML_EXTENSION, spec):
8696
return False
87-
if not write_original(args.test_name, args.url):
97+
orig = urllib2.urlopen(url).read()
98+
if not write_original(args.test_name, orig):
8899
return False
89-
if not write_readable(args.test_name, args.url):
100+
if not write_readable(args.test_name, orig):
90101
return False
91102
return True
92103

93104
def genbench(args):
105+
spec_dict = read_spec(args.test_name)
94106
if args.refetch:
95-
spec_dict = read_spec(args.test_name)
96107
url = spec_dict['url']
97108
else:
98109
url = None
110+
url_map = adjust_url_map(spec_dict.get('url_map', dict()))
111+
fetcher = urlfetch.MockUrlFetch(url_map)
112+
options = {'url': spec_dict['url'], 'urlfetch': fetcher}
99113
orig, success = read_orig(args.test_name, url)
100114
if not success:
101115
return False
102-
rdbl_doc = readability.Document(orig)
116+
rdbl_doc = readability.Document(orig, **options)
103117
summary = rdbl_doc.summary()
104-
if not write_file(args.test_name, test.READABLE_SUFFIX, summary.html):
118+
if not write_file(args.test_name, READABLE_SUFFIX, summary.html):
105119
return False
106120
return True
107121

src/tests/regression.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,17 @@ def make_path(dir_path, name, suffix):
149149
return os.path.join(dir_path, ''.join([name, suffix]))
150150

151151

152+
def adjust_url_map(url_map):
153+
adjusted = dict()
154+
for k, v in url_map.items():
155+
adjusted[k] = os.path.join(TEST_DATA_PATH, v)
156+
return adjusted
157+
158+
152159
def make_readability_test(dir_path, name, spec_dict):
153160
enabled = spec_dict.get('enabled', True)
154161
notes = spec_dict.get('notes', '')
155-
url_map = spec_dict.get('url_map', dict())
162+
url_map = adjust_url_map(spec_dict.get('url_map', dict()))
156163
return ReadabilityTest(
157164
dir_path,
158165
enabled,

0 commit comments

Comments
 (0)