Skip to content

Commit cc0af7a

Browse files
committed
Add beginnings of regression tests
1 parent 82eabfc commit cc0af7a

4 files changed

Lines changed: 247 additions & 1 deletion

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
*.pyc
2+
env

readability/readability.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,10 @@ def summary(self):
136136
confidence = 0;
137137
article = self.html.find('body') or self.html
138138

139-
cleaned_article = self.sanitize(article, candidates)
139+
unicode_cleaned_article = self.sanitize(article, candidates)
140+
cleaned_doc = fragment_fromstring(unicode_cleaned_article)
141+
cleaned_article = tostring(cleaned_doc)
142+
140143
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
141144
if ruthless and not of_acceptable_length:
142145
ruthless = False

test.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
from lxml.html import builder as B
2+
import lxml.html
3+
import lxml.html.diff
4+
import os
5+
import os.path
6+
import readability
7+
import sys
8+
import unittest
9+
import yaml
10+
11+
# File-name suffixes that tie together the files of one regression case.
ORIGINAL_SUFFIX = '-orig.html'  # page handed to readability.Document
READABLE_SUFFIX = '-rdbl.html'  # benchmark the summary is diffed against
RESULT_SUFFIX = '-result.html'  # diff report written for the case

# Directory listed for test cases, and directory the reports are written to.
TEST_DATA_PATH = 'test_data'
TEST_OUTPUT_PATH = 'test_output'
17+
18+
class ReadabilityTest:
    """Names and file locations that identify one regression case.

    Attributes are stored exactly as passed in: the directory, the case
    name, and the paths of the original and readable HTML files.
    """

    def __init__(self, dir_path, name, orig_path, rdbl_path):
        self.dir_path, self.name = dir_path, name
        self.orig_path, self.rdbl_path = orig_path, rdbl_path
25+
26+
class ReadabilityTestData:
    """A regression case together with the contents of its two HTML files.

    Holds the test descriptor, the original HTML and the readable
    (benchmark) HTML exactly as passed in.
    """

    def __init__(self, test, orig_html, rdbl_html):
        self.test = test
        self.orig_html, self.rdbl_html = orig_html, rdbl_html
32+
33+
class ReadabilityTestResult:
    """Outcome of running one regression case.

    Holds the loaded test data, the HTML produced for it and the HTML
    diff against the benchmark, exactly as passed in.
    """

    def __init__(self, test_data, result_html, diff_html):
        self.test_data = test_data
        self.result_html, self.diff_html = result_html, diff_html
39+
40+
def strip_with_suffix(suffix, files):
    """Return the names in ``files`` that end with ``suffix``, minus that suffix.

    Only the trailing occurrence of ``suffix`` is removed, so a name that
    also contains the suffix text earlier on keeps that earlier part
    (``'a.test.b.test'`` with suffix ``'.test'`` yields ``'a.test.b'``).

    Args:
        suffix: The ending a file name must have to be included.
        files: Iterable of file-name strings.

    Returns:
        A set of the matching names with the trailing suffix cut off.
    """
    cut = len(suffix)
    # Slice from the front by length: ``name[:-cut]`` would return '' when
    # the suffix is empty.
    return {
        name[:len(name) - cut]
        for name in files
        if name.endswith(suffix)
    }
44+
45+
def check_missing(lhs, rhs, rhs_description):
    """Raise if any name in ``lhs`` has no counterpart in ``rhs``.

    Args:
        lhs: Set of names that must all be present in ``rhs``.
        rhs: Set of names to check against.
        rhs_description: Text naming the missing kind of file; it is
            interpolated into the error message.

    Raises:
        Exception: When ``lhs`` contains names absent from ``rhs``. The
            message lists those names (joined in set-iteration order) and
            uses singular or plural wording to match their count.
    """
    unmatched = lhs.difference(rhs)
    if not unmatched:
        return

    if len(unmatched) == 1:
        verb, noun = 'is', 'file'
    else:
        verb, noun = 'are', 'files'

    names = ', '.join(unmatched)
    raise Exception(
        '(%s) %s missing %s %s' % (names, verb, rhs_description, noun)
    )
54+
55+
def resolve_test_names(files):
    """Return the test names for which both an original and a readable file exist.

    Names are derived by stripping ORIGINAL_SUFFIX and READABLE_SUFFIX
    from ``files``. check_missing is applied in both directions, so an
    unpaired file of either kind raises instead of being ignored.

    Args:
        files: Iterable of file-name strings.

    Returns:
        The set of names stripped from the original-suffixed files.
    """
    originals, readables = (
        strip_with_suffix(suffix, files)
        for suffix in (ORIGINAL_SUFFIX, READABLE_SUFFIX)
    )
    check_missing(originals, readables, READABLE_SUFFIX)
    check_missing(readables, originals, ORIGINAL_SUFFIX)
    return originals
61+
62+
def make_path(dir_path, name, suffix):
    """Build the path of the file ``name`` + ``suffix`` inside ``dir_path``."""
    file_name = ''.join((name, suffix))
    return os.path.join(dir_path, file_name)
64+
65+
def make_readability_test(dir_path, name):
    """Create the ReadabilityTest for ``name`` located in ``dir_path``.

    The original and readable file paths are derived from the name with
    ORIGINAL_SUFFIX and READABLE_SUFFIX respectively.
    """
    orig_path = make_path(dir_path, name, ORIGINAL_SUFFIX)
    rdbl_path = make_path(dir_path, name, READABLE_SUFFIX)
    return ReadabilityTest(dir_path, name, orig_path, rdbl_path)
72+
73+
def load_test_data(test):
    """Read both HTML files of ``test`` and bundle them with it.

    Args:
        test: Object exposing ``orig_path`` and ``rdbl_path`` file paths.

    Returns:
        A ReadabilityTestData holding ``test`` and the contents of the
        two files.
    """
    # Context managers close each handle even when a read fails; the
    # handles were previously left for the garbage collector.
    with open(test.orig_path, 'r') as orig_file:
        orig = orig_file.read()
    with open(test.rdbl_path, 'r') as rdbl_file:
        rdbl = rdbl_file.read()
    return ReadabilityTestData(test, orig, rdbl)
77+
78+
def load_readability_tests(dir_path, files):
    """Return one ReadabilityTest per test name resolved from ``files``.

    Args:
        dir_path: Directory the test files live in.
        files: Iterable of file names found in that directory.
    """
    tests = []
    for name in resolve_test_names(files):
        tests.append(make_readability_test(dir_path, name))
    return tests
81+
82+
def execute_test(test_data):
    """Summarise the original HTML and diff the result against the benchmark.

    Args:
        test_data: Object exposing ``orig_html`` (input page) and
            ``rdbl_html`` (benchmark readable HTML).

    Returns:
        A ReadabilityTestResult holding ``test_data``, the summary HTML
        and the output of ``lxml.html.diff.htmldiff`` for benchmark vs.
        summary.
    """
    summary = readability.Document(test_data.orig_html).summary()
    result_html = summary.html
    # The benchmark is the "old" side of the diff, the fresh summary the "new".
    diff = lxml.html.diff.htmldiff(test_data.rdbl_html, result_html)
    return ReadabilityTestResult(test_data, result_html, diff)
91+
92+
# Stylesheet text that add_diff_css embeds in a <style> element of the
# generated report; it styles the #article container and ins/del markup.
# NOTE(review): the bare "NN+" lines inside this literal look like
# diff-viewer line markers captured with the paste rather than CSS --
# confirm against the original file before relying on this text.
DIFF_CSS = '''
93+
#article {
94+
margin: 0 auto;
95+
max-width: 705px;
96+
min-width: 225px;
97+
font-family: Georgia, 'Times New Roman', serif;
98+
font-size: 19px;
99+
line-height: 29px;
100+
}
101+
102+
#article p {
103+
font-size: 19px;
104+
line-height: 29px;
105+
margin: 19px 0px 19px 0px;
106+
}
107+
108+
ins {
109+
background-color: #C6F7C3;
110+
text-decoration: none;
111+
}
112+
113+
ins img {
114+
border-width: 3px;
115+
border-style: dotted;
116+
border-color: #51B548;
117+
}
118+
119+
del {
120+
background-color: #F7C3C3;
121+
text-decoration: none;
122+
}
123+
124+
del img {
125+
border-width: 3px;
126+
border-style: dotted;
127+
border-color: #D12626;
128+
}
129+
'''
130+
131+
def add_diff_css(doc):
    """Insert a <head> carrying the DIFF_CSS stylesheet as first child of ``doc``.

    ``doc`` is modified in place; nothing is returned.
    """
    stylesheet = B.STYLE(DIFF_CSS, type='text/css')
    doc.insert(0, B.HEAD(stylesheet))
135+
136+
def write_result(output_dir_path, result):
    """Write the styled diff report of ``result`` into ``output_dir_path``.

    The diff HTML is parsed into a document, given the diff stylesheet,
    and serialised to ``<test name>`` + RESULT_SUFFIX.

    Args:
        output_dir_path: Directory the report file is created in.
        result: Object exposing ``diff_html`` and ``test_data.test.name``.
    """
    doc = lxml.html.document_fromstring(result.diff_html)
    add_diff_css(doc)
    output_file = ''.join([result.test_data.test.name, RESULT_SUFFIX])
    output_path = os.path.join(output_dir_path, output_file)
    # lxml.html.tostring() returns a byte string by default, so the file is
    # opened in binary mode; a text-mode handle rejects bytes on Python 3.
    with open(output_path, 'wb') as f:
        f.write(lxml.html.tostring(doc))
143+
144+
def run_readability_tests():
    """Run every regression case in TEST_DATA_PATH and write its diff report.

    Each case is loaded, executed and its result written to
    TEST_OUTPUT_PATH before the next one starts.
    """
    tests = load_readability_tests(TEST_DATA_PATH, os.listdir(TEST_DATA_PATH))
    for test in tests:
        result = execute_test(load_test_data(test))
        write_result(TEST_OUTPUT_PATH, result)
151+
152+
class TestStripWithSuffix(unittest.TestCase):
    """Unit tests for strip_with_suffix."""

    def test_no_files(self):
        # An empty listing yields an empty set.
        self.assertEqual(set(), strip_with_suffix('.test', []))

    def test_files(self):
        # Every matching name is returned with the suffix removed.
        names = ['foo.test', 'bar.test']
        self.assertEqual({'foo', 'bar'}, strip_with_suffix('.test', names))

    def test_extra_files(self):
        # Names without the suffix are left out of the result.
        names = ['foo.test', 'bar.test', 'extra']
        self.assertEqual({'foo', 'bar'}, strip_with_suffix('.test', names))
168+
169+
class TestResolveTestNames(unittest.TestCase):
    """Unit tests for resolve_test_names.

    The error message joins a set of names, so their order is not fixed;
    patterns for multi-name failures therefore accept either order.
    Patterns escape the literal dot and are end-anchored so the singular
    "file" wording cannot be satisfied by the plural "files".
    """

    def _assert_resolve_fails(self, files, pattern):
        # Shared check: resolving ``files`` must raise with a message
        # matching ``pattern``.
        with self.assertRaisesRegexp(Exception, pattern):
            resolve_test_names(files)

    def test_no_files(self):
        expected = set()
        actual = resolve_test_names([])
        self.assertEqual(expected, actual)

    def test_files(self):
        expected = {'foo', 'bar'}
        files = [
            'foo-orig.html',
            'foo-rdbl.html',
            'bar-orig.html',
            'bar-rdbl.html',
        ]
        actual = resolve_test_names(files)
        self.assertEqual(expected, actual)

    def test_missing_rdbl(self):
        files = [
            'foo-orig.html',
            'foo-rdbl.html',
            'bar-orig.html',
        ]
        self._assert_resolve_fails(
            files,
            r'\(bar\) is missing -rdbl\.html file$',
        )

    def test_missing_multiple_rdbl(self):
        files = [
            'foo-orig.html',
            'bar-orig.html',
        ]
        self._assert_resolve_fails(
            files,
            r'\((foo, bar|bar, foo)\) are missing -rdbl\.html files$',
        )

    def test_missing_orig(self):
        files = [
            'foo-orig.html',
            'foo-rdbl.html',
            'bar-rdbl.html',
        ]
        self._assert_resolve_fails(
            files,
            r'\(bar\) is missing -orig\.html file$',
        )

    def test_missing_multiple_orig(self):
        files = [
            'foo-rdbl.html',
            'bar-rdbl.html',
        ]
        self._assert_resolve_fails(
            files,
            r'\((foo, bar|bar, foo)\) are missing -orig\.html files$',
        )
232+
233+
def main():
    """Run the unit tests or the regression suite, depending on argv.

    With ``unittest`` as the first command-line argument the unit tests
    are run through unittest.main(); otherwise the readability
    regression tests are run.
    """
    wants_unit_tests = len(sys.argv) > 1 and sys.argv[1] == 'unittest'
    if not wants_unit_tests:
        run_readability_tests()
        return None

    # Remove the selector word before handing control to unittest.main().
    del sys.argv[1]
    return unittest.main()


if __name__ == '__main__':
    main()

test_output/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore

0 commit comments

Comments
 (0)