Skip to content

Commit 5a98e2c

Browse files
committed
Correct appending and allow for document only
- Fix the appending of siblings so that they are added to the correct nested element.
- Add a `document_only` flag so that you can get a DOM tree you can nest yourself, without the html/body tags.
1 parent edccec5 commit 5a98e2c

File tree

3 files changed

+31
-24
lines changed

3 files changed

+31
-24
lines changed

README

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@ Usage::
3333
Command-line usage::
3434

3535
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
36+
37+
38+
Document() kwarg options:
39+
url=xxx will run make_links_absolute()
40+

readability/readability.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ def summary(self, document_only=False):
9898
ruthless = True
9999
while True:
100100
self._html(True)
101-
102101
for i in self.tags(self.html, 'script', 'style'):
103102
i.drop_tree()
104103
for i in self.tags(self.html, 'body'):
@@ -111,7 +110,8 @@ def summary(self, document_only=False):
111110
best_candidate = self.select_best_candidate(candidates)
112111

113112
if best_candidate:
114-
article = self.get_article(candidates, best_candidate)
113+
article = self.get_article(candidates, best_candidate,
114+
document_only=document_only)
115115
else:
116116
if ruthless:
117117
logging.debug("ruthless removal did not work. ")
@@ -136,12 +136,15 @@ def summary(self, document_only=False):
136136
logging.exception('error getting summary: ' )
137137
raise Unparseable(str(e)), None, sys.exc_info()[2]
138138

139-
def get_article(self, candidates, best_candidate):
139+
def get_article(self, candidates, best_candidate, document_only=False):
140140
# Now that we have the top candidate, look through its siblings for content that might also be related.
141141
# Things like preambles, content split by ads that we removed, etc.
142-
143142
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
144-
output = document_fromstring('<div/>')
143+
# create a new html document with a html->body->div
144+
if document_only:
145+
output = fragment_fromstring('<div/>')
146+
else:
147+
output = document_fromstring('<div/>')
145148
best_elem = best_candidate['elem']
146149
for sibling in best_elem.getparent().getchildren():
147150
#if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -163,7 +166,12 @@ def get_article(self, candidates, best_candidate):
163166
append = True
164167

165168
if append:
166-
output.append(sibling)
169+
# We don't want to append directly to output, but the div
170+
# in html->body->div
171+
if document_only:
172+
output.append(sibling)
173+
else:
174+
output.getchildren()[0].getchildren()[0].append(sibling)
167175
#if output is not None:
168176
# output.append(best_elem)
169177
return output
@@ -454,13 +462,7 @@ def sanitize(self, node, candidates):
454462
if not (self.options['attributes']):
455463
#el.attrib = {} #FIXME:Checkout the effects of disabling this
456464
pass
457-
# There can be two nodes here. We really want to tounicode only one of
458-
# them.
459-
# To start with let's hack it to get the longest tree as our document.
460-
if len(node.getchildren()) > 1:
461-
children = node.getchildren()
462-
sorted_list = sorted(children, key=len, reverse=True)
463-
node = sorted_list[0]
465+
464466
return clean_attributes(tounicode(node))
465467

466468

tests/test_article_only.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase):
2121
2222
"""
2323

24-
def setUp(self):
25-
""""""
26-
pass
27-
28-
def tearDown(self):
29-
""""""
30-
pass
31-
3224
def test_si_sample(self):
25+
"""Using the si sample, load article with only opening body element"""
26+
sample = load_sample('si-game.sample.html')
27+
doc = Document(
28+
sample,
29+
url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
30+
res = doc.summary()
31+
self.assertEqual('<html><body><div><div class', res[0:27])
32+
33+
def test_si_sample_doc_only(self):
3334
"""Using the si sample, make sure we can get the article alone."""
3435
sample = load_sample('si-game.sample.html')
35-
doc = Document(sample)
36+
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
3637
res = doc.summary(document_only=True)
37-
38-
self.assertEqual('<div class="', res[0:12])
38+
self.assertEqual('<div><div class="', res[0:17])
3939

0 commit comments

Comments
 (0)