Skip to content

Commit b78d7e8

Browse files
jcharummitechie
authored and committed
Merge Jerry: pull in the ability to get back confidence score as well as the processed html
1 parent a2b17e7 commit b78d7e8

3 files changed

Lines changed: 57 additions & 5 deletions

File tree

README.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ As a Library
5454
readable_article = Document(html).summary()
5555
readable_title = Document(html).short_title()
5656

57+
You can also use the `get_summary_with_metadata` method to get back other
58+
metadata such as the confidence score found while processing the input.
59+
60+
::
61+
62+
doc = Document(html).get_summary_with_metadata()
63+
print doc.html
64+
print doc.confidence
65+
66+
5767
Optional `Document` keyword argument:
5868

5969
- attributes:

src/readability_lxml/readability.py

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import sys
55

66
from collections import defaultdict
7+
from collections import namedtuple
78
from lxml.etree import tostring
89
from lxml.etree import tounicode
910
from lxml.html import document_fromstring
@@ -87,6 +88,11 @@ def text_length(i):
8788
return len(clean(i.text_content() or ""))
8889

8990

91+
# We want to change over the Summary to a namedtuple to be more memory
92+
# efficient and because it doesn't need to be mutable.
93+
Summary = namedtuple('Summary', ['html', 'confidence'])
94+
95+
9096
class Document:
9197
"""Class to build a etree document out of html."""
9298
TEXT_LENGTH_THRESHOLD = 25
@@ -139,11 +145,33 @@ def title(self):
139145
def short_title(self):
140146
return shorten_title(self.html)
141147

148+
def get_summary_with_metadata(self, enclose_with_html_tag=True):
149+
"""Parse the input content and return a Summary object
150+
151+
:param enclose_with_html_tag: Bool, True to return a full <html> document
152+
or False to return just the <div> html partial.
153+
154+
"""
155+
summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
156+
# For this call return the raw Summary object.
157+
return summary
158+
142159
def summary(self, enclose_with_html_tag=True):
143-
"""Generate the summary of the html docuemnt
160+
"""Generate the summary of the html document
161+
162+
:param enclose_with_html_tag: Bool, True to return a full <html> document
163+
or False to return just the <div> html partial.
164+
165+
"""
166+
summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
167+
# Only return the html to stay backwards compatible with the existing api.
168+
return summary.html
169+
170+
def _summary(self, enclose_with_html_tag=True):
171+
"""Helper used in a few places to generate the summary of the content
144172
145-
:param enclose_with_html_tag: return only the div of the document,
146-
don't wrap in html and body tags.
173+
:param enclose_with_html_tag: Bool, True to return a full <html> document
174+
or False to return just the <div> html partial.
147175
148176
"""
149177
try:
@@ -162,6 +190,7 @@ def summary(self, enclose_with_html_tag=True):
162190
best_candidate = self.select_best_candidate(candidates)
163191

164192
if best_candidate:
193+
confidence = best_candidate['content_score']
165194
article = self.get_article(candidates, best_candidate,
166195
enclose_with_html_tag=enclose_with_html_tag)
167196
else:
@@ -177,7 +206,8 @@ def summary(self, enclose_with_html_tag=True):
177206
log.debug(
178207
("Ruthless and lenient parsing did not work. "
179208
"Returning raw html"))
180-
article = self.html.find('body')
209+
article = self.html.find('body') or self.html
210+
confidence = 0
181211
if article is None:
182212
article = self.html
183213
cleaned_article = self.sanitize(article, candidates)
@@ -191,7 +221,7 @@ def summary(self, enclose_with_html_tag=True):
191221
# Loop through and try again.
192222
continue
193223
else:
194-
return cleaned_article
224+
return Summary(confidence=confidence, html=cleaned_article)
195225
except StandardError, e:
196226
log.exception('error getting summary: ')
197227
raise Unparseable(str(e)), None, sys.exc_info()[2]

src/tests/test_article_only.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,15 @@ def test_si_sample_html_partial(self):
3636
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
3737
res = doc.summary(enclose_with_html_tag=False)
3838
self.assertEqual('<div><div class="', res[0:17])
39+
40+
def test_si_sample_full_summary(self):
41+
"""We should parse the doc and get a full summary with confidence"""
42+
sample = load_sample('si-game.sample.html')
43+
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
44+
res = doc.get_summary_with_metadata(enclose_with_html_tag=False)
45+
self.assertTrue(hasattr(res, 'html'), 'res should have an html attrib')
46+
self.assertTrue(hasattr(res, 'confidence'), 'res should have an html attrib')
47+
self.assertEqual('<div><div class="', res.html[0:17])
48+
self.assertTrue(res.confidence > 50,
49+
'The confidence score should be larger than 50: ' + str(res.confidence))
50+

0 commit comments

Comments
 (0)