Skip to content

Commit cdd30f6

Browse files
committed
Return confidence level when retrieving summary
1 parent 7aac0f0 commit cdd30f6

1 file changed

Lines changed: 14 additions & 1 deletion

File tree

readability/readability.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,17 @@ def text_length(i):
5959
class Unparseable(ValueError):
    # Marker exception type with no behaviour of its own beyond ValueError.
    # NOTE(review): presumably raised when a document cannot be parsed; the
    # raise sites are not visible in this view -- confirm against the file.
    pass
6161

62+
class Summary(object):
    '''
    The type of object returned by Document.summary(). This includes the
    confidence level we have in our summary. If this is low (<35), our summary
    may not be valid, though we did our best.

    Attributes:
        confidence: the content score of the best candidate that was
            selected, or 0 when no candidate was found and the raw
            document body was used instead.
        html: the cleaned-up article produced for the document.
    '''

    def __init__(self, confidence, html):
        self.confidence = confidence
        self.html = html

    def __repr__(self):
        # %-formatting keeps this usable on the Python 2 interpreter the
        # rest of the file targets.
        return 'Summary(confidence=%r, html=%r)' % (self.confidence, self.html)
72+
6273
class Document:
6374
TEXT_LENGTH_THRESHOLD = 25
6475
RETRY_LENGTH = 250
@@ -111,6 +122,7 @@ def summary(self):
111122

112123
best_candidate = self.select_best_candidate(candidates)
113124
if best_candidate:
125+
confidence = best_candidate['content_score']
114126
article = self.get_article(candidates, best_candidate)
115127
else:
116128
if ruthless:
@@ -121,6 +133,7 @@ def summary(self):
121133
continue
122134
else:
123135
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
136+
confidence = 0;
124137
article = self.html.find('body') or self.html
125138

126139
cleaned_article = self.sanitize(article, candidates)
@@ -129,7 +142,7 @@ def summary(self):
129142
ruthless = False
130143
continue # try again
131144
else:
132-
return cleaned_article
145+
return Summary(confidence, cleaned_article)
133146
except StandardError, e:
134147
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
135148
logging.exception('error getting summary: ' )

0 commit comments

Comments
 (0)