@@ -59,6 +59,17 @@ def text_length(i):
class Unparseable(ValueError):
    '''
    Error type derived from ValueError, so callers that already catch
    ValueError keep working.

    NOTE(review): presumably raised when a document cannot be parsed --
    confirm against the raise sites.
    '''
6161
class Summary(object):
    '''
    The type of object returned by Document.summary(). It pairs the
    extracted content with the confidence level we have in it. If the
    confidence is low (<35), the summary may not be valid, though we did
    our best.

    Attributes:
        confidence: score for the extraction. Document.summary() passes the
            best candidate's 'content_score', or 0 when it falls back to
            returning the raw html.
        html: the sanitized article produced by Document.summary().
    '''

    def __init__(self, confidence, html):
        self.confidence = confidence
        self.html = html

    def __repr__(self):
        # Show both fields so a logged Summary is readable; %-formatting
        # keeps this working on the Python 2 interpreter the module targets.
        return '%s(confidence=%r, html=%r)' % (
            type(self).__name__, self.confidence, self.html)
72+
6273class Document :
6374 TEXT_LENGTH_THRESHOLD = 25
6475 RETRY_LENGTH = 250
@@ -111,6 +122,7 @@ def summary(self):
111122
112123 best_candidate = self .select_best_candidate (candidates )
113124 if best_candidate :
125+ confidence = best_candidate ['content_score' ]
114126 article = self .get_article (candidates , best_candidate )
115127 else :
116128 if ruthless :
@@ -121,6 +133,7 @@ def summary(self):
121133 continue
122134 else :
123135 logging .debug ("Ruthless and lenient parsing did not work. Returning raw html" )
136+ confidence = 0 ;
124137 article = self .html .find ('body' ) or self .html
125138
126139 cleaned_article = self .sanitize (article , candidates )
@@ -129,7 +142,7 @@ def summary(self):
129142 ruthless = False
130143 continue # try again
131144 else :
132- return cleaned_article
145+ return Summary ( confidence , cleaned_article )
133146 except StandardError , e :
134147 #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
135148 logging .exception ('error getting summary: ' )
0 commit comments