44import sys
55
66from collections import defaultdict
7+ from collections import namedtuple
78from lxml .etree import tostring
89from lxml .etree import tounicode
910from lxml .html import document_fromstring
@@ -87,6 +88,11 @@ def text_length(i):
8788 return len (clean (i .text_content () or "" ))
8889
8990
91+ # We want to change over the Summary to a namedtuple to be more memory
92+ # efficient and because it doesn't need to be mutable.
93+ Summary = namedtuple ('Summary' , ['html' , 'confidence' ])
94+
95+
9096class Document :
9197 """Class to build a etree document out of html."""
9298 TEXT_LENGTH_THRESHOLD = 25
@@ -139,11 +145,33 @@ def title(self):
139145 def short_title (self ):
140146 return shorten_title (self .html )
141147
148+ def get_summary_with_metadata (self , enclose_with_html_tag = True ):
149+ """Parse the input content and return a Summary object
150+
151+ :param enclose_with_html_tag: Bool do you want a full <html> document
152+ or just the <div> html partial.
153+
154+ """
155+ summary = self ._summary (enclose_with_html_tag = enclose_with_html_tag )
156+ # For this call return the raw Summary object.
157+ return summary
158+
142159 def summary (self , enclose_with_html_tag = True ):
143- """Generate the summary of the html docuemnt
160+ """Generate the summary of the html document
161+
162+ :param enclose_with_html_tag: Bool do you want a full <html> document
163+ or just the <div> html partial.
164+
165+ """
166+ summary = self ._summary (enclose_with_html_tag = enclose_with_html_tag )
167+ # Only return the html to be consistent with the backwards api.
168+ return summary .html
169+
170+ def _summary (self , enclose_with_html_tag = True ):
171+ """Helper used in a few places to generate the summary of the content
144172
145- :param enclose_with_html_tag: return only the div of the document,
146- don't wrap in html and body tags .
173+ :param enclose_with_html_tag: Bool do you want a full <html> document
174+ or just the <div> html partial .
147175
148176 """
149177 try :
@@ -162,6 +190,7 @@ def summary(self, enclose_with_html_tag=True):
162190 best_candidate = self .select_best_candidate (candidates )
163191
164192 if best_candidate :
193+ confidence = best_candidate ['content_score' ]
165194 article = self .get_article (candidates , best_candidate ,
166195 enclose_with_html_tag = enclose_with_html_tag )
167196 else :
@@ -177,7 +206,8 @@ def summary(self, enclose_with_html_tag=True):
177206 log .debug (
178207 ("Ruthless and lenient parsing did not work. "
179208 "Returning raw html" ))
180- article = self .html .find ('body' )
209+ article = self .html .find ('body' ) or self .html
210+ confidence = 0
181211 if article is None :
182212 article = self .html
183213 cleaned_article = self .sanitize (article , candidates )
@@ -191,7 +221,7 @@ def summary(self, enclose_with_html_tag=True):
191221 # Loop through and try again.
192222 continue
193223 else :
194- return cleaned_article
224+ return Summary ( confidence = confidence , html = cleaned_article )
195225 except StandardError , e :
196226 log .exception ('error getting summary: ' )
197227 raise Unparseable (str (e )), None , sys .exc_info ()[2 ]
0 commit comments