Merge Jerry: pull in the ability to get back confidence score as well as the processed html

jcharum · mitechie · commit b78d7e85012e · 2012-04-20T20:49:07.000-04:00
diff --git a/README.rst b/README.rst
@@ -54,6 +54,16 @@ As a Library
     readable_article = Document(html).summary()
     readable_title = Document(html).short_title()
 
+You can also use the `get_summary_with_metadata` method to get back other
+metadata such as the confidence score found while processing the input.
+
+::
+
+    doc = Document(html).get_summary_with_metadata()
+    print doc.html
+    print doc.confidence
+
+
 Optional `Document` keyword argument:
 
 - attributes:
diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
@@ -4,6 +4,7 @@
 import sys
 
 from collections import defaultdict
+from collections import namedtuple
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
@@ -87,6 +88,11 @@ def text_length(i):
     return len(clean(i.text_content() or ""))
 
 
+# We want to change over the Summary to a namedtuple to be more memory
+# efficient and because it doesn't need to be mutable.
+Summary = namedtuple('Summary', ['html', 'confidence'])
+
+
 class Document:
     """Class to build a etree document out of html."""
     TEXT_LENGTH_THRESHOLD = 25
@@ -139,11 +145,33 @@ def title(self):
     def short_title(self):
         return shorten_title(self.html)
 
+    def get_summary_with_metadata(self, enclose_with_html_tag=True):
+        """Parse the input content and return the raw Summary namedtuple.
+
+        :param enclose_with_html_tag: True to wrap the result in a full
+        <html> document, False to return only the <div> html partial.
+
+        """
+        # Unlike summary(), hand back the whole Summary (html + confidence).
+        return self._summary(
+            enclose_with_html_tag=enclose_with_html_tag)
+
     def summary(self, enclose_with_html_tag=True):
-        """Generate the summary of the html docuemnt
+        """Generate the summary of the html document
+
+        :param enclose_with_html_tag: True to wrap the result in a full
+        <html> document, False to return only the <div> html partial.
+
+        """
+        summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
+        # Return only the html so the pre-existing API stays unchanged.
+        return summary.html
+
+    def _summary(self, enclose_with_html_tag=True):
+        """Helper used in a few places to generate the summary of the content
 
-        :param enclose_with_html_tag: return only the div of the document,
-        don't wrap in html and body tags.
+        :param enclose_with_html_tag: Bool do you want a full <html> document
+        or just the <div> html partial.
 
         """
         try:
@@ -162,6 +190,7 @@ def summary(self, enclose_with_html_tag=True):
                 best_candidate = self.select_best_candidate(candidates)
 
                 if best_candidate:
+                    confidence = best_candidate['content_score']
                     article = self.get_article(candidates, best_candidate,
                             enclose_with_html_tag=enclose_with_html_tag)
                 else:
@@ -177,7 +206,8 @@ def summary(self, enclose_with_html_tag=True):
                         log.debug(
                             ("Ruthless and lenient parsing did not work. "
                              "Returning raw html"))
-                        article = self.html.find('body')
+                        article = self.html.find('body') or self.html
+                        confidence = 0
                         if article is None:
                             article = self.html
                 cleaned_article = self.sanitize(article, candidates)
@@ -191,7 +221,7 @@ def summary(self, enclose_with_html_tag=True):
                     # Loop through and try again.
                     continue
                 else:
-                    return cleaned_article
+                    return Summary(confidence=confidence, html=cleaned_article)
         except StandardError, e:
             log.exception('error getting summary: ')
             raise Unparseable(str(e)), None, sys.exc_info()[2]
diff --git a/src/tests/test_article_only.py b/src/tests/test_article_only.py
@@ -36,3 +36,15 @@ def test_si_sample_html_partial(self):
         doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary(enclose_with_html_tag=False)
         self.assertEqual('<div><div class="', res[0:17])
+
+    def test_si_sample_full_summary(self):
+        """We should parse the doc and get a full summary with confidence"""
+        sample = load_sample('si-game.sample.html')
+        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
+        res = doc.get_summary_with_metadata(enclose_with_html_tag=False)
+        self.assertTrue(hasattr(res, 'html'), 'res should have an html attrib')
+        self.assertTrue(hasattr(res, 'confidence'), 'res should have a confidence attrib')
+        self.assertEqual('<div><div class="', res.html[0:17])
+        self.assertTrue(res.confidence > 50,
+            'The confidence score should be larger than 50: ' + str(res.confidence))
+