Correct appending and allow for document only

mitechie · mitechie · commit 5a98e2c1b85b · 2012-04-16T20:55:13.000-04:00
- Fix sibling appending so siblings are added to the correct nested element
- Add a document-only flag so that you can get a DOM tree you can nest
yourself, without html/body tags.
diff --git a/README b/README
@@ -33,3 +33,8 @@ Usage::
 Command-line usage::
 
     python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
+
+
+Document() kwarg options:
+    url=xxx will run make_links_absolute()
+
diff --git a/readability/readability.py b/readability/readability.py
@@ -98,7 +98,6 @@ def summary(self, document_only=False):
             ruthless = True
             while True:
                 self._html(True)
-
                 for i in self.tags(self.html, 'script', 'style'):
                     i.drop_tree()
                 for i in self.tags(self.html, 'body'):
@@ -111,7 +110,8 @@ def summary(self, document_only=False):
                 best_candidate = self.select_best_candidate(candidates)
 
                 if best_candidate:
-                    article = self.get_article(candidates, best_candidate)
+                    article = self.get_article(candidates, best_candidate,
+                            document_only=document_only)
                 else:
                     if ruthless:
                         logging.debug("ruthless removal did not work. ")
@@ -136,12 +136,15 @@ def summary(self, document_only=False):
             logging.exception('error getting summary: ' )
             raise Unparseable(str(e)), None, sys.exc_info()[2]
 
-    def get_article(self, candidates, best_candidate):
+    def get_article(self, candidates, best_candidate, document_only=False):
         # Now that we have the top candidate, look through its siblings for content that might also be related.
         # Things like preambles, content split by ads that we removed, etc.
-
         sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
-        output = document_fromstring('<div/>')
+        # Create the output root: a bare <div> if document_only, else html->body->div
+        if document_only:
+            output = fragment_fromstring('<div/>')
+        else:
+            output = document_fromstring('<div/>')
         best_elem = best_candidate['elem']
         for sibling in best_elem.getparent().getchildren():
             #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
@@ -163,7 +166,12 @@ def get_article(self, candidates, best_candidate):
                     append = True
 
             if append:
-                output.append(sibling)
+                # For a full document, append to the div in html->body->div, not
+                # to the root; with document_only, output is already that div
+                if document_only:
+                    output.append(sibling)
+                else:
+                    output.getchildren()[0].getchildren()[0].append(sibling)
         #if output is not None:
         #    output.append(best_elem)
         return output
@@ -454,13 +462,7 @@ def sanitize(self, node, candidates):
             if not (self.options['attributes']):
                 #el.attrib = {} #FIXME:Checkout the effects of disabling this
                 pass
-        # There can be two nodes here. We really want to tounicode only one of
-        # them.
-        # To start with let's hack it to get the longest tree as our document.
-        if len(node.getchildren()) > 1:
-            children = node.getchildren()
-            sorted_list = sorted(children, key=len, reverse=True)
-            node = sorted_list[0]
+
         return clean_attributes(tounicode(node))
 
 
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
@@ -21,19 +21,19 @@ class TestArticleOnly(unittest.TestCase):
 
     """
 
-    def setUp(self):
-        """"""
-        pass
-
-    def tearDown(self):
-        """"""
-        pass
-
     def test_si_sample(self):
+        """Using the si sample, load article with only opening body element"""
+        sample = load_sample('si-game.sample.html')
+        doc = Document(
+            sample,
+            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
+        res = doc.summary()
+        self.assertEqual('<html><body><div><div class', res[0:27])
+
+    def test_si_sample_doc_only(self):
         """Using the si sample, make sure we can get the article alone."""
         sample = load_sample('si-game.sample.html')
-        doc = Document(sample)
+        doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary(document_only=True)
-
-        self.assertEqual('<div class="', res[0:12])
+        self.assertEqual('<div><div class="', res[0:17])