@@ -98,7 +98,6 @@ def summary(self, document_only=False):
9898 ruthless = True
9999 while True :
100100 self ._html (True )
101-
102101 for i in self .tags (self .html , 'script' , 'style' ):
103102 i .drop_tree ()
104103 for i in self .tags (self .html , 'body' ):
@@ -111,7 +110,8 @@ def summary(self, document_only=False):
111110 best_candidate = self .select_best_candidate (candidates )
112111
113112 if best_candidate :
114- article = self .get_article (candidates , best_candidate )
113+ article = self .get_article (candidates , best_candidate ,
114+ document_only = document_only )
115115 else :
116116 if ruthless :
117117 logging .debug ("ruthless removal did not work. " )
@@ -136,12 +136,15 @@ def summary(self, document_only=False):
136136 logging .exception ('error getting summary: ' )
137137 raise Unparseable (str (e )), None , sys .exc_info ()[2 ]
138138
139- def get_article (self , candidates , best_candidate ):
139+ def get_article (self , candidates , best_candidate , document_only = False ):
140140 # Now that we have the top candidate, look through its siblings for content that might also be related.
141141 # Things like preambles, content split by ads that we removed, etc.
142-
143142 sibling_score_threshold = max ([10 , best_candidate ['content_score' ] * 0.2 ])
144- output = document_fromstring ('<div/>' )
143+ # create the output container: a bare <div> fragment when document_only is set, otherwise a new html document with html->body->div
144+ if document_only :
145+ output = fragment_fromstring ('<div/>' )
146+ else :
147+ output = document_fromstring ('<div/>' )
145148 best_elem = best_candidate ['elem' ]
146149 for sibling in best_elem .getparent ().getchildren ():
147150 #if isinstance(sibling, NavigableString): continue#in lxml there is no concept of simple text
@@ -163,7 +166,12 @@ def get_article(self, candidates, best_candidate):
163166 append = True
164167
165168 if append :
166- output .append (sibling )
169+ # For a fragment, output is the div itself, so append directly; for a full
170+ # document, append to the div in html->body->div rather than to the root
171+ if document_only :
172+ output .append (sibling )
173+ else :
174+ output .getchildren ()[0 ].getchildren ()[0 ].append (sibling )
167175 #if output is not None:
168176 # output.append(best_elem)
169177 return output
@@ -454,13 +462,7 @@ def sanitize(self, node, candidates):
454462 if not (self .options ['attributes' ]):
455463 #el.attrib = {} #FIXME:Checkout the effects of disabling this
456464 pass
457- # There can be two nodes here. We really want to tounicode only one of
458- # them.
459- # To start with let's hack it to get the longest tree as our document.
460- if len (node .getchildren ()) > 1 :
461- children = node .getchildren ()
462- sorted_list = sorted (children , key = len , reverse = True )
463- node = sorted_list [0 ]
465+
464466 return clean_attributes (tounicode (node ))
465467
466468
0 commit comments