@@ -217,9 +217,10 @@ def get_article(self, candidates, best_candidate,
217217 if sibling is best_elem :
218218 append = True
219219 sibling_key = sibling # HashableElement(sibling)
220- if sibling_key in candidates and \
221- candidates [sibling_key ]['content_score' ] >= sibling_score_threshold :
222- append = True
220+ if sibling_key in candidates :
221+ sib_threshhold = sibling_score_threshold
222+ if candidates [sibling_key ]['content_score' ] >= sib_threshhold :
223+ append = True
223224
224225 if sibling .tag == "p" :
225226 link_density = self .get_link_density (sibling )
@@ -294,10 +295,11 @@ def score_paragraphs(self, ):
294295 candidates [parent_node ] = self .score_node (parent_node )
295296 ordered .append (parent_node )
296297
297- if grand_parent_node is not None and grand_parent_node not in candidates :
298- candidates [grand_parent_node ] = self .score_node (
299- grand_parent_node )
300- ordered .append (grand_parent_node )
298+ if grand_parent_node is not None :
299+ if grand_parent_node not in candidates :
300+ candidates [grand_parent_node ] = self .score_node (
301+ grand_parent_node )
302+ ordered .append (grand_parent_node )
301303
302304 content_score = 1
303305 content_score += len (inner_text .split (',' ))
@@ -308,7 +310,8 @@ def score_paragraphs(self, ):
308310 #WTF? candidates[elem]['content_score'] += content_score
309311 candidates [parent_node ]['content_score' ] += content_score
310312 if grand_parent_node is not None :
311- candidates [grand_parent_node ]['content_score' ] += content_score / 2.0
313+ add_to_score = content_score / 2.0
314+ candidates [grand_parent_node ]['content_score' ] += add_to_score
312315
313316 # Scale the final candidates score based on link density. Good content
314317 # should have a relatively small link density (5% or less) and be
@@ -370,9 +373,12 @@ def remove_unlikely_candidates(self):
370373 if len (s ) < 2 :
371374 continue
372375 #self.debug(s)
373- if REGEXES ['unlikelyCandidatesRe' ].search (s ) and (not REGEXES ['okMaybeItsACandidateRe' ].search (s )) and elem .tag not in ['html' , 'body' ]:
374- self .debug ("Removing unlikely candidate - %s" % describe (elem ))
375- elem .drop_tree ()
376+ if REGEXES ['unlikelyCandidatesRe' ].search (s ):
377+ if not REGEXES ['okMaybeItsACandidateRe' ].search (s ):
378+ if elem .tag not in ['html' , 'body' ]:
379+ self .debug ("Removing unlikely candidate - %s" %
380+ describe (elem ))
381+ elem .drop_tree ()
376382
377383 def transform_misused_divs_into_paragraphs (self ):
378384 for elem in self .tags (self .html , 'div' ):
@@ -421,7 +427,9 @@ def sanitize(self, node, candidates):
421427 MIN_LEN = self .options .get ('min_text_length' ,
422428 self .TEXT_LENGTH_THRESHOLD )
423429 for header in self .tags (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" ):
424- if self .class_weight (header ) < 0 or self .get_link_density (header ) > 0.33 :
430+ class_weight = self .class_weight (header )
431+ link_density = self .get_link_density (header )
432+ if class_weight < 0 or link_density > 0.33 :
425433 header .drop_tree ()
426434
427435 for elem in self .tags (node , "form" , "iframe" , "textarea" ):
@@ -455,7 +463,8 @@ def sanitize(self, node, candidates):
455463 parent_node = el .getparent ()
456464 if parent_node is not None :
457465 if parent_node in candidates :
458- content_score = candidates [parent_node ]['content_score' ]
466+ parent = candidates [parent_node ]
467+ content_score = parent ['content_score' ]
459468 else :
460469 content_score = 0
461470 #if parent_node is not None:
0 commit comments