2222
2323
2424REGEXES = {
25- 'unlikelyCandidatesRe' : re .compile ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter' , re .I ),
26- 'okMaybeItsACandidateRe' : re .compile ('and|article|body|column|main|shadow' , re .I ),
27- 'positiveRe' : re .compile ('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story' , re .I ),
28- 'negativeRe' : re .compile ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget' , re .I ),
29- 'divToPElementsRe' : re .compile ('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
25+ 'unlikelyCandidatesRe' : re .compile (
26+ ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
27+ 'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
28+ 'tweet|twitter' ), re .I ),
29+ 'okMaybeItsACandidateRe' : re .compile (
30+ 'and|article|body|column|main|shadow' , re .I ),
31+ 'positiveRe' : re .compile (
32+ ('article|body|content|entry|hentry|main|page|pagination|post|text|'
33+ 'blog|story' ), re .I ),
34+ 'negativeRe' : re .compile (
35+ ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
36+ 'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
37+ 'tool|widget' ), re .I ),
38+ 'divToPElementsRe' : re .compile (
39+ '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
3040 #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
3141 #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
3242 #'trimRe': re.compile('^\s+|\s+$/'),
3343 #'normalizeRe': re.compile('\s{2,}/'),
3444 #'killBreaksRe': re.compile('(<br\s*\/?>(\s| ?)*){1,}/'),
3545 #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
36- #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
46+ #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
3747}
3848
3949
@@ -132,8 +142,8 @@ def short_title(self):
132142 def summary (self , enclose_with_html_tag = False ):
133143 """Generate the summary of the html document
134144
135- :param enclose_with_html_tag: return only the div of the document, don't wrap
136- in html and body tags.
145+ :param enclose_with_html_tag: return only the div of the document,
146+ don't wrap in html and body tags.
137147
138148 """
139149 try :
@@ -187,7 +197,8 @@ def summary(self, enclose_with_html_tag=False):
187197 log .exception ('error getting summary: ' )
188198 raise Unparseable (str (e )), None , sys .exc_info ()[2 ]
189199
190- def get_article (self , candidates , best_candidate , enclose_with_html_tag = False ):
200+ def get_article (self , candidates , best_candidate ,
201+ enclose_with_html_tag = False ):
191202 # Now that we have the top candidate, look through its siblings for
192203 # content that might also be related.
193204 # Things like preambles, content split by ads that we removed, etc.
@@ -235,7 +246,9 @@ def get_article(self, candidates, best_candidate, enclose_with_html_tag=False):
235246 return output
236247
237248 def select_best_candidate (self , candidates ):
238- sorted_candidates = sorted (candidates .values (), key = lambda x : x ['content_score' ], reverse = True )
249+ sorted_candidates = sorted (candidates .values (),
250+ key = lambda x : x ['content_score' ],
251+ reverse = True )
239252 for candidate in sorted_candidates [:5 ]:
240253 elem = candidate ['elem' ]
241254 self .debug ("Top 5 : %6.3f %s" % (
@@ -466,7 +479,8 @@ def sanitize(self, node, candidates):
466479 reason = "less than 3x <p>s than <input>s"
467480 to_remove = True
468481 elif content_length < (MIN_LEN ) and (counts ["img" ] == 0 or counts ["img" ] > 2 ):
469- reason = "too short content length %s without a single image" % content_length
482+ reason = ('too short content length %s without a single'
483+ ' image' ) % content_length
470484 to_remove = True
471485 elif weight < 25 and link_density > 0.2 :
472486 reason = "too many links %.3f for its weight %s" % (
@@ -477,44 +491,34 @@ def sanitize(self, node, candidates):
477491 link_density , weight )
478492 to_remove = True
479493 elif (counts ["embed" ] == 1 and content_length < 75 ) or counts ["embed" ] > 1 :
480- reason = "<embed>s with too short content length, or too many <embed>s"
494+ reason = ('<embed>s with too short content length, or too'
495+ ' many <embed>s' )
481496 to_remove = True
482- # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
483- # imgs = el.findall('.//img')
484- # valid_img = False
485- # self.debug(tounicode(el))
486- # for img in imgs:
487- #
488- # height = img.get('height')
489- # text_length = img.get('text_length')
490- # self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
491- # if to_int(height) >= 100 or to_int(text_length) >= 100:
492- # valid_img = True
493- # self.debug("valid image" + tounicode(img))
494- # break
495- # if valid_img:
496- # to_remove = False
497- # self.debug("Allowing %s" %el.text_content())
498- # for desnode in self.tags(el, "table", "ul", "div"):
499- # allowed[desnode] = True
500-
501- #find x non empty preceding and succeeding siblings
497+
498+ # NOTE: the intent of this sibling scan is unclear. Originally
499+ # i and j were assigned with `=+ 1`, which always sets them to
500+ # 1; that was presumably meant to be `+= 1` (an increment).
501+ # Either way the counters are compared against x, which is
502+ # hard-coded to 1, so each loop stops at the first sibling
503+ # that has any text. Needs investigation once we get to
504+ # testing more pages.
502505 i , j = 0 , 0
503506 x = 1
507+
504508 siblings = []
505509 for sib in el .itersiblings ():
506510 #self.debug(sib.text_content())
507511 sib_content_length = text_length (sib )
508512 if sib_content_length :
509- i = + 1
513+ i += 1
510514 siblings .append (sib_content_length )
511515 if i == x :
512516 break
513517 for sib in el .itersiblings (preceding = True ):
514518 #self.debug(sib.text_content())
515519 sib_content_length = text_length (sib )
516520 if sib_content_length :
517- j = + 1
521+ j += 1
518522 siblings .append (sib_content_length )
519523 if j == x :
520524 break
@@ -526,7 +530,8 @@ def sanitize(self, node, candidates):
526530 allowed [desnode ] = True
527531
528532 if to_remove :
529- self .debug ("Cleaned %6.3f %s with weight %s cause it has %s." %
533+ self .debug (
534+ "Cleaned %6.3f %s with weight %s cause it has %s." %
530535 (content_score , describe (el ), weight , reason ))
531536 #print tounicode(el)
532537 #self.debug("pname %s pweight %.3f" %(pname, pweight))
0 commit comments