Skip to content

Commit aa51283

Browse files
committed
work on doing some more pep8 work on things
1 parent a4b6957 commit aa51283

1 file changed

Lines changed: 40 additions & 35 deletions

File tree

src/readability_lxml/readability.py

Lines changed: 40 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,28 @@
2222

2323

2424
REGEXES = {
25-
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
26-
'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
27-
'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
28-
'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
29-
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
25+
'unlikelyCandidatesRe': re.compile(
26+
('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
27+
'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
28+
'tweet|twitter'), re.I),
29+
'okMaybeItsACandidateRe': re.compile(
30+
'and|article|body|column|main|shadow', re.I),
31+
'positiveRe': re.compile(
32+
('article|body|content|entry|hentry|main|page|pagination|post|text|'
33+
'blog|story'), re.I),
34+
'negativeRe': re.compile(
35+
('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|'
36+
'outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|'
37+
'tool|widget'), re.I),
38+
'divToPElementsRe': re.compile(
39+
'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
3040
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
3141
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
3242
#'trimRe': re.compile('^\s+|\s+$/'),
3343
#'normalizeRe': re.compile('\s{2,}/'),
3444
#'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
3545
#'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
36-
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
46+
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
3747
}
3848

3949

@@ -132,8 +142,8 @@ def short_title(self):
132142
def summary(self, enclose_with_html_tag=False):
133143
"""Generate the summary of the html document
134144
135-
:param enclose_with_html_tag: return only the div of the document, don't wrap
136-
in html and body tags.
145+
:param enclose_with_html_tag: return only the div of the document,
146+
don't wrap in html and body tags.
137147
138148
"""
139149
try:
@@ -187,7 +197,8 @@ def summary(self, enclose_with_html_tag=False):
187197
log.exception('error getting summary: ')
188198
raise Unparseable(str(e)), None, sys.exc_info()[2]
189199

190-
def get_article(self, candidates, best_candidate, enclose_with_html_tag=False):
200+
def get_article(self, candidates, best_candidate,
201+
enclose_with_html_tag=False):
191202
# Now that we have the top candidate, look through its siblings for
192203
# content that might also be related.
193204
# Things like preambles, content split by ads that we removed, etc.
@@ -235,7 +246,9 @@ def get_article(self, candidates, best_candidate, enclose_with_html_tag=False):
235246
return output
236247

237248
def select_best_candidate(self, candidates):
238-
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
249+
sorted_candidates = sorted(candidates.values(),
250+
key=lambda x: x['content_score'],
251+
reverse=True)
239252
for candidate in sorted_candidates[:5]:
240253
elem = candidate['elem']
241254
self.debug("Top 5 : %6.3f %s" % (
@@ -466,7 +479,8 @@ def sanitize(self, node, candidates):
466479
reason = "less than 3x <p>s than <input>s"
467480
to_remove = True
468481
elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
469-
reason = "too short content length %s without a single image" % content_length
482+
reason = ('too short content length %s without a single'
483+
' image') % content_length
470484
to_remove = True
471485
elif weight < 25 and link_density > 0.2:
472486
reason = "too many links %.3f for its weight %s" % (
@@ -477,44 +491,34 @@ def sanitize(self, node, candidates):
477491
link_density, weight)
478492
to_remove = True
479493
elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
480-
reason = "<embed>s with too short content length, or too many <embed>s"
494+
reason = ('<embed>s with too short content length, or too'
495+
' many <embed>s')
481496
to_remove = True
482-
# if el.tag == 'div' and counts['img'] >= 1 and to_remove:
483-
# imgs = el.findall('.//img')
484-
# valid_img = False
485-
# self.debug(tounicode(el))
486-
# for img in imgs:
487-
#
488-
# height = img.get('height')
489-
# text_length = img.get('text_length')
490-
# self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
491-
# if to_int(height) >= 100 or to_int(text_length) >= 100:
492-
# valid_img = True
493-
# self.debug("valid image" + tounicode(img))
494-
# break
495-
# if valid_img:
496-
# to_remove = False
497-
# self.debug("Allowing %s" %el.text_content())
498-
# for desnode in self.tags(el, "table", "ul", "div"):
499-
# allowed[desnode] = True
500-
501-
#find x non empty preceding and succeeding siblings
497+
498+
# NOTE: the purpose of this sibling scan is unclear. Originally
499+
# the i/j updates were written `=+`, which assigns the value 1
500+
# instead of incrementing; that was presumably meant to be `+=`.
501+
# Either way the counters are compared to x, which is hard set
502+
# to 1, so each loop stops at the first sibling that has text.
503+
# Will have to investigate when we get to testing more
504+
# pages.
502505
i, j = 0, 0
503506
x = 1
507+
504508
siblings = []
505509
for sib in el.itersiblings():
506510
#self.debug(sib.text_content())
507511
sib_content_length = text_length(sib)
508512
if sib_content_length:
509-
i =+ 1
513+
i += 1
510514
siblings.append(sib_content_length)
511515
if i == x:
512516
break
513517
for sib in el.itersiblings(preceding=True):
514518
#self.debug(sib.text_content())
515519
sib_content_length = text_length(sib)
516520
if sib_content_length:
517-
j =+ 1
521+
j += 1
518522
siblings.append(sib_content_length)
519523
if j == x:
520524
break
@@ -526,7 +530,8 @@ def sanitize(self, node, candidates):
526530
allowed[desnode] = True
527531

528532
if to_remove:
529-
self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
533+
self.debug(
534+
"Cleaned %6.3f %s with weight %s cause it has %s." %
530535
(content_score, describe(el), weight, reason))
531536
#print tounicode(el)
532537
#self.debug("pname %s pweight %.3f" %(pname, pweight))

0 commit comments

Comments
 (0)