Skip to content

Commit ab783b2

Browse files
committed
Merge pull request buriy#11 from JanX2/master
Fixing gap in node_length coverage (length=80 was missed). Continue early in remove_unlikely_candidates() in case there is neither a class nor an id attribute. Adding comment about oversight in transform_misused_divs_into_paragraphs.
2 parents f9b604c + 3cdc3d6 commit ab783b2

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

readability/readability.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def get_article(self, candidates, best_candidate):
159159

160160
if node_length > 80 and link_density < 0.25:
161161
append = True
162-
elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
162+
elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
163163
append = True
164164

165165
if append:
@@ -280,6 +280,8 @@ def debug(self, *a):
280280
def remove_unlikely_candidates(self):
281281
for elem in self.html.iter():
282282
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
283+
if len(s) < 2:
284+
continue
283285
#self.debug(s)
284286
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
285287
self.debug("Removing unlikely candidate - %s" % describe(elem))
@@ -288,6 +290,8 @@ def remove_unlikely_candidates(self):
288290
def transform_misused_divs_into_paragraphs(self):
289291
for elem in self.tags(self.html, 'div'):
290292
# transform <div>s that do not contain other block elements into <p>s
293+
#FIXME: The current implementation ignores all descendants that are not direct children of elem
294+
# This produces incorrect results when, for example, an <img> is buried within an <a>
291295
if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
292296
#self.debug("Altering %s to p" % (describe(elem)))
293297
elem.tag = "p"

0 commit comments

Comments
 (0)