
Commit 62df355

jcharum authored and mitechie committed

Checkpoint of multi-page article work

This implements some basic tools needed by the multi-page article algorithm.

Conflicts:
    src/readability_lxml/readability.py

1 parent 29fceeb commit 62df355

2 files changed (+130, -9 lines)


readability/urlfetch.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import urllib2
+
+class UrlFetch():
+    """
+    A class for fetching URLs. This provides a layer of abstraction that can
+    be easily replaced for testing.
+    """
+
+    def urlread(self, url):
+        return urllib2.urlopen(url).read()
+
+
+class MockUrlFetch(UrlFetch):
+
+    def __init__(self, urldict):
+        self._urldict = urldict
+
+    def urlread(self, url):
+        path = self._urldict[url]
+        with open(path, 'r') as f:
+            return f.read()
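
As a usage sketch (not part of this commit), MockUrlFetch can stand in for UrlFetch in a test so that urlread() is answered from a local fixture file instead of the network. The URL and fixture path below are hypothetical:

    # Map article URLs to local fixture files (hypothetical paths).
    fetch = MockUrlFetch({
        'http://example.com/article/page-1': 'tests/fixtures/page-1.html',
    })
    # urlread() now returns the fixture contents instead of hitting the network.
    html = fetch.urlread('http://example.com/article/page-1')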

src/readability_lxml/readability.py

Lines changed: 109 additions & 9 deletions
@@ -2,6 +2,7 @@
 import logging
 import re
 import sys
+import urlparse

 from collections import defaultdict
 from collections import namedtuple
@@ -93,7 +94,93 @@ def text_length(i):
     return len(clean(i.text_content() or ""))


+def clean_segment_extension(num_segments, index, segment):
+    if segment.find('.') == -1:
+        return segment
+    else:
+        split_segment = segment.split('.')
+        possible_type = split_segment[1]
+        has_non_alpha = re.search(r'[^a-zA-Z]', possible_type)
+        if has_non_alpha:
+            return segment
+        else:
+            return split_segment[0]
+
+
+def clean_segment_ewcms(num_segments, index, segment):
+    """
+    EW-CMS specific segment cleaning. Quoth the original source:
+        "EW-CMS specific segment replacement. Ugly.
+        Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html"
+    """
+    return segment.replace(',00', '')
+

+def clean_segment_page_number(num_segments, index, segment):
+    # If the last or second-to-last segment has anything looking like a page
+    # number, remove it.
+    if index >= (num_segments - 2):
+        pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
+        cleaned = re.sub(pattern, '', segment, flags=re.IGNORECASE)
+        if cleaned == '':
+            return None
+        else:
+            return cleaned
+    else:
+        return segment
+
+
+def clean_segment_number(num_segments, index, segment):
+    # If this is purely a number and it's the last or second-to-last segment,
+    # it's probably a page number. Remove it.
+    if index >= (num_segments - 2) and re.search(r'^\d{1,2}$', segment):
+        return None
+    else:
+        return segment
+
+
+def clean_segment(num_segments, index, segment):
+    """
+    Cleans a single segment of a URL to find the base URL. The base URL is
+    used as a reference when evaluating URLs that might be next-page links.
+    Returns a cleaned segment string or None, if the segment should be
+    omitted entirely from the base URL.
+    """
+    funcs = [
+        clean_segment_extension,
+        clean_segment_ewcms,
+        clean_segment_page_number,
+        clean_segment_number
+    ]
+    cleaned_segment = segment
+    for func in funcs:
+        if cleaned_segment is None:
+            break
+        cleaned_segment = func(num_segments, index, cleaned_segment)
+    return cleaned_segment
+
+
+def filter_none(seq):
+    return [x for x in seq if x is not None]
+
+
+def clean_segments(segments):
+    cleaned = [
+        clean_segment(len(segments), i, s)
+        for i, s in enumerate(segments)
+    ]
+    return filter_none(cleaned)
+
+
+def find_base_url(url):
+    if url is None:
+        return None
+    parts = urlparse.urlsplit(url)
+    segments = parts.path.split('/')
+    cleaned_segments = clean_segments(segments)
+    new_path = '/'.join(cleaned_segments)
+    new_parts = (parts.scheme, parts.netloc, new_path, '', '')
+    return urlparse.urlunsplit(new_parts)


 class Document:
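
For illustration only (this example is not part of the diff), the segment-cleaning pipeline above strips trailing page markers and file extensions so that candidate next-page links can be compared against a common base URL; the URL below is made up:

    # Hypothetical example: '.html' is dropped by clean_segment_extension and
    # the trailing '-p2' by clean_segment_page_number.
    find_base_url('http://example.com/story/my-article-p2.html')
    # -> 'http://example.com/story/my-article'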
@@ -254,9 +341,21 @@ def get_article(self, candidates, best_candidate,
                 append = True
             sibling_key = sibling  # HashableElement(sibling)
             if sibling_key in candidates:
+                # Print out sibling information for debugging.
+                sibling_candidate = candidates[sibling_key]
+                self.debug(
+                    "Sibling: %6.3f %s" %
+                    (sibling_candidate['content_score'], describe(sibling))
+                )
+
                 sib_threshhold = sibling_score_threshold
                 if candidates[sibling_key]['content_score'] >= sib_threshhold:
                     append = True
+            else:
+                self.debug("Sibling: %s" % describe(sibling))
+
+            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+                append = True

             if sibling.tag == "p":
                 link_density = self.get_link_density(sibling)
@@ -314,6 +413,7 @@ def score_paragraphs(self, ):
         candidates = {}
         ordered = []
         for elem in self.tags(self.html, "p", "pre", "td"):
+            self.debug('Scoring %s' % describe(elem))
             parent_node = elem.getparent()
             if parent_node is None:
                 continue
@@ -418,15 +518,9 @@ def remove_unlikely_candidates(self):

     def transform_misused_divs_into_paragraphs(self):
         for elem in self.tags(self.html, 'div'):
-            # transform <div>s that do not contain other block elements into
-            # <p>s
-            #FIXME: The current implementation ignores all descendants that
-            # are not direct children of elem
-            # This results in incorrect results in case there is an <img>
-            # buried within an <a> for example
-            if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
-                #self.debug("Altering %s to p" % (describe(elem)))
+            # transform <div>s that do not contain other block elements into <p>s
+            if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+                self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 #print "Fixed element "+describe(elem)

@@ -436,6 +530,7 @@ def transform_misused_divs_into_paragraphs(self):
                 p.text = elem.text
                 elem.text = None
                 elem.insert(0, p)
+                self.debug("Appended %s to %s" % (tounicode(p), describe(elem)))
                 #print "Appended "+tounicode(p)+" to "+describe(elem)

             for pos, child in reversed(list(enumerate(elem))):
@@ -444,11 +539,16 @@ def transform_misused_divs_into_paragraphs(self):
                     p.text = child.tail
                     child.tail = None
                     elem.insert(pos + 1, p)
+                    self.debug("Inserted %s to %s" % (tounicode(p), describe(elem)))
                     #print "Inserted "+tounicode(p)+" to "+describe(elem)
                 if child.tag == 'br':
                     #print 'Dropped <br> at '+describe(elem)
                     child.drop_tree()

+    def findNextPageLink(self, elem):
+        allLinks = self.tags(elem, 'a')
+        baseUrl = find_base_url(self.options['url'])
+
     def tags(self, node, *tag_names):
         for tag_name in tag_names:
             for e in node.findall('.//%s' % tag_name):