Commit 6abc6f7

jcharum authored and mitechie committed
Add cleaning of short segments
Conflicts: src/readability_lxml/readability.py
1 parent 1e30e33 commit 6abc6f7

1 file changed: +28 -13 lines

src/readability_lxml/readability.py

Lines changed: 28 additions & 13 deletions

@@ -94,7 +94,7 @@ def text_length(i):
     return len(clean(i.text_content() or ""))
 
 
-def clean_segment_extension(num_segments, index, segment):
+def clean_segment_extension(segments, index, segment):
     if segment.find('.') == -1:
         return segment
     else:
@@ -107,7 +107,7 @@ def clean_segment_extension(num_segments, index, segment):
         return split_segment[0]
 
 
-def clean_segment_ewcms(num_segments, index, segment):
+def clean_segment_ewcms(segments, index, segment):
     """
     EW-CMS specific segment cleaning. Quoth the original source:
         "EW-CMS specific segment replacement. Ugly.
@@ -116,10 +116,10 @@ def clean_segment_ewcms(num_segments, index, segment):
     return segment.replace(',00', '')
 
 
-def clean_segment_page_number(num_segments, index, segment):
+def clean_segment_page_number(segments, index, segment):
     # If our first or second segment has anything looking like a page number,
     # remove it.
-    if index >= (num_segments - 2):
+    if index >= (len(segments) - 2):
         pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
         cleaned = re.sub(pattern, '', segment, re.IGNORECASE)
         if cleaned == '':
@@ -130,23 +130,37 @@ def clean_segment_page_number(num_segments, index, segment):
         return segment
 
 
-def clean_segment_number(num_segments, index, segment):
+def clean_segment_number(segments, index, segment):
     # If this is purely a number, and it's the first or second segment, it's
     # probably a page number. Remove it.
-    if index >= (num_segments - 2) and re.search(r'^\d{1,2}$', segment):
+    if index >= (len(segments) - 2) and re.search(r'^\d{1,2}$', segment):
         return None
     else:
         return segment
 
-
-def clean_segment_index(num_segments, index, segment):
-    if index == (num_segments - 1) and segment.lower() == 'index':
+def clean_segment_index(segments, index, segment):
+    if index == (len(segments) - 1) and segment.lower() == 'index':
         return None
     else:
         return segment
 
 
-def clean_segment(num_segments, index, segment):
+def clean_segment_short(segments, index, segment):
+    # It is not clear to me what this is accomplishing. The original
+    # readability source just says:
+    #
+    # "If our first or second segment is smaller than 3 characters, and the
+    # first segment was purely alphas, remove it."
+    #
+    # However, the code actually checks to make sure that there are no alphas
+    # in the segment, rather than checking for purely alphas.
+    alphas = re.search(r'[a-z]', segments[-1], re.IGNORECASE)
+    if index >= (len(segments) - 2) and len(segment) < 3 and not alphas:
+        return None
+    else:
+        return segment
+
+
+def clean_segment(segments, index, segment):
     """
     Cleans a single segment of a URL to find the base URL. The base URL is as
     a reference when evaluating URLs that might be next-page links. Returns a
@@ -158,13 +172,14 @@ def clean_segment(num_segments, index, segment):
         clean_segment_ewcms,
         clean_segment_page_number,
         clean_segment_number,
-        clean_segment_index
+        clean_segment_index,
+        clean_segment_short
     ]
     cleaned_segment = segment
     for func in funcs:
         if cleaned_segment is None:
             break
-        cleaned_segment = func(num_segments, index, cleaned_segment)
+        cleaned_segment = func(segments, index, cleaned_segment)
     return cleaned_segment
 
 
@@ -174,7 +189,7 @@ def filter_none(seq):
 
 
 def clean_segments(segments):
     cleaned = [
-        clean_segment(len(segments), i, s)
+        clean_segment(segments, i, s)
         for i, s in enumerate(segments)
     ]
     return filter_none(cleaned)
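
As a quick illustration of what the new clean_segment_short check does, here is a minimal standalone sketch. The function body is copied from the diff above; the example segments lists and the print calls are hypothetical, and how readability.py actually splits a URL path into segments is outside this commit.

import re


def clean_segment_short(segments, index, segment):
    # Drop a segment shorter than 3 characters when it is one of the last two
    # segments and the last segment contains no letters (note the caveat in
    # the commit's own comment: it checks for no alphas, not purely alphas).
    alphas = re.search(r'[a-z]', segments[-1], re.IGNORECASE)
    if index >= (len(segments) - 2) and len(segment) < 3 and not alphas:
        return None
    else:
        return segment


# Hypothetical path segments for a paginated article URL.
print(clean_segment_short(['news', 'story', '_2'], 2, '_2'))  # None: short, no letters in last segment
print(clean_segment_short(['news', 'go'], 1, 'go'))           # 'go': last segment contains letters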
