@@ -94,7 +94,7 @@ def text_length(i):
9494 return len (clean (i .text_content () or "" ))
9595
9696
97- def clean_segment_extension (num_segments , index , segment ):
97+ def clean_segment_extension (segments , index , segment ):
9898 if segment .find ('.' ) == - 1 :
9999 return segment
100100 else :
@@ -107,7 +107,7 @@ def clean_segment_extension(num_segments, index, segment):
107107 return split_segment [0 ]
108108
109109
110- def clean_segment_ewcms (num_segments , index , segment ):
110+ def clean_segment_ewcms (segments , index , segment ):
111111 """
112112 EW-CMS specific segment cleaning. Quoth the original source:
113113 "EW-CMS specific segment replacement. Ugly.
@@ -116,10 +116,10 @@ def clean_segment_ewcms(num_segments, index, segment):
116116 return segment .replace (',00' , '' )
117117
118118
119- def clean_segment_page_number (num_segments , index , segment ):
119+ def clean_segment_page_number (segments , index , segment ):
120120 # If our first or second segment has anything looking like a page number,
121121 # remove it.
122- if index >= (num_segments - 2 ):
122+ if index >= (len ( segments ) - 2 ):
123123 pattern = r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$'
124124 cleaned = re .sub (pattern , '' , segment , re .IGNORECASE )
125125 if cleaned == '' :
@@ -130,23 +130,37 @@ def clean_segment_page_number(num_segments, index, segment):
130130 return segment
131131
132132
def clean_segment_number(segments, index, segment):
    """
    Drop a segment that is nothing but a one- or two-digit number when it
    sits in one of the last two positions — it is almost certainly a page
    number. Returns the segment unchanged otherwise, or None to drop it.
    """
    near_end = index >= (len(segments) - 2)
    is_short_number = re.search(r'^\d{1,2}$', segment) is not None
    if near_end and is_short_number:
        return None
    return segment
140140
141-
def clean_segment_index(segments, index, segment):
    """
    Drop a final segment that is exactly 'index' (case-insensitive);
    every other segment passes through unchanged.
    """
    last_position = len(segments) - 1
    if index == last_position and segment.lower() == 'index':
        return None
    return segment
147146
148147
149- def clean_segment (num_segments , index , segment ):
def clean_segment_short(segments, index, segment):
    # NOTE(review): the intent here is murky. The upstream readability
    # comment says: "If our first or second segment is smaller than 3
    # characters, and the first segment was purely alphas, remove it."
    # The implementation, however, requires that the *last* segment
    # contain no alphabetic characters at all (not "purely alphas").
    # That quirk is deliberate per the original author, so it is
    # preserved exactly here.
    near_end = index >= (len(segments) - 2)
    last_has_alpha = re.search(r'[a-z]', segments[-1], re.IGNORECASE) is not None
    if near_end and len(segment) < 3 and not last_has_alpha:
        return None
    return segment
162+
163+ def clean_segment (segments , index , segment ):
150164 """
151165 Cleans a single segment of a URL to find the base URL. The base URL is as
152166 a reference when evaluating URLs that might be next-page links. Returns a
@@ -158,13 +172,14 @@ def clean_segment(num_segments, index, segment):
158172 clean_segment_ewcms ,
159173 clean_segment_page_number ,
160174 clean_segment_number ,
161- clean_segment_index
175+ clean_segment_index ,
176+ clean_segment_short
162177 ]
163178 cleaned_segment = segment
164179 for func in funcs :
165180 if cleaned_segment is None :
166181 break
167- cleaned_segment = func (num_segments , index , cleaned_segment )
182+ cleaned_segment = func (segments , index , cleaned_segment )
168183 return cleaned_segment
169184
170185
@@ -174,7 +189,7 @@ def filter_none(seq):
174189
def clean_segments(segments):
    """
    Pass every URL segment through clean_segment, then strip out the
    segments that were cleaned away entirely (returned as None).
    """
    cleaned = []
    for position, segment in enumerate(segments):
        cleaned.append(clean_segment(segments, position, segment))
    return filter_none(cleaned)
0 commit comments