22import logging
33import re
44import sys
5+ import urlparse
56
67from collections import defaultdict
78from collections import namedtuple
@@ -93,7 +94,93 @@ def text_length(i):
9394 return len (clean (i .text_content () or "" ))
9495
9596
def clean_segment_extension(num_segments, index, segment):
    """
    Strip a file-extension-like suffix from a URL path segment.

    The text between the first and second dot is examined; when it is
    purely alphabetic (e.g. "html", "php") it is treated as a file
    extension and everything from the first dot onward is dropped.
    Otherwise the segment is returned untouched, since a dotted suffix
    containing digits or punctuation is probably meaningful content.
    """
    if '.' not in segment:
        return segment
    pieces = segment.split('.')
    candidate_type = pieces[1]
    if re.search(r'[^a-zA-Z]', candidate_type):
        # Not a plain alphabetic extension -- keep the segment whole.
        return segment
    return pieces[0]
108+
109+
def clean_segment_ewcms(num_segments, index, segment):
    """
    Drop every ",00" marker from a URL path segment.

    EW-CMS specific segment cleaning. Quoth the original source:
    "EW-CMS specific segment replacement. Ugly.
    Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html"
    """
    # Splitting on the marker and re-joining removes all occurrences,
    # exactly like str.replace(',00', '').
    return ''.join(segment.split(',00'))
117+
96118
def clean_segment_page_number(num_segments, index, segment):
    """
    Remove a trailing page-number marker (e.g. "-p2", "_page3", "-2")
    from one of the last two URL path segments.

    Returns the cleaned segment, or None when the segment consisted of
    nothing but the page marker and should be dropped entirely.
    """
    if index < (num_segments - 2):
        # Only the last two path segments plausibly carry a page number.
        return segment
    # Bug fix: re.IGNORECASE was previously passed as the positional
    # `count` argument of re.sub, so the flag was never applied and
    # upper-case markers like "_P2" survived.
    cleaned = re.sub(r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$', '', segment,
                     flags=re.IGNORECASE)
    if cleaned == '':
        return None
    return cleaned
131+
132+
def clean_segment_number(num_segments, index, segment):
    """
    Drop a segment that is nothing but a one- or two-digit number when
    it sits in one of the last two path positions -- it is almost
    certainly a page number. Returns None to drop it, otherwise the
    segment unchanged.
    """
    in_last_two = index >= (num_segments - 2)
    if in_last_two and re.search(r'^\d{1,2}$', segment):
        return None
    return segment
140+
141+
def clean_segment(num_segments, index, segment):
    """
    Cleans a single segment of a URL to find the base URL. The base URL
    is used as a reference when evaluating URLs that might be next-page
    links. Returns a cleaned segment string or None, if the segment
    should be omitted entirely from the base URL.
    """
    cleaners = (
        clean_segment_extension,
        clean_segment_ewcms,
        clean_segment_page_number,
        clean_segment_number,
    )
    result = segment
    for cleaner in cleaners:
        if result is None:
            # A previous cleaner voted to drop the segment; stop early.
            return None
        result = cleaner(num_segments, index, result)
    return result
161+
162+
def filter_none(seq):
    """Return a list of the items in *seq* that are not None."""
    return list(filter(lambda item: item is not None, seq))
165+
166+
def clean_segments(segments):
    """
    Run every URL path segment through clean_segment and drop the ones
    that came back as None.
    """
    total = len(segments)
    cleaned = []
    for position, segment in enumerate(segments):
        cleaned.append(clean_segment(total, position, segment))
    return filter_none(cleaned)
173+
174+
def find_base_url(url):
    """
    Derive a base URL (scheme + host + cleaned path, with the query and
    fragment discarded) from *url*. The result is used as a reference
    when evaluating candidate next-page links. Returns None when no URL
    was supplied.
    """
    if url is None:
        return None
    parts = urlparse.urlsplit(url)
    cleaned_path = '/'.join(clean_segments(parts.path.split('/')))
    rebuilt = (parts.scheme, parts.netloc, cleaned_path, '', '')
    return urlparse.urlunsplit(rebuilt)
97184
98185
99186class Document :
@@ -254,9 +341,21 @@ def get_article(self, candidates, best_candidate,
254341 append = True
255342 sibling_key = sibling # HashableElement(sibling)
256343 if sibling_key in candidates :
344+ # Print out sibling information for debugging.
345+ sibling_candidate = candidates [sibling_key ]
346+ self .debug (
347+ "Sibling: %6.3f %s" %
348+ (sibling_candidate ['content_score' ], describe (sibling ))
349+ )
350+
257351 sib_threshhold = sibling_score_threshold
258352 if candidates [sibling_key ]['content_score' ] >= sib_threshhold :
259353 append = True
354+ else :
355+ self .debug ("Sibling: %s" % describe (sibling ))
356+
357+ if sibling_key in candidates and candidates [sibling_key ]['content_score' ] >= sibling_score_threshold :
358+ append = True
260359
261360 if sibling .tag == "p" :
262361 link_density = self .get_link_density (sibling )
@@ -314,6 +413,7 @@ def score_paragraphs(self, ):
314413 candidates = {}
315414 ordered = []
316415 for elem in self .tags (self .html , "p" , "pre" , "td" ):
416+ self .debug ('Scoring %s' % describe (elem ))
317417 parent_node = elem .getparent ()
318418 if parent_node is None :
319419 continue
@@ -418,15 +518,9 @@ def remove_unlikely_candidates(self):
418518
419519 def transform_misused_divs_into_paragraphs (self ):
420520 for elem in self .tags (self .html , 'div' ):
421- # transform <div>s that do not contain other block elements into
422- # <p>s
423- #FIXME: The current implementation ignores all descendants that
424- # are not direct children of elem
425- # This results in incorrect results in case there is an <img>
426- # buried within an <a> for example
427- if not REGEXES ['divToPElementsRe' ].search (
428- unicode ('' .join (map (tostring , list (elem ))))):
429- #self.debug("Altering %s to p" % (describe(elem)))
521+ # transform <div>s that do not contain other block elements into <p>s
522+ if not REGEXES ['divToPElementsRe' ].search (unicode ('' .join (map (tostring , list (elem ))))):
523+ self .debug ("Altering %s to p" % (describe (elem )))
430524 elem .tag = "p"
431525 #print "Fixed element "+describe(elem)
432526
@@ -436,6 +530,7 @@ def transform_misused_divs_into_paragraphs(self):
436530 p .text = elem .text
437531 elem .text = None
438532 elem .insert (0 , p )
533+ self .debug ("Appended %s to %s" % (tounicode (p ), describe (elem )))
439534 #print "Appended "+tounicode(p)+" to "+describe(elem)
440535
441536 for pos , child in reversed (list (enumerate (elem ))):
@@ -444,11 +539,16 @@ def transform_misused_divs_into_paragraphs(self):
444539 p .text = child .tail
445540 child .tail = None
446541 elem .insert (pos + 1 , p )
542+ self .debug ("Inserted %s to %s" % (tounicode (p ), describe (elem )))
447543 #print "Inserted "+tounicode(p)+" to "+describe(elem)
448544 if child .tag == 'br' :
449545 #print 'Dropped <br> at '+describe(elem)
450546 child .drop_tree ()
451547
548+ def findNextPageLink (self , elem ):
549+ allLinks = self .tags (elem , ['a' ])
550+ baseUrl = self .find_base_url (self .options ['url' ])
551+
452552 def tags (self , node , * tag_names ):
453553 for tag_name in tag_names :
454554 for e in node .findall ('.//%s' % tag_name ):
0 commit comments