22import logging
33import re
44import sys
5+ import urlparse
56
67from collections import defaultdict
78from collections import namedtuple
@@ -93,7 +94,93 @@ def text_length(i):
9394 return len (clean (i .text_content () or "" ))
9495
9596
def clean_segment_extension(num_segments, index, segment):
    """
    Strip a file-extension-like suffix from a URL path segment.

    The text between the first and second dot is examined; when it is
    purely alphabetic (e.g. "html", "php") it is treated as a file
    extension and everything from the first dot onward is dropped.
    Otherwise the segment is returned untouched, since a dotted suffix
    containing digits or punctuation is probably meaningful content.
    """
    if '.' not in segment:
        return segment
    pieces = segment.split('.')
    candidate_type = pieces[1]
    if re.search(r'[^a-zA-Z]', candidate_type):
        # Not a plain alphabetic extension -- keep the segment whole.
        return segment
    return pieces[0]
108+
109+
def clean_segment_ewcms(num_segments, index, segment):
    """
    Drop every ",00" marker from a URL path segment.

    EW-CMS specific segment cleaning. Quoth the original source:
    "EW-CMS specific segment replacement. Ugly.
    Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html"
    """
    # Splitting on the marker and re-joining removes all occurrences,
    # exactly like str.replace(',00', '').
    return ''.join(segment.split(',00'))
117+
96118
def clean_segment_page_number(num_segments, index, segment):
    """
    Remove a trailing page-number marker (e.g. "-p2", "_page3", "-2")
    from one of the last two URL path segments.

    Returns the cleaned segment, or None when the segment consisted of
    nothing but the page marker and should be dropped entirely.
    """
    if index < (num_segments - 2):
        # Only the last two path segments plausibly carry a page number.
        return segment
    # Bug fix: re.IGNORECASE was previously passed as the positional
    # `count` argument of re.sub, so the flag was never applied and
    # upper-case markers like "_P2" survived.
    cleaned = re.sub(r'((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$', '', segment,
                     flags=re.IGNORECASE)
    if cleaned == '':
        return None
    return cleaned
131+
132+
def clean_segment_number(num_segments, index, segment):
    """
    Drop a segment that is nothing but a one- or two-digit number when
    it sits in one of the last two path positions -- it is almost
    certainly a page number. Returns None to drop it, otherwise the
    segment unchanged.
    """
    in_last_two = index >= (num_segments - 2)
    if in_last_two and re.search(r'^\d{1,2}$', segment):
        return None
    return segment
140+
141+
def clean_segment(num_segments, index, segment):
    """
    Cleans a single segment of a URL to find the base URL. The base URL
    is used as a reference when evaluating URLs that might be next-page
    links. Returns a cleaned segment string or None, if the segment
    should be omitted entirely from the base URL.
    """
    cleaners = (
        clean_segment_extension,
        clean_segment_ewcms,
        clean_segment_page_number,
        clean_segment_number,
    )
    result = segment
    for cleaner in cleaners:
        if result is None:
            # A previous cleaner voted to drop the segment; stop early.
            return None
        result = cleaner(num_segments, index, result)
    return result
161+
162+
def filter_none(seq):
    """Return a list of the items in *seq* that are not None."""
    return list(filter(lambda item: item is not None, seq))
165+
166+
def clean_segments(segments):
    """
    Run every URL path segment through clean_segment and drop the ones
    that came back as None.
    """
    total = len(segments)
    cleaned = []
    for position, segment in enumerate(segments):
        cleaned.append(clean_segment(total, position, segment))
    return filter_none(cleaned)
173+
174+
def find_base_url(url):
    """
    Derive a base URL (scheme + host + cleaned path, with the query and
    fragment discarded) from *url*. The result is used as a reference
    when evaluating candidate next-page links. Returns None when no URL
    was supplied.
    """
    if url is None:
        return None
    parts = urlparse.urlsplit(url)
    cleaned_path = '/'.join(clean_segments(parts.path.split('/')))
    rebuilt = (parts.scheme, parts.netloc, cleaned_path, '', '')
    return urlparse.urlunsplit(rebuilt)
97184
98185
99186class Document :
@@ -254,9 +341,21 @@ def get_article(self, candidates, best_candidate,
254341 append = True
255342 sibling_key = sibling # HashableElement(sibling)
256343 if sibling_key in candidates :
344+ # Print out sibling information for debugging.
345+ sibling_candidate = candidates [sibling_key ]
346+ self .debug (
347+ "Sibling: %6.3f %s" %
348+ (sibling_candidate ['content_score' ], describe (sibling ))
349+ )
350+
257351 sib_threshhold = sibling_score_threshold
258352 if candidates [sibling_key ]['content_score' ] >= sib_threshhold :
259353 append = True
354+ else :
355+ self .debug ("Sibling: %s" % describe (sibling ))
356+
357+ if sibling_key in candidates and candidates [sibling_key ]['content_score' ] >= sibling_score_threshold :
358+ append = True
260359
261360 if sibling .tag == "p" :
262361 link_density = self .get_link_density (sibling )
@@ -314,6 +413,7 @@ def score_paragraphs(self, ):
314413 candidates = {}
315414 ordered = []
316415 for elem in self .tags (self .html , "p" , "pre" , "td" ):
416+ self .debug ('Scoring %s' % describe (elem ))
317417 parent_node = elem .getparent ()
318418 if parent_node is None :
319419 continue
@@ -418,15 +518,9 @@ def remove_unlikely_candidates(self):
418518
419519 def transform_misused_divs_into_paragraphs (self ):
420520 for elem in self .tags (self .html , 'div' ):
421- # transform <div>s that do not contain other block elements into
422- # <p>s
423- #FIXME: The current implementation ignores all descendants that
424- # are not direct children of elem
425- # This results in incorrect results in case there is an <img>
426- # buried within an <a> for example
427- if not REGEXES ['divToPElementsRe' ].search (
428- unicode ('' .join (map (tostring , list (elem ))))):
429- #self.debug("Altering %s to p" % (describe(elem)))
521+ # transform <div>s that do not contain other block elements into <p>s
522+ if not REGEXES ['divToPElementsRe' ].search (unicode ('' .join (map (tostring , list (elem ))))):
523+ self .debug ("Altering %s to p" % (describe (elem )))
430524 elem .tag = "p"
431525 #print "Fixed element "+describe(elem)
432526
@@ -436,6 +530,7 @@ def transform_misused_divs_into_paragraphs(self):
436530 p .text = elem .text
437531 elem .text = None
438532 elem .insert (0 , p )
533+ self .debug ("Appended %s to %s" % (tounicode (p ), describe (elem )))
439534 #print "Appended "+tounicode(p)+" to "+describe(elem)
440535
441536 for pos , child in reversed (list (enumerate (elem ))):
@@ -444,11 +539,16 @@ def transform_misused_divs_into_paragraphs(self):
444539 p .text = child .tail
445540 child .tail = None
446541 elem .insert (pos + 1 , p )
542+ self .debug ("Inserted %s to %s" % (tounicode (p ), describe (elem )))
447543 #print "Inserted "+tounicode(p)+" to "+describe(elem)
448544 if child .tag == 'br' :
449545 #print 'Dropped <br> at '+describe(elem)
450546 child .drop_tree ()
451547
548+ def findNextPageLink (self , elem ):
549+ allLinks = self .tags (elem , ['a' ])
550+ baseUrl = self .find_base_url (self .options ['url' ])
551+
452552 def tags (self , node , * tag_names ):
453553 for tag_name in tag_names :
454554 for e in node .findall ('.//%s' % tag_name ):
0 commit comments