changeset 3991:13161539e5bd

improved URL matching
author Richard Jones <richard@users.sourceforge.net>
date Mon, 18 Aug 2008 06:03:06 +0000
parents 0728808fdf5c
children fe2af84a5ca5
files CHANGES.txt roundup/cgi/templating.py test/test_templating.py
diffstat 3 files changed, 56 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/CHANGES.txt	Mon Aug 18 05:09:00 2008 +0000
+++ b/CHANGES.txt	Mon Aug 18 06:03:06 2008 +0000
@@ -19,6 +19,7 @@
 - Prevent broken pipe errors in csv export (sf patch #1911449)
 - Session API and cleanup thanks anatoly t.
 - Make WSGI handler threadsafe (sf #1968027)
+- Improved URL matching RE (sf #2038858)
 
 
 2008-03-01 1.4.4
--- a/roundup/cgi/templating.py	Mon Aug 18 05:09:00 2008 +0000
+++ b/roundup/cgi/templating.py	Mon Aug 18 06:03:06 2008 +0000
@@ -1245,9 +1245,19 @@
         return self.is_edit_ok()
 
 class StringHTMLProperty(HTMLProperty):
-    hyper_re = re.compile(r'((?P<url>\w{3,6}://\S+)|'
-                          r'(?P<email>[-+=%/\w\.]+@[\w\.\-]+)|'
-                          r'(?P<item>(?P<class>[A-Za-z_]+)(\s*)(?P<id>\d+)))')
+    hyper_re = re.compile(r'''(
+        (?P<url>
+          ((ht|f)tp(s?)://|www\.)?          # prefix
+          ([\w]+:\w+@)?                     # username/password
+          (([\w\-]+\.)+([\w]{2,5}))         # hostname
+          (:[\d]{1,5})?                     # port
+          (/[\w\-$.+!*(),;:@&=?/~\\#%]*)?   # path etc.
+        )|
+        (?P<email>[-+=%/\w\.]+@[\w\.\-]+)|
+        (?P<item>(?P<class>[A-Za-z_]+)(\s*)(?P<id>\d+))
+    )''', re.X | re.I)
+    protocol_re = re.compile('^(ht|f)tp(s?)://', re.I)
+
     def _hyper_repl_item(self,match,replacement):
         item = match.group('item')
         cls = match.group('class').lower()
@@ -1263,8 +1273,16 @@
 
     def _hyper_repl(self, match):
         if match.group('url'):
-            s = match.group('url')
-            return '<a href="%s">%s</a>'%(s, s)
+            u = s = match.group('url')
+            if not self.protocol_re.search(s):
+                u = 'http://' + s
+            # catch an escaped ">" at the end of the URL
+            if s.endswith('&gt;'):
+                u = s = s[:-4]
+                e = '&gt;'
+            else:
+                e = ''
+            return '<a href="%s">%s</a>%s'%(u, s, e)
         elif match.group('email'):
             s = match.group('email')
             return '<a href="mailto:%s">%s</a>'%(s, s)
--- a/test/test_templating.py	Mon Aug 18 05:09:00 2008 +0000
+++ b/test/test_templating.py	Mon Aug 18 06:03:06 2008 +0000
@@ -88,6 +88,38 @@
         cls = HTMLClass(self.client, "issue")
         cls["nosy"]
 
+    def test_url_match(self):
+        '''Test the URL regular expression in StringHTMLProperty.
+        '''
+        def t(s, **groups):
+            m = StringHTMLProperty.hyper_re.search(s)
+            self.assertNotEquals(m, None, '%r did not match'%s)
+            d = m.groupdict()
+            for g in groups:
+                self.assertEquals(d[g], groups[g], '%s %r != %r in %r'%(g, d[g],
+                    groups[g], s))
+
+        #t('123.321.123.321', 'url')
+        t('http://roundup.net/', url='http://roundup.net/')
+        t('<HTTP://roundup.net/>', url='HTTP://roundup.net/')
+        t('www.a.ex', url='www.a.ex')
+        t('http://a.ex', url='http://a.ex')
+        t('http://a.ex/?foo&bar=baz\\.@!$%()qwerty',
+            url='http://a.ex/?foo&bar=baz\\.@!$%()qwerty')
+        t('www.net', url='www.net')
+        t('richard@com.example', email='richard@com.example')
+        t('r@a.com', email='r@a.com')
+        t('i1', **{'class':'i', 'id':'1'})
+        t('item123', **{'class':'item', 'id':'123'})
+
+    def test_url_replace(self):
+        p = StringHTMLProperty(self.client, 'test', '1', None, 'test', '')
+        def t(s): return p.hyper_re.sub(p._hyper_repl, s)
+        ae = self.assertEquals
+        ae(t('http://roundup.net/'), '<a href="http://roundup.net/">http://roundup.net/</a>')
+        ae(t('&lt;HTTP://roundup.net/&gt;'), '&lt;<a href="HTTP://roundup.net/">HTTP://roundup.net/</a>&gt;')
+        ae(t('&lt;www.roundup.net&gt;'), '&lt;<a href="http://www.roundup.net">www.roundup.net</a>&gt;')
+
 '''
 class HTMLPermissions:
     def is_edit_ok(self):

Roundup Issue Tracker: http://roundup-tracker.org/