diff roundup/backends/indexer_rdbms.py @ 3333:3096c4b10960 maint-0.8

perform word splitting in Unicode to support national characters [SF#1195739]
author Alexander Smishlajev <a1s@users.sourceforge.net>
date Sun, 22 May 2005 18:02:00 +0000
parents a7045bad20de
children b24b75f2a728
line wrap: on
line diff
--- a/roundup/backends/indexer_rdbms.py	Wed May 18 05:40:43 2005 +0000
+++ b/roundup/backends/indexer_rdbms.py	Sun May 22 18:02:00 2005 +0000
@@ -15,7 +15,7 @@
         '''close the indexing database'''
         # just nuke the circular reference
         self.db = None
-  
+
     def force_reindex(self):
         '''Force a reindexing of the database.  This essentially
         empties the tables ids and index and sets a flag so
@@ -51,7 +51,9 @@
             self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the words in the text
-        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        text = unicode(text, "utf-8", "replace").upper()
+        wordlist = [w.encode("utf-8", "replace")
+                for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
         words = {}
         for word in wordlist:
             if is_stopword(word):
@@ -73,7 +75,7 @@
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
-        '''        
+        '''
         if not wordlist:
             return {}
 

Roundup Issue Tracker: http://roundup-tracker.org/