diff roundup/backends/indexer_rdbms.py @ 3331:7bc09d5d9544

Perform word splitting in Unicode to support national (non-ASCII) characters [SF#1195739]
author Alexander Smishlajev <a1s@users.sourceforge.net>
date Sun, 22 May 2005 17:55:00 +0000
parents a615cc230160
children 89a5c8e86346
line wrap: on
line diff
--- a/roundup/backends/indexer_rdbms.py	Wed May 18 05:39:21 2005 +0000
+++ b/roundup/backends/indexer_rdbms.py	Sun May 22 17:55:00 2005 +0000
@@ -1,4 +1,4 @@
-#$Id: indexer_rdbms.py,v 1.9 2005-04-28 00:21:42 richard Exp $
+#$Id: indexer_rdbms.py,v 1.10 2005-05-22 17:55:00 a1s Exp $
 ''' This implements the full-text indexer over two RDBMS tables. The first
 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
 propname, itemid) instances.
@@ -21,7 +21,7 @@
         '''Save the changes to the index.'''
         # not necessary - the RDBMS connection will handle this for us
         pass
-  
+
     def force_reindex(self):
         '''Force a reindexing of the database.  This essentially
         empties the tables ids and index and sets a flag so
@@ -57,7 +57,9 @@
             self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the words in the text
-        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        text = unicode(text, "utf-8", "replace").upper()
+        wordlist = [w.encode("utf-8", "replace")
+                for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
         words = {}
         for word in wordlist:
             if is_stopword(word):
@@ -79,7 +81,7 @@
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
-        '''        
+        '''
         if not wordlist:
             return {}
 

Roundup Issue Tracker: http://roundup-tracker.org/