Mercurial > p > roundup > code
diff roundup/backends/indexer_rdbms.py @ 3333:3096c4b10960 maint-0.8
perform word splitting in unicode for national characters support
[SF#1195739]
| author | Alexander Smishlajev <a1s@users.sourceforge.net> |
|---|---|
| date | Sun, 22 May 2005 18:02:00 +0000 |
| parents | a7045bad20de |
| children | b24b75f2a728 |
line wrap: on
line diff
```diff
--- a/roundup/backends/indexer_rdbms.py Wed May 18 05:40:43 2005 +0000
+++ b/roundup/backends/indexer_rdbms.py Sun May 22 18:02:00 2005 +0000
@@ -15,7 +15,7 @@
         '''close the indexing database'''
         # just nuke the circular reference
         self.db = None
-
+
     def force_reindex(self):
         '''Force a reindexing of the database. This essentially
         empties the tables ids and index and sets a flag so
@@ -51,7 +51,9 @@
         self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the words in the text
-        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        text = unicode(text, "utf-8", "replace").upper()
+        wordlist = [w.encode("utf-8", "replace")
+            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
         words = {}
         for word in wordlist:
             if is_stopword(word):
@@ -73,7 +75,7 @@
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
-        '''
+        '''
         if not wordlist:
             return {}
 
```
