Mercurial > p > roundup > code
diff roundup/backends/indexer_rdbms.py @ 3331:7bc09d5d9544
perform word splitting in unicode for national characters support
[SF#1195739]
| author | Alexander Smishlajev <a1s@users.sourceforge.net> |
|---|---|
| date | Sun, 22 May 2005 17:55:00 +0000 |
| parents | a615cc230160 |
| children | 89a5c8e86346 |
line wrap: on
line diff
```diff
--- a/roundup/backends/indexer_rdbms.py Wed May 18 05:39:21 2005 +0000
+++ b/roundup/backends/indexer_rdbms.py Sun May 22 17:55:00 2005 +0000
@@ -1,4 +1,4 @@
-#$Id: indexer_rdbms.py,v 1.9 2005-04-28 00:21:42 richard Exp $
+#$Id: indexer_rdbms.py,v 1.10 2005-05-22 17:55:00 a1s Exp $
 ''' This implements the full-text indexer over two RDBMS tables. The first
 is a mapping of words to occurance IDs. The second maps the IDs to (Class,
 propname, itemid) instances.
@@ -21,7 +21,7 @@
         '''Save the changes to the index.'''
         # not necessary - the RDBMS connection will handle this for us
         pass
-
+
     def force_reindex(self):
         '''Force a reindexing of the database. This essentially
         empties the tables ids and index and sets a flag so
@@ -57,7 +57,9 @@
         self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the words in the text
-        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        text = unicode(text, "utf-8", "replace").upper()
+        wordlist = [w.encode("utf-8", "replace")
+            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
         words = {}
         for word in wordlist:
             if is_stopword(word):
@@ -79,7 +81,7 @@
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
-        '''
+        '''
         if not wordlist:
             return {}
```
