Mercurial > p > roundup > code
view website/issues/extensions/spambayes.py @ 5543:bc3e00a3d24b
MySQL backend fixes for Python 3.
With Python 2, text sent to and from MySQL is treated as bytes in
Python. The database may be recorded by MySQL as having some other
encoding (latin1 being the default in some MySQL versions - Roundup
does not set an encoding explicitly, unlike in back_postgresql), but
as long as MySQL's notion of the connection encoding agrees with its
notion of the database encoding, no conversions actually take place
and the bytes are stored and returned as-is.
With Python 3, text sent to and from MySQL is treated as Python
Unicode strings. When the database and connection encoding is latin1,
that means the bytes stored in the database under Python 2 are
interpreted as latin1 and converted from that to Unicode, producing
incorrect results for any non-ASCII characters; furthermore, if trying
to store new non-ASCII data in the database under Python 3, any
non-latin1 characters produce errors.
This patch arranges for both the connection and database character
sets to be UTF-8 when using Python 3, and documents a need to export
and import the database when moving from Python 2 to Python 3 with
this backend.
| author | Joseph Myers <jsm@polyomino.org.uk> |
|---|---|
| date | Sun, 16 Sep 2018 16:19:20 +0000 |
| parents | e46ce04d5bbc |
| children |
line wrap: on
line source
import re, math from roundup.cgi.actions import Action from roundup.cgi.exceptions import * from roundup.anypy import xmlrpc_ import socket REVPAT = re.compile(r'(r[0-9]+\b|rev(ision)? [0-9]+\b)') def extract_classinfo(db, classname, nodeid): node = db.getnode(classname, nodeid) authorage = node['creation'].timestamp() - \ db.getnode('user', node.get('author', node.get('creator')))['creation'].timestamp() authorid = node.get('author', node.get('creator')) content = db.getclass(classname).get(nodeid, 'content') tokens = ["klass:%s" % classname, "author:%s" % authorid, "authorage:%d" % int(math.log(authorage)), "hasrev:%s" % (REVPAT.search(content) is not None)] return (content, tokens) def train_spambayes(db, content, tokens, is_spam): spambayes_uri = db.config.detectors['SPAMBAYES_URI'] server = xmlrpc_.client.ServerProxy(spambayes_uri, verbose=False) try: server.train({'content':content}, tokens, {}, is_spam) return (True, None) except (socket.error, xmlrpc_.client.Error) as e: return (False, str(e)) class SpambayesClassify(Action): permissionType = 'SB: May Classify' def handle(self): (content, tokens) = extract_classinfo(self.db, self.classname, self.nodeid) if "trainspam" in self.form: is_spam = True elif "trainham" in self.form: is_spam = False (status, errmsg) = train_spambayes(self.db, content, tokens, is_spam) node = self.db.getnode(self.classname, self.nodeid) props = {} if status: if node.get('spambayes_misclassified', False): props['spambayes_misclassified'] = True props['spambayes_score'] = 1.0 s = " SPAM" if not is_spam: props['spambayes_score'] = 0.0 s = " HAM" self.client.add_ok_message(self._('Message classified as') + s) else: self.client.add_error_message(self._('Unable to classify message, got error:') + errmsg) klass = self.db.getclass(self.classname) klass.set(self.nodeid, **props) self.db.commit() def sb_is_spam(obj): cutoff_score = float(obj._db.config.detectors['SPAMBAYES_SPAM_CUTOFF']) try: score = obj['spambayes_score'] except KeyError: return False return score >= cutoff_score def init(instance): instance.registerAction("spambayes_classify", SpambayesClassify) instance.registerUtil('sb_is_spam', sb_is_spam)
