Mercurial > p > roundup > code
comparison roundup/msgfmt.py @ 5450:f2fade4552c5
replaced msgfmt.py with latest version supporting Python 3
fixed setup scripts for Python 3
| author | Christof Meerwald <cmeerw@cmeerw.org> |
|---|---|
| date | Sat, 21 Jul 2018 16:29:20 +0100 |
| parents | 23b8e6067f7c |
| children | 4d2e1fa03f0f |
comparison
equal
deleted
inserted
replaced
| 5449:ddf1cf299ebc | 5450:f2fade4552c5 |
|---|---|
| 1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
| 2 # -*- coding: iso-8859-1 -*- | 2 # -*- coding: iso-8859-1 -*- |
| 3 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> | 3 # Written by Martin v. Loewis <loewis@informatik.hu-berlin.de> |
| 4 # Plural forms support added by alexander smishlajev <alex@tycobka.lv> | 4 # |
| 5 # Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless | |
| 6 # translation service (PTS) of Zope | |
| 7 # | |
| 8 # Fixed some bugs and updated to support msgctxt | |
| 9 # by Hanno Schlichting <hanno@hannosch.eu> | |
| 5 | 10 |
| 6 """Generate binary message catalog from textual translation description. | 11 """Generate binary message catalog from textual translation description. |
| 7 | 12 |
| 8 This program converts a textual Uniforum-style message catalog (.po file) into | 13 This program converts a textual Uniforum-style message catalog (.po file) into |
| 9 a binary GNU catalog (.mo file). This is essentially the same function as the | 14 a binary GNU catalog (.mo file). This is essentially the same function as the |
| 10 GNU msgfmt program, however, it is a simpler implementation. | 15 GNU msgfmt program, however, it is a simpler implementation. |
| 11 | 16 |
| 12 Usage: msgfmt.py [OPTIONS] filename.po | 17 This file was taken from Python-2.3.2/Tools/i18n and altered in several ways. |
| 13 | 18 Now you can simply use it from another python module: |
| 14 Options: | 19 |
| 15 -o file | 20 from msgfmt import Msgfmt |
| 16 --output-file=file | 21 mo = Msgfmt(po).get() |
| 17 Specify the output file to write to. If omitted, output will go to a | 22 |
| 18 file named filename.mo (based off the input file name). | 23 where po is path to a po file as string, an opened po file ready for reading or |
| 19 | 24 a list of strings (readlines of a po file) and mo is the compiled mo file as |
| 20 -h | 25 binary string. |
| 21 --help | 26 |
| 22 Print this message and exit. | 27 Exceptions: |
| 23 | 28 |
| 24 -V | 29 * IOError if the file couldn't be read |
| 25 --version | 30 |
| 26 Display version information and exit. | 31 * msgfmt.PoSyntaxError if the po file has syntax errors |
| 27 """ | 32 """ |
| 28 | 33 |
| 29 from __future__ import print_function | 34 import array |
| 35 from ast import literal_eval | |
| 36 import codecs | |
| 37 from email.parser import HeaderParser | |
| 38 import struct | |
| 30 import sys | 39 import sys |
| 31 import os | 40 |
| 32 import getopt | 41 PY3 = sys.version_info[0] == 3 |
| 33 import struct | 42 if PY3: |
| 34 import array | 43 def header_charset(s): |
| 35 | 44 p = HeaderParser() |
| 36 __version__ = "1.1" | 45 return p.parsestr(s).get_content_charset() |
| 37 | 46 |
| 38 MESSAGES = {} | 47 import io |
| 39 | 48 BytesIO = io.BytesIO |
| 40 | 49 FILE_TYPE = io.IOBase |
| 41 | 50 else: |
| 42 def usage(code, msg=''): | 51 def header_charset(s): |
| 43 print(__doc__, file=sys.stderr) | 52 p = HeaderParser() |
| 44 if msg: | 53 return p.parsestr(s.encode('utf-8', 'ignore')).get_content_charset() |
| 45 print(msg, file=sys.stderr) | 54 |
| 46 sys.exit(code) | 55 from cStringIO import StringIO as BytesIO |
| 47 | 56 FILE_TYPE = file |
| 48 | 57 |
| 49 | 58 |
| 50 def add(id, str, fuzzy): | 59 class PoSyntaxError(Exception): |
| 51 "Add a non-fuzzy translation to the dictionary." | 60 """ Syntax error in a po file """ |
| 52 global MESSAGES | 61 |
| 53 if not fuzzy and str and not str.startswith('\0'): | 62 def __init__(self, msg): |
| 54 MESSAGES[id] = str | 63 self.msg = msg |
| 55 | 64 |
| 56 | 65 def __str__(self): |
| 57 | 66 return 'Po file syntax error: %s' % self.msg |
| 58 def generate(): | 67 |
| 59 "Return the generated output." | 68 |
| 60 global MESSAGES | 69 class Msgfmt: |
| 61 # the keys are sorted in the .mo file | 70 |
| 62 keys = sorted(MESSAGES.keys()) | 71 def __init__(self, po, name='unknown'): |
| 63 offsets = [] | 72 self.po = po |
| 64 ids = strs = '' | 73 self.name = name |
| 65 for id in keys: | 74 self.messages = {} |
| 66 # For each string, we need size and file offset. Each string is NUL | 75 self.openfile = False |
| 67 # terminated; the NUL does not count into the size. | 76 # Start off assuming latin-1, so everything decodes without failure, |
| 68 offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) | 77 # until we know the exact encoding |
| 69 ids += id + '\0' | 78 self.encoding = 'latin-1' |
| 70 strs += MESSAGES[id] + '\0' | 79 |
| 71 output = '' | 80 def readPoData(self): |
| 72 # The header is 7 32-bit unsigned integers. We don't use hash tables, so | 81 """ read po data from self.po and return an iterator """ |
| 73 # the keys start right after the index tables. | 82 output = [] |
| 74 # translated string. | 83 if isinstance(self.po, str): |
| 75 keystart = 7*4+16*len(keys) | 84 output = open(self.po, 'rb') |
| 76 # and the values start after the keys | 85 elif isinstance(self.po, FILE_TYPE): |
| 77 valuestart = keystart + len(ids) | 86 self.po.seek(0) |
| 78 koffsets = [] | 87 self.openfile = True |
| 79 voffsets = [] | 88 output = self.po |
| 80 # The string table first has the list of keys, then the list of values. | 89 elif isinstance(self.po, list): |
| 81 # Each entry has first the size of the string, then the file offset. | 90 output = self.po |
| 82 for o1, l1, o2, l2 in offsets: | 91 if not output: |
| 83 koffsets += [l1, o1+keystart] | 92 raise ValueError("self.po is invalid! %s" % type(self.po)) |
| 84 voffsets += [l2, o2+valuestart] | 93 if isinstance(output, FILE_TYPE): |
| 85 offsets = koffsets + voffsets | 94 # remove BOM from the start of the parsed input |
| 86 output = struct.pack("Iiiiiii", | 95 first = output.readline() |
| 87 0x950412de, # Magic | 96 if len(first) == 0: |
| 88 0, # Version | 97 return output.readlines() |
| 89 len(keys), # # of entries | 98 if first.startswith(codecs.BOM_UTF8): |
| 90 7*4, # start of key index | 99 first = first.lstrip(codecs.BOM_UTF8) |
| 91 7*4+len(keys)*8, # start of value index | 100 return [first] + output.readlines() |
| 92 0, 0) # size and offset of hash table | 101 return output |
| 93 output += array.array("i", offsets).tostring() | 102 |
| 94 output += ids | 103 def add(self, context, id, string, fuzzy): |
| 95 output += strs | 104 "Add a non-empty and non-fuzzy translation to the dictionary." |
| 96 return output | 105 if string and not fuzzy: |
| 97 | 106 # The context is put before the id and separated by a EOT char. |
| 98 | 107 if context: |
| 99 | 108 id = context + u'\x04' + id |
| 100 def make(filename, outfile): | 109 if not id: |
| 101 ID = 1 | 110 # See whether there is an encoding declaration |
| 102 STR = 2 | 111 charset = header_charset(string) |
| 103 global MESSAGES | 112 if charset: |
| 104 MESSAGES = {} | 113 # decode header in proper encoding |
| 105 | 114 string = string.encode(self.encoding).decode(charset) |
| 106 msgid = None | 115 if not PY3: |
| 107 msgstr = None | 116 # undo damage done by literal_eval in Python 2.x |
| 108 | 117 string = string.encode(self.encoding).decode(charset) |
| 109 # Compute .mo name from .po name and arguments | 118 self.encoding = charset |
| 110 if filename.endswith('.po'): | 119 self.messages[id] = string |
| 111 infile = filename | 120 |
| 112 else: | 121 def generate(self): |
| 113 infile = filename + '.po' | 122 "Return the generated output." |
| 114 if outfile is None: | 123 # the keys are sorted in the .mo file |
| 115 outfile = os.path.splitext(infile)[0] + '.mo' | 124 keys = sorted(self.messages.keys()) |
| 116 | 125 offsets = [] |
| 117 try: | 126 ids = strs = b'' |
| 118 lines = open(infile).readlines() | 127 for id in keys: |
| 119 except IOError as msg: | 128 msg = self.messages[id].encode(self.encoding) |
| 120 print(msg, file=sys.stderr) | 129 id = id.encode(self.encoding) |
| 121 sys.exit(1) | 130 # For each string, we need size and file offset. Each string is |
| 122 | 131 # NUL terminated; the NUL does not count into the size. |
| 123 # remove UTF-8 Byte Order Mark, if any. | 132 offsets.append((len(ids), len(id), len(strs), |
| 124 # (UCS2 BOMs are not handled because messages in UCS2 cannot be handled) | 133 len(msg))) |
| 125 if lines[0].startswith('\xEF\xBB\xBF'): | 134 ids += id + b'\0' |
| 126 lines[0] = lines[0][3:] | 135 strs += msg + b'\0' |
| 127 | 136 output = b'' |
| 128 section = None | 137 # The header is 7 32-bit unsigned integers. We don't use hash tables, |
| 129 fuzzy = 0 | 138 # so the keys start right after the index tables. |
| 130 | 139 keystart = 7 * 4 + 16 * len(keys) |
| 131 # Parse the catalog | 140 # and the values start after the keys |
| 132 lno = 0 | 141 valuestart = keystart + len(ids) |
| 133 for l in lines: | 142 koffsets = [] |
| 134 lno += 1 | 143 voffsets = [] |
| 135 # If we get a comment line after a msgstr, this is a new entry | 144 # The string table first has the list of keys, then the list of values. |
| 136 if l[0] == '#' and section == STR: | 145 # Each entry has first the size of the string, then the file offset. |
| 137 add(msgid, msgstr, fuzzy) | 146 for o1, l1, o2, l2 in offsets: |
| 138 section = None | 147 koffsets += [l1, o1 + keystart] |
| 139 fuzzy = 0 | 148 voffsets += [l2, o2 + valuestart] |
| 140 # Record a fuzzy mark | 149 offsets = koffsets + voffsets |
| 141 if l[:2] == '#,' and (l.find('fuzzy') >= 0): | 150 # Even though we don't use a hashtable, we still set its offset to be |
| 142 fuzzy = 1 | 151 # binary compatible with the gnu gettext format produced by: |
| 143 # Skip comments | 152 # msgfmt file.po --no-hash |
| 144 if l[0] == '#': | 153 output = struct.pack("Iiiiiii", |
| 145 continue | 154 0x950412de, # Magic |
| 146 # Start of msgid_plural section, separate from singular form with \0 | 155 0, # Version |
| 147 if l.startswith('msgid_plural'): | 156 len(keys), # # of entries |
| 148 msgid += '\0' | 157 7 * 4, # start of key index |
| 149 l = l[12:] | 158 7 * 4 + len(keys) * 8, # start of value index |
| 150 # Now we are in a msgid section, output previous section | 159 0, keystart) # size and offset of hash table |
| 151 elif l.startswith('msgid'): | 160 if PY3: |
| 152 if section == STR: | 161 output += array.array("i", offsets).tobytes() |
| 153 add(msgid, msgstr, fuzzy) | |
| 154 section = ID | |
| 155 l = l[5:] | |
| 156 msgid = msgstr = '' | |
| 157 # Now we are in a msgstr section | |
| 158 elif l.startswith('msgstr'): | |
| 159 section = STR | |
| 160 l = l[6:] | |
| 161 # Check for plural forms | |
| 162 if l.startswith('['): | |
| 163 # Separate plural forms with \0 | |
| 164 if not l.startswith('[0]'): | |
| 165 msgstr += '\0' | |
| 166 # Ignore the index - must come in sequence | |
| 167 l = l[l.index(']') + 1:] | |
| 168 # Skip empty lines | |
| 169 l = l.strip() | |
| 170 if not l: | |
| 171 continue | |
| 172 # XXX: Does this always follow Python escape semantics? | |
| 173 l = eval(l) | |
| 174 if section == ID: | |
| 175 msgid += l | |
| 176 elif section == STR: | |
| 177 msgstr += l | |
| 178 else: | 162 else: |
| 179 print('Syntax error on %s:%d' % (infile, lno), | 163 output += array.array("i", offsets).tostring() |
| 180 'before:', file=sys.stderr) | 164 output += ids |
| 181 print(l, file=sys.stderr) | 165 output += strs |
| 182 sys.exit(1) | 166 return output |
| 183 # Add last entry | 167 |
| 184 if section == STR: | 168 def get(self): |
| 185 add(msgid, msgstr, fuzzy) | 169 """ """ |
| 186 | 170 self.read() |
| 187 # Compute output | 171 # Compute output |
| 188 output = generate() | 172 return self.generate() |
| 189 | 173 |
| 190 try: | 174 def read(self, header_only=False): |
| 191 open(outfile,"wb").write(output) | 175 """ """ |
| 192 except IOError as msg: | 176 ID = 1 |
| 193 print(msg, file=sys.stderr) | 177 STR = 2 |
| 194 | 178 CTXT = 3 |
| 195 | 179 |
| 196 | 180 section = None |
| 197 def main(): | 181 fuzzy = 0 |
| 198 try: | 182 msgid = msgstr = msgctxt = u'' |
| 199 opts, args = getopt.getopt(sys.argv[1:], 'hVo:', | 183 |
| 200 ['help', 'version', 'output-file=']) | 184 # Parse the catalog |
| 201 except getopt.error as msg: | 185 lno = 0 |
| 202 usage(1, msg) | 186 for l in self.readPoData(): |
| 203 | 187 l = l.decode(self.encoding) |
| 204 outfile = None | 188 lno += 1 |
| 205 # parse options | 189 # If we get a comment line after a msgstr or a line starting with |
| 206 for opt, arg in opts: | 190 # msgid or msgctxt, this is a new entry |
| 207 if opt in ('-h', '--help'): | 191 if section == STR and (l[0] == '#' or (l[0] == 'm' and |
| 208 usage(0) | 192 (l.startswith('msgctxt') or l.startswith('msgid')))): |
| 209 elif opt in ('-V', '--version'): | 193 self.add(msgctxt, msgid, msgstr, fuzzy) |
| 210 print("msgfmt.py", __version__, file=sys.stderr) | 194 section = None |
| 211 sys.exit(0) | 195 fuzzy = 0 |
| 212 elif opt in ('-o', '--output-file'): | 196 # If we only want the header we stop after the first message |
| 213 outfile = arg | 197 if header_only: |
| 214 # do it | 198 break |
| 215 if not args: | 199 # Record a fuzzy mark |
| 216 print('No input file given', file=sys.stderr) | 200 if l[:2] == '#,' and 'fuzzy' in l: |
| 217 print("Try `msgfmt --help' for more information.", file=sys.stderr) | 201 fuzzy = 1 |
| 218 return | 202 # Skip comments |
| 219 | 203 if l[0] == '#': |
| 220 for filename in args: | 204 continue |
| 221 make(filename, outfile) | 205 # Now we are in a msgctxt section |
| 222 | 206 if l.startswith('msgctxt'): |
| 223 | 207 section = CTXT |
| 224 if __name__ == '__main__': | 208 l = l[7:] |
| 225 main() | 209 msgctxt = u'' |
| 226 | 210 # Now we are in a msgid section, output previous section |
| 227 # vim: set et sts=4 sw=4 : | 211 elif (l.startswith('msgid') and |
| 212 not l.startswith('msgid_plural')): | |
| 213 if section == STR: | |
| 214 self.add(msgid, msgstr, fuzzy) | |
| 215 section = ID | |
| 216 l = l[5:] | |
| 217 msgid = msgstr = u'' | |
| 218 is_plural = False | |
| 219 # This is a message with plural forms | |
| 220 elif l.startswith('msgid_plural'): | |
| 221 if section != ID: | |
| 222 raise PoSyntaxError( | |
| 223 'msgid_plural not preceeded by ' | |
| 224 'msgid on line %d of po file %s' % | |
| 225 (lno, repr(self.name))) | |
| 226 l = l[12:] | |
| 227 msgid += u'\0' # separator of singular and plural | |
| 228 is_plural = True | |
| 229 # Now we are in a msgstr section | |
| 230 elif l.startswith('msgstr'): | |
| 231 section = STR | |
| 232 if l.startswith('msgstr['): | |
| 233 if not is_plural: | |
| 234 raise PoSyntaxError( | |
| 235 'plural without msgid_plural ' | |
| 236 'on line %d of po file %s' % | |
| 237 (lno, repr(self.name))) | |
| 238 l = l.split(']', 1)[1] | |
| 239 if msgstr: | |
| 240 # Separator of the various plural forms | |
| 241 msgstr += u'\0' | |
| 242 else: | |
| 243 if is_plural: | |
| 244 raise PoSyntaxError( | |
| 245 'indexed msgstr required for ' | |
| 246 'plural on line %d of po file %s' % | |
| 247 (lno, repr(self.name))) | |
| 248 l = l[6:] | |
| 249 # Skip empty lines | |
| 250 l = l.strip() | |
| 251 if not l: | |
| 252 continue | |
| 253 # TODO: Does this always follow Python escape semantics? | |
| 254 try: | |
| 255 l = literal_eval(l) | |
| 256 except Exception as msg: | |
| 257 raise PoSyntaxError( | |
| 258 '%s (line %d of po file %s): \n%s' % | |
| 259 (msg, lno, repr(self.name), l)) | |
| 260 if isinstance(l, bytes): | |
| 261 l = l.decode(self.encoding) | |
| 262 if section == CTXT: | |
| 263 msgctxt += l | |
| 264 elif section == ID: | |
| 265 msgid += l | |
| 266 elif section == STR: | |
| 267 msgstr += l | |
| 268 else: | |
| 269 raise PoSyntaxError( | |
| 270 'error on line %d of po file %s' % | |
| 271 (lno, repr(self.name))) | |
| 272 | |
| 273 # Add last entry | |
| 274 if section == STR: | |
| 275 self.add(msgctxt, msgid, msgstr, fuzzy) | |
| 276 | |
| 277 if self.openfile: | |
| 278 self.po.close() | |
| 279 | |
| 280 def getAsFile(self): | |
| 281 return BytesIO(self.get()) |
