view roundup/msgfmt.py @ 7752:b2dbab2b34bc

fix(refactor): multiple fixups using ruff linter; more testing. Converting to using the ruff linter and its rulesets. Fixed a number of issues. admin.py: sort imports use immutable tuples as default value markers for parameters where a None value is valid. reduced some loops to list comprehensions for performance used ternary to simplify some if statements named some variables to make them less magic (e.g. _default_savepoint_setting = 1000) fixed some tests for argument counts < 2 becomes != 2 so 3 is an error. moved exception handlers outside of loops for performance where exception handler will abort loop anyway. renamed variables called 'id' or 'dir' as they shadow builtin commands. fix translations of form _("string %s" % value) -> _("string %s") % value so translation will be looked up with the key before substitution. end dicts, tuples with a trailing comma to reduce missing comma errors if modified simplified sorted(list(self.setting.keys())) to sorted(self.setting.keys()) as sorted consumes whole list. in if conditions put compared variable on left and threshold condition on right. (no yoda conditions) multiple noqa: suppression removed unneeded noqa as lint rulesets are a bit different do_get - refactor output printing logic: Use fast return if not special formatting is requested; use isinstance with a tuple rather than two isinstance calls; cleaned up flow and removed comments on algorithm as it can be easily read from the code. do_filter, do_find - refactor output printing logic. Reduce duplicate code. do_find - renamed variable 'value' that was set inside a loop. The loop index variable was also named 'value'. do_pragma - added hint to use list subcommand if setting was not found. Replaced condition 'type(x) is bool' with 'isinstance(x, bool)' for various types. test_admin.py added testing for do_list better test coverage for do_get includes: -S and -d for multilinks, error case for -d with non-link. 
better testing for do_find including all output modes better testing for do_filter including all output modes fixed expected output for do_pragma that now includes hint to use pragma list if setting not found.
author John Rouillard <rouilj@ieee.org>
date Fri, 01 Mar 2024 14:53:18 -0500
parents 4d2e1fa03f0f
children 6a1c1cd69582
line wrap: on
line source

#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Written by Martin v. Loewis <loewis@informatik.hu-berlin.de>
#
# Changed by Christian 'Tiran' Heimes <tiran@cheimes.de> for the placeless
# translation service (PTS) of Zope
#
# Fixed some bugs and updated to support msgctxt
# by Hanno Schlichting <hanno@hannosch.eu>

"""Generate binary message catalog from textual translation description.

This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.

This file was taken from Python-2.3.2/Tools/i18n and altered in several ways.
Now you can simply use it from another python module:

  from msgfmt import Msgfmt
  mo = Msgfmt(po).get()

where po is the path to a po file as a string, a po file opened in binary mode
and ready for reading, or a list of byte strings (as returned by readlines() on
such a file), and mo is the compiled mo file as a binary string.

Exceptions:

  * IOError if the file couldn't be read

  * msgfmt.PoSyntaxError if the po file has syntax errors
"""

import array
from ast import literal_eval
import codecs
from email.parser import HeaderParser
import struct
import sys

PY3 = sys.version_info[0] == 3


def header_charset(s):
    """Return the charset named in the Content-Type header of *s*.

    *s* is the text of the po header entry, parsed as a block of RFC 822
    style headers.  The result is whatever
    ``Message.get_content_charset()`` reports for it.
    """
    if not PY3:
        # The Python 2 parser is handed a byte string rather than unicode.
        s = s.encode('utf-8', 'ignore')
    return HeaderParser().parsestr(s).get_content_charset()


if PY3:
    import io
    BytesIO = io.BytesIO
    FILE_TYPE = io.IOBase
else:
    from cStringIO import StringIO as BytesIO
    # ``file`` is a builtin type only under Python 2; linters running
    # under Python 3 flag it as an undefined name.
    FILE_TYPE = file  # noqa: F821


class PoSyntaxError(Exception):
    """Syntax error in a po file.

    The description passed to the constructor is kept in the ``msg``
    attribute and is included in the string form of the exception.
    """

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # Wrap msg in a 1-tuple so that a tuple (or dict) passed as msg is
        # rendered as a single value instead of being unpacked as the
        # operands of the % formatting.
        return 'Po file syntax error: %s' % (self.msg,)


class Msgfmt:
    """Compile a textual po catalog into binary GNU mo data.

    ``po`` may be the path of a po file (a ``str``), a file object opened
    in binary mode, or a list of byte-string lines.  ``name`` is only used
    to identify the catalog in error messages.

    Parsed messages are collected in ``self.messages``, keyed by msgid
    (prefixed with ``context + EOT`` when the entry has a msgctxt).
    """

    def __init__(self, po, name='unknown'):
        self.po = po
        self.name = name
        self.messages = {}
        # Set once readPoData() has been handed a file object; read()
        # closes self.po afterwards when this is true.
        self.openfile = False
        # Start off assuming latin-1, so everything decodes without failure,
        # until we know the exact encoding
        self.encoding = 'latin-1'

    def readPoData(self):
        """Return the lines of the po source as a list of byte strings.

        A list passed as ``self.po`` is returned unchanged (an empty list
        is a valid, empty catalog).  A path is opened in binary mode, read
        completely and closed again.  A file object is rewound and read
        completely.  For paths and file objects a leading UTF-8 byte order
        mark is removed from the first line.

        Raises ValueError if ``self.po`` is none of the supported types.
        """
        po = self.po
        if isinstance(po, list):
            return po
        if isinstance(po, str):
            # The context manager guarantees the handle is closed again.
            with open(po, 'rb') as handle:
                lines = handle.readlines()
        elif isinstance(po, FILE_TYPE):
            po.seek(0)
            self.openfile = True
            lines = po.readlines()
        else:
            raise ValueError("self.po is invalid! %s" % type(po))
        # Remove the BOM as an exact prefix.  bytes.lstrip() would treat
        # the BOM as a set of bytes and could strip more than the marker.
        if lines and lines[0].startswith(codecs.BOM_UTF8):
            lines[0] = lines[0][len(codecs.BOM_UTF8):]
        return lines

    def add(self, context, id, string, fuzzy):
        """Add a non-empty and non-fuzzy translation to the dictionary.

        The entry with an empty id and no context is the catalog header.
        If it declares a charset, the header text is re-decoded with that
        charset and ``self.encoding`` is switched to it.
        """
        if string and not fuzzy:
            # The context is put before the id and separated by a EOT char.
            if context:
                id = context + u'\x04' + id
            if not id:
                # See whether there is an encoding declaration
                charset = header_charset(string)
                if charset:
                    # decode header in proper encoding
                    string = string.encode(self.encoding).decode(charset)
                    if not PY3:
                        # undo damage done by literal_eval in Python 2.x
                        string = string.encode(self.encoding).decode(charset)
                    self.encoding = charset
            self.messages[id] = string

    def generate(self):
        """Return the binary mo data for the collected messages."""
        # the keys are sorted in the .mo file
        keys = sorted(self.messages.keys())
        offsets = []
        id_chunks = []
        str_chunks = []
        ids_len = strs_len = 0
        for key in keys:
            msg = self.messages[key].encode(self.encoding)
            raw_id = key.encode(self.encoding)
            # For each string, we need size and file offset. Each string is
            # NUL terminated; the NUL does not count into the size.
            offsets.append((ids_len, len(raw_id), strs_len, len(msg)))
            id_chunks.append(raw_id + b'\0')
            str_chunks.append(msg + b'\0')
            ids_len += len(raw_id) + 1
            strs_len += len(msg) + 1
        ids = b''.join(id_chunks)
        strs = b''.join(str_chunks)
        # The header is 7 32-bit unsigned integers. We don't use hash tables,
        # so the keys start right after the index tables.
        keystart = 7 * 4 + 16 * len(keys)
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets
        # Even though we don't use a hashtable, we still set its offset to be
        # binary compatible with the gnu gettext format produced by:
        # msgfmt file.po --no-hash
        output = struct.pack("Iiiiiii",
                             0x950412de,        # Magic
                             0,                 # Version
                             len(keys),         # # of entries
                             7 * 4,             # start of key index
                             7 * 4 + len(keys) * 8,  # start of value index
                             0, keystart)       # size and offset of hash table
        if PY3:
            output += array.array("i", offsets).tobytes()
        else:
            output += array.array("i", offsets).tostring()
        output += ids
        output += strs
        return output

    def get(self):
        """Parse the po source and return the compiled mo data."""
        self.read()
        # Compute output
        return self.generate()

    def read(self, header_only=False):
        """Parse the po source into ``self.messages``.

        If ``header_only`` is true, parsing stops as soon as the first
        entry has been completed.

        Raises PoSyntaxError for malformed input.  If ``self.po`` is a
        file object it is closed when parsing ends, whether or not an
        error occurred.
        """
        ID = 1
        STR = 2
        CTXT = 3

        section = None
        fuzzy = 0
        # None until the first msgid has been seen, then False/True
        is_plural = None
        msgid = msgstr = msgctxt = u''

        try:
            # Parse the catalog
            for lno, raw in enumerate(self.readPoData(), 1):
                line = raw.decode(self.encoding)
                # If we get a comment line after a msgstr or a line starting
                # with msgid or msgctxt, this is a new entry
                if section == STR and line.startswith(
                        ('#', 'msgctxt', 'msgid')):
                    old_encoding = self.encoding
                    self.add(msgctxt, msgid, msgstr, fuzzy)
                    section = None
                    fuzzy = 0
                    # If we only want the header we stop after the first
                    # message
                    if header_only:
                        break
                    if self.encoding != old_encoding:
                        # The header just declared the real charset; this
                        # line was decoded with the provisional one.
                        line = raw.decode(self.encoding)
                if line.startswith('#'):
                    # Record a fuzzy mark, then skip the comment
                    if line.startswith('#,') and 'fuzzy' in line:
                        fuzzy = 1
                    continue
                # Now we are in a msgctxt section
                if line.startswith('msgctxt'):
                    section = CTXT
                    line = line[7:]
                    msgctxt = u''
                # Now we are in a msgid section
                elif (line.startswith('msgid') and
                      not line.startswith('msgid_plural')):
                    if section != CTXT:
                        # No msgctxt directly precedes this msgid, so the
                        # context of an earlier entry must not leak into it.
                        msgctxt = u''
                    section = ID
                    line = line[5:]
                    msgid = msgstr = u''
                    is_plural = False
                # This is a message with plural forms
                elif line.startswith('msgid_plural'):
                    if section != ID:
                        raise PoSyntaxError(
                            'msgid_plural not preceeded by '
                            'msgid on line %d of po file %s' %
                            (lno, repr(self.name)))
                    line = line[12:]
                    msgid += u'\0'  # separator of singular and plural
                    is_plural = True
                # Now we are in a msgstr section
                elif line.startswith('msgstr'):
                    if is_plural is None:
                        raise PoSyntaxError(
                            'msgstr not preceded by '
                            'msgid on line %d of po file %s' %
                            (lno, repr(self.name)))
                    section = STR
                    if line.startswith('msgstr['):
                        if not is_plural:
                            raise PoSyntaxError(
                                'plural without msgid_plural '
                                'on line %d of po file %s' %
                                (lno, repr(self.name)))
                        line = line.split(']', 1)[1]
                        if msgstr:
                            # Separator of the various plural forms
                            msgstr += u'\0'
                    else:
                        if is_plural:
                            raise PoSyntaxError(
                                'indexed msgstr required for '
                                'plural on line %d of po file %s' %
                                (lno, repr(self.name)))
                        line = line[6:]
                # Skip empty lines
                line = line.strip()
                if not line:
                    continue
                # TODO: Does this always follow Python escape semantics?
                try:
                    value = literal_eval(line)
                except Exception as msg:
                    raise PoSyntaxError(
                        '%s (line %d of po file %s): \n%s' %
                        (msg, lno, repr(self.name), line))
                if isinstance(value, bytes):
                    value = value.decode(self.encoding)
                if not isinstance(value, type(u'')):
                    # e.g. a bare number: valid Python literal, invalid po
                    raise PoSyntaxError(
                        'string literal expected (line %d of po file %s): '
                        '\n%s' % (lno, repr(self.name), line))
                if section == CTXT:
                    msgctxt += value
                elif section == ID:
                    msgid += value
                elif section == STR:
                    msgstr += value
                else:
                    raise PoSyntaxError(
                        'error on line %d of po file %s' %
                        (lno, repr(self.name)))

            # Add last entry
            if section == STR:
                self.add(msgctxt, msgid, msgstr, fuzzy)
        finally:
            if self.openfile:
                self.po.close()

    def getAsFile(self):
        """Return the compiled mo data wrapped in a file-like BytesIO."""
        return BytesIO(self.get())

Roundup Issue Tracker: http://roundup-tracker.org/