Skip to content

Speed regression in the decimal module due to using heap types #144650

@skirpichev

Description

@skirpichev

This is essentially a rebirth of #114682, which was closed due to @skrah ban (unfortunately, such things usually don't fix issues).

The problem seems to be valid and I open a new issue per kindly @gpshead permission. I labeled issue as "type-feature", but IMO it's looks rather as a regression, i.e. "type-bug".

Here benchmarks results (with PGO) for 3.9-3.15:

Benchmark 39 310 311 312 313 314 315
decimal_factorial 804 ms 843 ms: 1.05x slower 763 ms: 1.05x faster 794 ms: 1.01x faster 1.02 sec: 1.27x slower 950 ms: 1.18x slower 951 ms: 1.18x slower
decimal_pi 1.12 sec 1.26 sec: 1.13x slower 1.29 sec: 1.15x slower 1.31 sec: 1.17x slower 1.86 sec: 1.67x slower 1.66 sec: 1.48x slower 1.66 sec: 1.49x slower
Geometric mean (ref) 1.09x slower 1.05x slower 1.07x slower 1.45x slower 1.32x slower 1.33x slower

Benchmarks were available in the pyperformance package long time ago but are disabled, see python/pyperformance#453 (I use first commit from above PR to enable both benchmarks. Though, the "decimal_factorial" benchmark seems to be less affected, I guess because it has more large numbers).

N.B.: the "decimal_pi" benchmark do computation with low precision (9 and 19). Here is what happens with the default precision (28):

Benchmark 39 313
decimal_pi 1.20 sec 1.99 sec: 1.65x slower

Not much better.

It seems that major slowdown come in the 3.13 (around 1.67x slower for "decimal_pi" on my system) with #106079. Unfortunately, nobody asked to do performance measurements during PR review :(

Something like this happens already on toy extension types:

Benchmark ref heap
xyz(123) + xyz(321) 204 ns 357 ns: 1.75x slower
code and benchmark
# bench.py
import pyperf
from operator import add
from example import xyz

x, y = map(xyz, [123, 321])
runner = pyperf.Runner()
s = repr(x) + " + " + repr(y)
runner.bench_func(s, add, x, y)
/* static type */

#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
    PyObject_HEAD
    long value;
} XYZ_Object;

PyTypeObject XYZ_Type;
#define XYZ_CheckExact(u) Py_IS_TYPE((u), &XYZ_Type)

static XYZ_Object *
XYZ_new(long value)
{
    XYZ_Object *res = PyObject_New(XYZ_Object, &XYZ_Type);

    if (res) {
        res->value = value;
    }
    return res;
}

static PyObject *
new(PyTypeObject *type, PyObject *args, PyObject *keywds)
{
    Py_ssize_t argc = PyTuple_GET_SIZE(args);

    if (argc == 1) {
        PyObject *arg = PyTuple_GET_ITEM(args, 0);
        long value = PyLong_AsLong(arg);

        if (value == -1 && PyErr_Occurred()) {
            return NULL;
        }
        return (PyObject *)XYZ_new(value);
    }
    PyErr_SetString(PyExc_TypeError, "value required");
    return NULL;
}

static PyObject *
add(PyObject *self, PyObject *other)
{
    XYZ_Object *x = (XYZ_Object *)self;
    XYZ_Object *y = (XYZ_Object *)other;

    if (XYZ_CheckExact(x) && XYZ_CheckExact(y)) {
        return (PyObject *)XYZ_new(x->value + y->value);
    }
    Py_RETURN_NOTIMPLEMENTED;
}

static PyObject *
repr(PyObject *self)
{
    return PyUnicode_FromFormat("xyz(%ld)", ((XYZ_Object *)self)->value);
}

static PyNumberMethods xyz_as_number = {
    .nb_add = add,
};

PyTypeObject XYZ_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "xyz",
    .tp_basicsize = sizeof(XYZ_Object),
    .tp_new = new,
    .tp_repr = repr,
    .tp_as_number = &xyz_as_number,
    .tp_flags = Py_TPFLAGS_DEFAULT,
};

static int
example_exec(PyObject *module)
{
    if (PyModule_AddType(module, &XYZ_Type) < 0) {
        return -1;
    }
    return 0;
}

#ifdef __GNUC__
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyModuleDef_Slot example_slots[] = {
    {Py_mod_exec, example_exec},
    {0, NULL}};
#ifdef __GNUC__
#  pragma GCC diagnostic pop
#endif

static struct PyModuleDef example_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "example",
    .m_doc = "Test module.",
    .m_size = 0,
    .m_slots = example_slots,
};

PyMODINIT_FUNC
PyInit_example(void)
{
    return PyModuleDef_Init(&example_module);
}
/* heap type */

#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
    PyObject_HEAD
    long value;
} XYZ_Object;

typedef struct {
    PyTypeObject *XYZ_Type;
} example_state;

#define XYZ_CheckExact(st, u) Py_IS_TYPE((u), (st)->XYZ_Type)

static XYZ_Object *
XYZ_new(example_state *Py_UNUSED(state), PyTypeObject *type, long value)
{
    XYZ_Object *res = PyObject_GC_New(XYZ_Object, type);

    if (res) {
        PyObject_GC_Track((PyObject *)res);
        res->value = value;
    }
    return res;
}

static struct PyModuleDef example_module;

static example_state *
get_state(PyTypeObject *type)
{
    PyObject *module = PyType_GetModuleByDef(type, &example_module);

    return PyModule_GetState(module);
}

static PyObject *
new(PyTypeObject *type, PyObject *args, PyObject *keywds)
{
    Py_ssize_t argc = PyTuple_GET_SIZE(args);
    example_state *state = get_state(type);

    if (argc == 1) {
        PyObject *arg = PyTuple_GET_ITEM(args, 0);
        long value = PyLong_AsLong(arg);

        if (value == -1 && PyErr_Occurred()) {
            return NULL;
        }
        return (PyObject *)XYZ_new(state, type, value);
    }
    PyErr_SetString(PyExc_TypeError, "value required");
    return NULL;
}

static int
traverse(PyObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    return 0;
}

static void
dealloc(PyObject *self)
{
    PyTypeObject *type = Py_TYPE(self);

    PyObject_GC_UnTrack(self);
    type->tp_free(self);
    Py_DECREF(type);
}

static inline example_state *
find_state_left_or_right(PyObject *left, PyObject *right)
{
    PyObject *module = PyType_GetModuleByDef(Py_TYPE(left), &example_module);

    if (module) {
        return PyModule_GetState(module);
    }
    PyErr_Clear();
    return PyType_GetModuleState(Py_TYPE(right));
}

static PyObject *
add(PyObject *self, PyObject *other)
{
    example_state *state = find_state_left_or_right(self, other);
    XYZ_Object *x = (XYZ_Object *)self;
    XYZ_Object *y = (XYZ_Object *)other;

    if (XYZ_CheckExact(state, x) && XYZ_CheckExact(state, y)) {
        return (PyObject *)XYZ_new(state, state->XYZ_Type, x->value + y->value);
    }
    Py_RETURN_NOTIMPLEMENTED;
}

static PyObject *
repr(PyObject *self)
{
    return PyUnicode_FromFormat("xyz(%ld)", ((XYZ_Object *)self)->value);
}

#ifdef __GNUC__
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyType_Slot xyz_slots[] = {
    {Py_tp_repr, repr},
    {Py_tp_new, new},
    {Py_nb_add, add},
    {Py_tp_traverse, traverse},
    {Py_tp_dealloc, dealloc},
};
#ifdef __GNUC__
#  pragma GCC diagnostic pop
#endif

static PyType_Spec xyz_spec = {
    .name = "xyz",
    .basicsize = sizeof(XYZ_Object),
    .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE,
    .slots = xyz_slots,
};

static int
example_exec(PyObject *module)
{
    example_state *state = PyModule_GetState(module);

    state->XYZ_Type = (PyTypeObject *)PyType_FromModuleAndSpec(module,
                                                               &xyz_spec,
                                                               NULL);
    if (!state->XYZ_Type || PyModule_AddType(module, state->XYZ_Type) < 0) {
        return -1;
    }
    return 0;
}

static int
example_clear(PyObject *module)
{
    example_state *state = PyModule_GetState(module);

    Py_CLEAR(state->XYZ_Type);
    return 0;
}

static int
example_traverse(PyObject *module, visitproc visit, void *arg)
{
    example_state *state = PyModule_GetState(module);

    Py_VISIT(state->XYZ_Type);
    return 0;
}

#ifdef __GNUC__
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyModuleDef_Slot example_slots[] = {
    {Py_mod_exec, example_exec},
    {0, NULL}};
#ifdef __GNUC__
#  pragma GCC diagnostic pop
#endif

static struct PyModuleDef example_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "example",
    .m_doc = "Test module.",
    .m_size = 0,
    .m_slots = example_slots,
    .m_clear = example_clear,
    .m_traverse = example_traverse,
};

PyMODINIT_FUNC
PyInit_example(void)
{
    return PyModuleDef_Init(&example_module);
}

So, what we could do? Can heap types (at least immutable) be less costly c.f. static types? Can we convert the decimal module back to static types?

I tried to add a freelist for Decimal's (quick patch is there: skirpichev#16). It looks this will mitigate the problem, but not entirely fix regression.

Metadata

Metadata

Assignees

No one assigned

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions