-
-
Notifications
You must be signed in to change notification settings - Fork 34k
Description
This is essentially a rebirth of #114682, which was closed due to @skrah ban (unfortunately, such things usually don't fix issues).
The problem seems to be valid and I open a new issue per kindly @gpshead permission. I labeled issue as "type-feature", but IMO it's looks rather as a regression, i.e. "type-bug".
Here benchmarks results (with PGO) for 3.9-3.15:
| Benchmark | 39 | 310 | 311 | 312 | 313 | 314 | 315 |
|---|---|---|---|---|---|---|---|
| decimal_factorial | 804 ms | 843 ms: 1.05x slower | 763 ms: 1.05x faster | 794 ms: 1.01x faster | 1.02 sec: 1.27x slower | 950 ms: 1.18x slower | 951 ms: 1.18x slower |
| decimal_pi | 1.12 sec | 1.26 sec: 1.13x slower | 1.29 sec: 1.15x slower | 1.31 sec: 1.17x slower | 1.86 sec: 1.67x slower | 1.66 sec: 1.48x slower | 1.66 sec: 1.49x slower |
| Geometric mean | (ref) | 1.09x slower | 1.05x slower | 1.07x slower | 1.45x slower | 1.32x slower | 1.33x slower |
Benchmarks were available in the pyperformance package long time ago but are disabled, see python/pyperformance#453 (I use first commit from above PR to enable both benchmarks. Though, the "decimal_factorial" benchmark seems to be less affected, I guess because it has more large numbers).
N.B.: the "decimal_pi" benchmark do computation with low precision (9 and 19). Here is what happens with the default precision (28):
| Benchmark | 39 | 313 |
|---|---|---|
| decimal_pi | 1.20 sec | 1.99 sec: 1.65x slower |
Not much better.
It seems that major slowdown come in the 3.13 (around 1.67x slower for "decimal_pi" on my system) with #106079. Unfortunately, nobody asked to do performance measurements during PR review :(
Something like this happens already on toy extension types:
| Benchmark | ref | heap |
|---|---|---|
| xyz(123) + xyz(321) | 204 ns | 357 ns: 1.75x slower |
code and benchmark
# bench.py
import pyperf
from operator import add
from example import xyz
x, y = map(xyz, [123, 321])
runner = pyperf.Runner()
s = repr(x) + " + " + repr(y)
runner.bench_func(s, add, x, y)/* static type */
#define PY_SSIZE_T_CLEAN
#include <Python.h>
typedef struct {
PyObject_HEAD
long value;
} XYZ_Object;
PyTypeObject XYZ_Type;
#define XYZ_CheckExact(u) Py_IS_TYPE((u), &XYZ_Type)
static XYZ_Object *
XYZ_new(long value)
{
XYZ_Object *res = PyObject_New(XYZ_Object, &XYZ_Type);
if (res) {
res->value = value;
}
return res;
}
static PyObject *
new(PyTypeObject *type, PyObject *args, PyObject *keywds)
{
Py_ssize_t argc = PyTuple_GET_SIZE(args);
if (argc == 1) {
PyObject *arg = PyTuple_GET_ITEM(args, 0);
long value = PyLong_AsLong(arg);
if (value == -1 && PyErr_Occurred()) {
return NULL;
}
return (PyObject *)XYZ_new(value);
}
PyErr_SetString(PyExc_TypeError, "value required");
return NULL;
}
static PyObject *
add(PyObject *self, PyObject *other)
{
XYZ_Object *x = (XYZ_Object *)self;
XYZ_Object *y = (XYZ_Object *)other;
if (XYZ_CheckExact(x) && XYZ_CheckExact(y)) {
return (PyObject *)XYZ_new(x->value + y->value);
}
Py_RETURN_NOTIMPLEMENTED;
}
static PyObject *
repr(PyObject *self)
{
return PyUnicode_FromFormat("xyz(%ld)", ((XYZ_Object *)self)->value);
}
static PyNumberMethods xyz_as_number = {
.nb_add = add,
};
PyTypeObject XYZ_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "xyz",
.tp_basicsize = sizeof(XYZ_Object),
.tp_new = new,
.tp_repr = repr,
.tp_as_number = &xyz_as_number,
.tp_flags = Py_TPFLAGS_DEFAULT,
};
static int
example_exec(PyObject *module)
{
if (PyModule_AddType(module, &XYZ_Type) < 0) {
return -1;
}
return 0;
}
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyModuleDef_Slot example_slots[] = {
{Py_mod_exec, example_exec},
{0, NULL}};
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
static struct PyModuleDef example_module = {
PyModuleDef_HEAD_INIT,
.m_name = "example",
.m_doc = "Test module.",
.m_size = 0,
.m_slots = example_slots,
};
PyMODINIT_FUNC
PyInit_example(void)
{
return PyModuleDef_Init(&example_module);
}/* heap type */
#define PY_SSIZE_T_CLEAN
#include <Python.h>
typedef struct {
PyObject_HEAD
long value;
} XYZ_Object;
typedef struct {
PyTypeObject *XYZ_Type;
} example_state;
#define XYZ_CheckExact(st, u) Py_IS_TYPE((u), (st)->XYZ_Type)
static XYZ_Object *
XYZ_new(example_state *Py_UNUSED(state), PyTypeObject *type, long value)
{
XYZ_Object *res = PyObject_GC_New(XYZ_Object, type);
if (res) {
PyObject_GC_Track((PyObject *)res);
res->value = value;
}
return res;
}
static struct PyModuleDef example_module;
static example_state *
get_state(PyTypeObject *type)
{
PyObject *module = PyType_GetModuleByDef(type, &example_module);
return PyModule_GetState(module);
}
static PyObject *
new(PyTypeObject *type, PyObject *args, PyObject *keywds)
{
Py_ssize_t argc = PyTuple_GET_SIZE(args);
example_state *state = get_state(type);
if (argc == 1) {
PyObject *arg = PyTuple_GET_ITEM(args, 0);
long value = PyLong_AsLong(arg);
if (value == -1 && PyErr_Occurred()) {
return NULL;
}
return (PyObject *)XYZ_new(state, type, value);
}
PyErr_SetString(PyExc_TypeError, "value required");
return NULL;
}
static int
traverse(PyObject *self, visitproc visit, void *arg)
{
Py_VISIT(Py_TYPE(self));
return 0;
}
static void
dealloc(PyObject *self)
{
PyTypeObject *type = Py_TYPE(self);
PyObject_GC_UnTrack(self);
type->tp_free(self);
Py_DECREF(type);
}
static inline example_state *
find_state_left_or_right(PyObject *left, PyObject *right)
{
PyObject *module = PyType_GetModuleByDef(Py_TYPE(left), &example_module);
if (module) {
return PyModule_GetState(module);
}
PyErr_Clear();
return PyType_GetModuleState(Py_TYPE(right));
}
static PyObject *
add(PyObject *self, PyObject *other)
{
example_state *state = find_state_left_or_right(self, other);
XYZ_Object *x = (XYZ_Object *)self;
XYZ_Object *y = (XYZ_Object *)other;
if (XYZ_CheckExact(state, x) && XYZ_CheckExact(state, y)) {
return (PyObject *)XYZ_new(state, state->XYZ_Type, x->value + y->value);
}
Py_RETURN_NOTIMPLEMENTED;
}
static PyObject *
repr(PyObject *self)
{
return PyUnicode_FromFormat("xyz(%ld)", ((XYZ_Object *)self)->value);
}
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyType_Slot xyz_slots[] = {
{Py_tp_repr, repr},
{Py_tp_new, new},
{Py_nb_add, add},
{Py_tp_traverse, traverse},
{Py_tp_dealloc, dealloc},
};
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
static PyType_Spec xyz_spec = {
.name = "xyz",
.basicsize = sizeof(XYZ_Object),
.flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE,
.slots = xyz_slots,
};
static int
example_exec(PyObject *module)
{
example_state *state = PyModule_GetState(module);
state->XYZ_Type = (PyTypeObject *)PyType_FromModuleAndSpec(module,
&xyz_spec,
NULL);
if (!state->XYZ_Type || PyModule_AddType(module, state->XYZ_Type) < 0) {
return -1;
}
return 0;
}
static int
example_clear(PyObject *module)
{
example_state *state = PyModule_GetState(module);
Py_CLEAR(state->XYZ_Type);
return 0;
}
static int
example_traverse(PyObject *module, visitproc visit, void *arg)
{
example_state *state = PyModule_GetState(module);
Py_VISIT(state->XYZ_Type);
return 0;
}
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wpedantic"
#endif
static PyModuleDef_Slot example_slots[] = {
{Py_mod_exec, example_exec},
{0, NULL}};
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
static struct PyModuleDef example_module = {
PyModuleDef_HEAD_INIT,
.m_name = "example",
.m_doc = "Test module.",
.m_size = 0,
.m_slots = example_slots,
.m_clear = example_clear,
.m_traverse = example_traverse,
};
PyMODINIT_FUNC
PyInit_example(void)
{
return PyModuleDef_Init(&example_module);
}So, what we could do? Can heap types (at least immutable) be less costly c.f. static types? Can we convert the decimal module back to static types?
I tried to add a freelist for Decimal's (quick patch is there: skirpichev#16). It looks this will mitigate the problem, but not entirely fix regression.
Metadata
Metadata
Assignees
Labels
Projects
Status