-
-
Notifications
You must be signed in to change notification settings - Fork 19.4k
PERF/BUG: use masked algo in groupby cummin and cummax #40651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
7cd4dc3
91984dc
b371cc5
69cce96
dd7f324
64680d4
be16f65
9442846
f089175
31409f8
0c05f74
18dcc94
5c60a1f
f0c27ce
2fa80ad
280c7e5
dca28cf
7e2fbe0
0ebb97a
0009dfd
6663832
8247f82
71e1c4f
c6cf9ee
02768ec
836175b
293dc6e
478c6c9
fa45a9a
1632b81
1bb344e
97d9eea
892a92a
a239a68
f98ca35
e7ed12f
a1422ba
482a209
4e7404d
251c02a
3de7e5e
237f86f
a1b0c04
5e1dac4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,9 +20,6 @@ | |
| Tuple, | ||
| Type, | ||
| ) | ||
| from pandas.core.arrays.masked import ( | ||
| BaseMaskedDtype, | ||
| ) | ||
|
|
||
| import numpy as np | ||
|
|
||
|
|
@@ -40,7 +37,6 @@ | |
| FrameOrSeries, | ||
| Shape, | ||
| final, | ||
| ArrayLike | ||
| ) | ||
| from pandas.errors import AbstractMethodError | ||
| from pandas.util._decorators import cache_readonly | ||
|
|
@@ -75,10 +71,11 @@ | |
| isna, | ||
| maybe_fill, | ||
| ) | ||
| from pandas.core.arrays import ( | ||
| BaseMaskedArray | ||
| ) | ||
|
|
||
| from pandas.core.arrays.masked import ( | ||
| BaseMaskedArray, | ||
| BaseMaskedDtype, | ||
| ) | ||
| from pandas.core.base import SelectionMixin | ||
| import pandas.core.common as com | ||
| from pandas.core.frame import DataFrame | ||
|
|
@@ -123,7 +120,11 @@ | |
| "cummax": "group_cummax", | ||
| "rank": "group_rank", | ||
| }, | ||
| "needs_mask": {"cummin", "cummax"} | ||
| } | ||
|
|
||
| _CYTHON_MASKED_FUNCTIONS = { | ||
| "cummin", | ||
| "cummax", | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -163,8 +164,8 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool) | |
| return func | ||
|
|
||
|
|
||
| def does_cython_function_use_mask(kind: str) -> bool: | ||
| return kind in _CYTHON_FUNCTIONS["needs_mask"] | ||
| def cython_function_uses_mask(kind: str) -> bool: | ||
| return kind in _CYTHON_MASKED_FUNCTIONS | ||
|
|
||
|
|
||
| class BaseGrouper: | ||
|
|
@@ -590,8 +591,14 @@ def _ea_wrap_cython_operation( | |
|
|
||
| @final | ||
| def _masked_ea_wrap_cython_operation( | ||
| self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs | ||
| ) -> ArrayLike: | ||
| self, | ||
| kind: str, | ||
| values: BaseMaskedArray, | ||
| how: str, | ||
| axis: int, | ||
| min_count: int = -1, | ||
| **kwargs, | ||
| ) -> BaseMaskedArray: | ||
| """ | ||
| Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's | ||
| and cython algorithms which accept a mask. | ||
|
|
@@ -601,23 +608,33 @@ def _masked_ea_wrap_cython_operation( | |
| # isna just directly returns self._mask, so copy here to prevent | ||
| # modifying the original | ||
| mask = isna(values).copy() | ||
| values = values._data | ||
| arr = values._data | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): | ||
|
||
| # IntegerArray or BooleanArray | ||
| values = ensure_int_or_float(values) | ||
| arr = ensure_int_or_float(arr) | ||
|
||
|
|
||
| res_values = self._cython_operation( | ||
| kind, values, how, axis, min_count, mask=mask, **kwargs | ||
| kind, arr, how, axis, min_count, mask=mask, **kwargs | ||
| ) | ||
| dtype = maybe_cast_result_dtype(orig_values.dtype, how) | ||
| assert isinstance(dtype, BaseMaskedDtype) | ||
| cls = dtype.construct_array_type() | ||
|
|
||
| return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)) | ||
| return cls( | ||
| res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True) | ||
| ) | ||
|
|
||
| @final | ||
| def _cython_operation( | ||
| self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs | ||
| self, | ||
| kind: str, | ||
| values, | ||
| how: str, | ||
| axis: int, | ||
| min_count: int = -1, | ||
| mask: np.ndarray | None = None, | ||
| **kwargs, | ||
| ) -> ArrayLike: | ||
| """ | ||
| Returns the values of a cython operation. | ||
|
|
@@ -640,7 +657,7 @@ def _cython_operation( | |
| self._disallow_invalid_ops(dtype, how, is_numeric) | ||
|
|
||
| if is_extension_array_dtype(dtype): | ||
| if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how): | ||
| if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how): | ||
| return self._masked_ea_wrap_cython_operation( | ||
| kind, values, how, axis, min_count, **kwargs | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i really don't understand all of this code duplication. this is adding huge complexity. pls reduce it.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jeff, did you actually read the previous responses to your similar comment? (https://github.com/pandas-dev/pandas/pull/40651/files#r603319910) Can you then please answer there to the concrete reasons given.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes and its a terrible pattern.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this duplication of code is ridiculous. We have a VERY large codebase. Having this kind of separate logic is amazingling confusing & is humungous tech debt. This is heavily used code and needs to be carefully modified.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand the concern about adding code complexity - my thinking was that if the goal is for nullable types to become the default in pandas, then direct support makes sense. And in that case, nullable types would need to be special-cased somewhere, and I think the separate function is cleaner than interleaving in If direct support for nullable dtypes is not desired, we can just close this. If it is, I'll keep trying to think of ways to achieve this without adding more code, but any suggestions there would be welcome!
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Proper support for nullable dtypes is certainly desired (how to add it exactly can of course be discussed), so thanks a lot @mzeitlin11 for your efforts here. AFAIK, it's correct we need some special casing for it somewhere (that's the whole point of this PR is to add special handling for it). @jreback please try to stay constructive (eg answer to our arguments or provide concrete suggestions on where you would put it / how you would do it differently) and please mind your language (there is no need to call the approach taken by a contributor "terrible").
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This decision has not been made.
Agreed.
as Joris correctly pointed out, that is not viable ATM. I think a lot of this dispatch logic eventually belongs in WrappedCythonOp (which i've been vaguely planning on doing next time there aren't any open PRs touching this code), at which point we can reconsider flattening this
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jbrockmendel if you plan further refactoring of this code, I'm happy to just mothball this pr for now. The real benefit won't come in until more groupby algos allow a mask on this path anyway, so not worth adding now if it's just going to cause more pain in future refactoring. I also like the idea of approach 5 instead of this - could start looking into that if you think it's a promising direction.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
From today's call, I think the plan is to move forward with this first.
Long-term I think this is the right way to go to get the general case right, so I'd encourage you if you're interested in trying to implement this on the EA- separate PR(s). |
||
| ) | ||
|
|
@@ -689,7 +706,9 @@ def _cython_operation( | |
| ) | ||
| out_shape = (self.ngroups,) + values.shape[1:] | ||
|
|
||
| func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric) | ||
| func, values, needs_mask = self._get_cython_func_and_vals( | ||
| kind, how, values, is_numeric | ||
| ) | ||
| use_mask = mask is not None | ||
| if needs_mask: | ||
| if mask is None: | ||
|
|
@@ -716,10 +735,10 @@ def _cython_operation( | |
| ) | ||
|
|
||
| if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike: | ||
| mask = result == iNaT | ||
| if mask.any(): | ||
| result_mask = result == iNaT | ||
| if result_mask.any(): | ||
| result = result.astype("float64") | ||
| result[mask] = np.nan | ||
| result[result_mask] = np.nan | ||
|
|
||
| if kind == "aggregate" and self._filter_empty_groups and not counts.all(): | ||
| assert result.ndim != 2 | ||
|
|
@@ -755,12 +774,28 @@ def _aggregate( | |
|
|
||
| @final | ||
| def _transform( | ||
| self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs | ||
| self, | ||
| result: np.ndarray, | ||
| values: np.ndarray, | ||
| transform_func, | ||
| is_datetimelike: bool, | ||
| use_mask: bool, | ||
| mask: np.ndarray | None, | ||
| **kwargs, | ||
| ) -> np.ndarray: | ||
|
|
||
| comp_ids, _, ngroups = self.group_info | ||
| if mask is not None: | ||
| transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs) | ||
| transform_func( | ||
| result, | ||
| values, | ||
| mask, | ||
| comp_ids, | ||
| ngroups, | ||
| is_datetimelike, | ||
| use_mask, | ||
| **kwargs, | ||
| ) | ||
| else: | ||
| transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.