Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
7cd4dc3
wip
mzeitlin11 Mar 26, 2021
91984dc
wip
mzeitlin11 Mar 26, 2021
b371cc5
wip
mzeitlin11 Mar 26, 2021
69cce96
wip
mzeitlin11 Mar 26, 2021
dd7f324
wip
mzeitlin11 Mar 26, 2021
64680d4
wip
mzeitlin11 Mar 26, 2021
be16f65
wip
mzeitlin11 Mar 26, 2021
9442846
wip
mzeitlin11 Mar 26, 2021
f089175
wip
mzeitlin11 Mar 26, 2021
31409f8
wip
mzeitlin11 Mar 26, 2021
0c05f74
wip
mzeitlin11 Mar 26, 2021
18dcc94
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Mar 26, 2021
5c60a1f
wip
mzeitlin11 Mar 26, 2021
f0c27ce
PERF: use masked algo in groupby cummin and cummax
mzeitlin11 Mar 27, 2021
2fa80ad
Avoid mask copy
mzeitlin11 Mar 27, 2021
280c7e5
Update whatsnew
mzeitlin11 Mar 27, 2021
dca28cf
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 1, 2021
7e2fbe0
Merge fixup
mzeitlin11 Apr 1, 2021
0ebb97a
Follow transpose
mzeitlin11 Apr 1, 2021
0009dfd
Compute mask usage inside algo
mzeitlin11 Apr 1, 2021
6663832
try optional
mzeitlin11 Apr 1, 2021
8247f82
WIP
mzeitlin11 Apr 1, 2021
71e1c4f
Use more contiguity
mzeitlin11 Apr 1, 2021
c6cf9ee
Shrink benchmark
mzeitlin11 Apr 1, 2021
02768ec
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 1, 2021
836175b
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 2, 2021
293dc6e
Revert unrelated
mzeitlin11 Apr 2, 2021
478c6c9
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 6, 2021
fa45a9a
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 8, 2021
1632b81
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 12, 2021
1bb344e
Remove merge conflict relic
mzeitlin11 Apr 12, 2021
97d9eea
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
892a92a
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
a239a68
Update pandas/core/groupby/ops.py
mzeitlin11 Apr 13, 2021
f98ca35
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 13, 2021
e7ed12f
Merge branch 'perf/masked_cummin/max' of github.com:/mzeitlin11/panda…
mzeitlin11 Apr 13, 2021
a1422ba
Address comments
mzeitlin11 Apr 13, 2021
482a209
Change random generation style
mzeitlin11 Apr 13, 2021
4e7404d
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 18, 2021
251c02a
Use conditional instead of partial
mzeitlin11 Apr 18, 2021
3de7e5e
Remove ensure_int_or_float
mzeitlin11 Apr 18, 2021
237f86f
Remove unnecessary condition
mzeitlin11 Apr 18, 2021
a1b0c04
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 19, 2021
5e1dac4
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
wip
  • Loading branch information
mzeitlin11 committed Mar 26, 2021
commit 5c60a1f35888920853a324353c9ccad90cf62e11
2 changes: 0 additions & 2 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,8 @@ class GroupByMethods:
params = [
["int", "float", "object", "datetime"],
[

"cummax",
"cummin",

],
["direct", "transformation"],
]
Expand Down
32 changes: 24 additions & 8 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1270,14 +1270,10 @@ def group_cummin_max(groupby_t[:, ::1] out,
Array to store cummin/max in.
values : array
Values to take cummin/max of.
mask : array[uint8_t]
    If `use_mask`, then indices represent missing values,
    otherwise will be passed as a zeroed array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
Expand Down Expand Up @@ -1320,14 +1316,16 @@ def group_cummin_max(groupby_t[:, ::1] out,
for j in range(K):
val_is_nan = False

# If using the mask, we can avoid grabbing the
# value unless necessary
if use_mask:
if mask[i, j]:

# `out` does not need to be set since it
# will be masked anyway
val_is_nan = True
else:

# If using the mask, we can avoid grabbing the
# value unless necessary
val = values[i, j]

# Otherwise, `out` must be set accordingly if the
Expand Down Expand Up @@ -1359,7 +1357,16 @@ def group_cummin(groupby_t[:, ::1] out,
bint is_datetimelike,
bint use_mask):
"""See group_cummin_max.__doc__"""
group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=False)
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
use_mask,
compute_max=False
)


@cython.boundscheck(False)
Expand All @@ -1372,4 +1379,13 @@ def group_cummax(groupby_t[:, ::1] out,
bint is_datetimelike,
bint use_mask):
"""See group_cummin_max.__doc__"""
group_cummin_max(out, values, mask, labels, ngroups, is_datetimelike, use_mask, compute_max=True)
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
use_mask,
compute_max=True
)
2 changes: 0 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ class providing the base-class of operations.
from pandas.core.arrays import (
Categorical,
ExtensionArray,
BaseMaskedArray
)
from pandas.core.base import (
DataError,
Expand All @@ -105,7 +104,6 @@ class providing the base-class of operations.
Index,
MultiIndex,
)
from pandas.core.groupby.ops import does_cython_function_use_mask
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
Expand Down
83 changes: 59 additions & 24 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
Tuple,
Type,
)
from pandas.core.arrays.masked import (
BaseMaskedDtype,
)

import numpy as np

Expand All @@ -40,7 +37,6 @@
FrameOrSeries,
Shape,
final,
ArrayLike
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
Expand Down Expand Up @@ -75,10 +71,11 @@
isna,
maybe_fill,
)
from pandas.core.arrays import (
BaseMaskedArray
)

from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
)
from pandas.core.base import SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -123,7 +120,11 @@
"cummax": "group_cummax",
"rank": "group_rank",
},
"needs_mask": {"cummin", "cummax"}
}

_CYTHON_MASKED_FUNCTIONS = {
"cummin",
"cummax",
}


Expand Down Expand Up @@ -163,8 +164,8 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool)
return func


def does_cython_function_use_mask(kind: str) -> bool:
return kind in _CYTHON_FUNCTIONS["needs_mask"]
def cython_function_uses_mask(kind: str) -> bool:
return kind in _CYTHON_MASKED_FUNCTIONS


class BaseGrouper:
Expand Down Expand Up @@ -590,8 +591,14 @@ def _ea_wrap_cython_operation(

@final
def _masked_ea_wrap_cython_operation(
self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
) -> ArrayLike:
self,
kind: str,
values: BaseMaskedArray,
how: str,
axis: int,
min_count: int = -1,
**kwargs,
) -> BaseMaskedArray:
"""
Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
and cython algorithms which accept a mask.
Expand All @@ -601,23 +608,33 @@ def _masked_ea_wrap_cython_operation(
# isna just directly returns self._mask, so copy here to prevent
# modifying the original
mask = isna(values).copy()
values = values._data
arr = values._data

if is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this an entirely different function? pls integrate to existing infra

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a different function because it's specific for MaskedArrays. Having this as a separate function is consistent with how it's currently implemented IMO (with a similar separate function for generic EAs).

It could also be another elif check in _ea_wrap_cython_operation, but it's not that it would result in less code or so, and since _ea_wrap_cython_operation already gets quite complicated, I think this separate function is good.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In an initial version I tried to fold this into _ea_wrap_cython_operation, but thought this smaller function was a cleaner solution since the conditionals here can remain much simpler. While adding a function for just supporting masked cummax, cummin seems wasteful, this infrastructure should extend to more masked groupby algos.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see my comments. This MUST integrate with the existing infrastructure (or refactor that). Duplicating is -1

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. The whole point of this PR is to add a special handling for masked arrays, to ensure to pass through the mask to the groupby cython algo. This will always add some code (and the code in _masked_ea_wrap_cython_operation is specific to masked arrays).
  2. IMO this is integrated with the existing infrastructure: it integrates nicely into the existing _cython_operation and follows the same pattern as we already have for _ea_wrap_cython_operation

If you don't like how the added code is structured right now, please do a concrete suggestion of how you would do it differently.

# IntegerArray or BooleanArray
values = ensure_int_or_float(values)
arr = ensure_int_or_float(arr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW im planning to kill off this function; for EAs this is always just arr.to_numpy(dtype="float64", na_value=np.nan)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for bringing up - realized this whole condition can be simplified since we actually have an ndarray at this point


res_values = self._cython_operation(
kind, values, how, axis, min_count, mask=mask, **kwargs
kind, arr, how, axis, min_count, mask=mask, **kwargs
)
dtype = maybe_cast_result_dtype(orig_values.dtype, how)
assert isinstance(dtype, BaseMaskedDtype)
cls = dtype.construct_array_type()

return cls(res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True))
return cls(
res_values.astype(dtype.type, copy=False), mask.astype(bool, copy=True)
)

@final
def _cython_operation(
self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: Optional[np.ndarray] = None, **kwargs
self,
kind: str,
values,
how: str,
axis: int,
min_count: int = -1,
mask: np.ndarray | None = None,
**kwargs,
) -> ArrayLike:
"""
Returns the values of a cython operation.
Expand All @@ -640,7 +657,7 @@ def _cython_operation(
self._disallow_invalid_ops(dtype, how, is_numeric)

if is_extension_array_dtype(dtype):
if isinstance(dtype, BaseMaskedDtype) and does_cython_function_use_mask(how):
if isinstance(values, BaseMaskedArray) and cython_function_uses_mask(how):
return self._masked_ea_wrap_cython_operation(
kind, values, how, axis, min_count, **kwargs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i really don't understand all of this code duplication. this is adding huge complexity. pls reduce it.

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jeff, did you actually read the previous responses to your similar comment? (https://github.com/pandas-dev/pandas/pull/40651/files#r603319910) Can you then please answer there to the concrete reasons given.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes and its a terrible pattern.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this duplication of code is ridiculous. We have a VERY large codebase. Having this kind of separate logic is amazingly confusing & is humungous tech debt. This is heavily used code and needs to be carefully modified.

Copy link
Member Author

@mzeitlin11 mzeitlin11 Apr 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the concern about adding code complexity - my thinking was that if the goal is for nullable types to become the default in pandas, then direct support makes sense. And in that case, nullable types would need to be special-cased somewhere, and I think the separate function is cleaner than interleaving in _ea_wrap_cython_operation.

If direct support for nullable dtypes is not desired, we can just close this. If it is, I'll keep trying to think of ways to achieve this without adding more code, but any suggestions there would be welcome!

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Proper support for nullable dtypes is certainly desired (how to add it exactly can of course be discussed), so thanks a lot @mzeitlin11 for your efforts here.

AFAIK, it's correct we need some special casing for it somewhere (that's the whole point of this PR is to add special handling for it).
Where exactly to put this special casing can of course be discussed, but to me the separate helper method instead of interleaving it in _ea_wrap_cython_operation seems good (I don't think that interleaving it into the existing _ea_wrap_cython_operation would result in fewer added lines of code (and would be harder to read)).

@jreback please try to stay constructive (eg answer to our arguments or provide concrete suggestions on where you would put it / how you would do it differently) and please mind your language (there is no need to call the approach taken by a contributor "terrible").

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I agree with @jorisvandenbossche on phrasing concerns. Even the best of us slip up here from time to time.

  2. if the goal is for nullable types to become the default in pandas

This decision has not been made.

  1. I think the separate function is cleaner than interleaving in _ea_wrap_cython_operation.

Agreed.

  1. My preferred dispatch logic would look something like:
def _cython_operation(...)
    if is_ea_dtype(...):
       return self. _ea_wrap_cython_operation(...)
    [status quo]

def _ea_wrap_cython_operation(...):
    if should_use_mask(...):
        return self._masked_ea_wrap_cython_operation(...)
    [status quo]

as Joris correctly pointed out, that is not viable ATM. I think a lot of this dispatch logic eventually belongs in WrappedCythonOp (which i've been vaguely planning on doing next time there aren't any open PRs touching this code), at which point we can reconsider flattening this

  1. My other preferred dispatch logic would not be in this file at all, but be implemented as a method on the EA subclass. I'm really uncomfortable with this code depending on MaskedArray implementation details, seeing as how there has been discussion of swapping them out for something arrow-based.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel if you plan further refactoring of this code, I'm happy to just mothball this pr for now. The real benefit won't come in until more groupby algos allow a mask on this path anyway, so not worth adding now if it's just going to cause more pain in future refactoring.

I also like the idea of approach 5 instead of this - could start looking into that if you think it's a promising direction.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you plan further refactoring of this code, I'm happy to just mothball this pr for now.

From today's call, I think the plan is to move forward with this first.

I also like the idea of approach 5 instead of this - could start looking into that if you think it's a promising direction.

Long-term I think this is the right way to go to get the general case right, so I'd encourage you if you're interested in trying to implement this on the EA- separate PR(s).

)
Expand Down Expand Up @@ -689,7 +706,9 @@ def _cython_operation(
)
out_shape = (self.ngroups,) + values.shape[1:]

func, values, needs_mask = self._get_cython_func_and_vals(kind, how, values, is_numeric)
func, values, needs_mask = self._get_cython_func_and_vals(
kind, how, values, is_numeric
)
use_mask = mask is not None
if needs_mask:
if mask is None:
Expand All @@ -716,10 +735,10 @@ def _cython_operation(
)

if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
mask = result == iNaT
if mask.any():
result_mask = result == iNaT
if result_mask.any():
result = result.astype("float64")
result[mask] = np.nan
result[result_mask] = np.nan

if kind == "aggregate" and self._filter_empty_groups and not counts.all():
assert result.ndim != 2
Expand Down Expand Up @@ -755,12 +774,28 @@ def _aggregate(

@final
def _transform(
self, result: np.ndarray, values: np.ndarray, transform_func, is_datetimelike: bool, use_mask: bool, mask: np.ndarray | None, **kwargs
self,
result: np.ndarray,
values: np.ndarray,
transform_func,
is_datetimelike: bool,
use_mask: bool,
mask: np.ndarray | None,
**kwargs,
) -> np.ndarray:

comp_ids, _, ngroups = self.group_info
if mask is not None:
transform_func(result, values, mask, comp_ids, ngroups, is_datetimelike, use_mask, **kwargs)
transform_func(
result,
values,
mask,
comp_ids,
ngroups,
is_datetimelike,
use_mask,
**kwargs,
)
else:
transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ def dtypes_for_minmax(request):
np_type = np.float64

min_val = (
np.iinfo(np_type).min if np.dtype(np_type).kind == "i" else np.finfo(np_type).min
np.iinfo(np_type).min
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).min
)
max_val = (
np.iinfo(np_type).max if np.dtype(np_type).kind == "i" else np.finfo(np_type).max
np.iinfo(np_type).max
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).max
)

return (dtype, min_val, max_val)
Expand Down Expand Up @@ -855,11 +859,11 @@ def test_cummax(dtypes_for_minmax):
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
data = [val, pd.NA]
df = pd.DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
grouped = df.groupby("grp")

result = grouped.transform(method)
expected = pd.DataFrame({"b": data}, dtype=dtype)
expected = DataFrame({"b": data}, dtype=dtype)

tm.assert_frame_equal(result, expected)

Expand Down