Skip to content

Commit 39d5858

Browse files
committed
Merge remote-tracking branch 'upstream/main' into groupby_cumprod_mask
# Conflicts: # doc/source/whatsnew/v1.6.0.rst
2 parents 36a2edc + 84fa883 commit 39d5858

File tree

38 files changed

+452
-166
lines changed

38 files changed

+452
-166
lines changed

.github/workflows/python-dev.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,10 @@ jobs:
8080
python -m pip install python-dateutil pytz cython hypothesis==6.52.1 pytest>=6.2.5 pytest-xdist pytest-cov pytest-asyncio>=0.17
8181
python -m pip list
8282
83+
# GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs
8384
- name: Build Pandas
8485
run: |
85-
python setup.py build_ext -q -j2
86+
python setup.py build_ext -q -j1
8687
python -m pip install -e . --no-build-isolation --no-use-pep517
8788
8889
- name: Build Version

asv_bench/benchmarks/array.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def time_from_float_array(self):
3232

3333
class IntegerArray:
3434
def setup(self):
35-
self.values_integer = np.array([1, 0, 1, 0])
36-
self.data = np.array([1, 2, 3, 4], dtype="int64")
37-
self.mask = np.array([False, False, True, False])
35+
N = 250_000
36+
self.values_integer = np.array([1, 0, 1, 0] * N)
37+
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
38+
self.mask = np.array([False, False, True, False] * N)
3839

3940
def time_constructor(self):
4041
pd.arrays.IntegerArray(self.data, self.mask)

asv_bench/benchmarks/multiindex_object.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
MultiIndex,
99
RangeIndex,
1010
Series,
11+
array,
1112
date_range,
1213
)
1314

@@ -176,6 +177,20 @@ def time_sortlevel_one(self):
176177
self.mi.sortlevel(1)
177178

178179

180+
class SortValues:
181+
182+
params = ["int64", "Int64"]
183+
param_names = ["dtype"]
184+
185+
def setup(self, dtype):
186+
a = array(np.tile(np.arange(100), 1000), dtype=dtype)
187+
b = array(np.tile(np.arange(1000), 100), dtype=dtype)
188+
self.mi = MultiIndex.from_arrays([a, b])
189+
190+
def time_sort_values(self, dtype):
191+
self.mi.sort_values()
192+
193+
179194
class Values:
180195
def setup_cache(self):
181196

asv_bench/benchmarks/series_methods.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44

55
from pandas import (
6+
NA,
67
Index,
78
NaT,
89
Series,
@@ -166,6 +167,19 @@ def time_value_counts(self, N, dtype):
166167
self.s.value_counts()
167168

168169

170+
class ValueCountsEA:
171+
172+
params = [[10**3, 10**4, 10**5], [True, False]]
173+
param_names = ["N", "dropna"]
174+
175+
def setup(self, N, dropna):
176+
self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64")
177+
self.s.loc[1] = NA
178+
179+
def time_value_counts(self, N, dropna):
180+
self.s.value_counts(dropna=dropna)
181+
182+
169183
class ValueCountsObjectDropNAFalse:
170184

171185
params = [10**3, 10**4, 10**5]

doc/source/whatsnew/v1.6.0.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ enhancement2
2828

2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
31+
- :meth:`.GroupBy.quantile` now preserves nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
3132
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
33+
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
3234
-
3335

3436
.. ---------------------------------------------------------------------------
@@ -125,7 +127,10 @@ Deprecations
125127
Performance improvements
126128
~~~~~~~~~~~~~~~~~~~~~~~~
127129
- Performance improvement in :meth:`.GroupBy.median` and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
130+
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
128131
- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
132+
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
133+
- Performance improvement for the :class:`Series` constructor when passing an integer numpy array with a nullable dtype (:issue:`48338`)
129134
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
130135
-
131136

@@ -189,6 +194,7 @@ Missing
189194
MultiIndex
190195
^^^^^^^^^^
191196
- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`)
197+
- Bug in :meth:`MultiIndex.union` losing extension array dtype (:issue:`48498`)
192198
- Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
193199
-
194200

@@ -209,12 +215,12 @@ Plotting
209215

210216
Groupby/resample/rolling
211217
^^^^^^^^^^^^^^^^^^^^^^^^
212-
-
218+
- Bug in :meth:`DataFrameGroupBy.sample` raising ``ValueError`` when the object is empty (:issue:`48459`)
213219
-
214220

215221
Reshaping
216222
^^^^^^^^^
217-
-
223+
- Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`)
218224
-
219225

220226
Sparse

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def group_quantile(
112112
sort_indexer: npt.NDArray[np.intp], # const
113113
qs: npt.NDArray[np.float64], # const
114114
interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
115+
result_mask: np.ndarray | None = ...,
115116
) -> None: ...
116117
def group_last(
117118
out: np.ndarray, # rank_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,7 @@ def group_quantile(
11091109
const intp_t[:] sort_indexer,
11101110
const float64_t[:] qs,
11111111
str interpolation,
1112+
uint8_t[:, ::1] result_mask=None,
11121113
) -> None:
11131114
"""
11141115
Calculate the quantile per group.
@@ -1139,6 +1140,7 @@ def group_quantile(
11391140
InterpolationEnumType interp
11401141
float64_t q_val, q_idx, frac, val, next_val
11411142
int64_t[::1] counts, non_na_counts
1143+
bint uses_result_mask = result_mask is not None
11421144

11431145
assert values.shape[0] == N
11441146

@@ -1181,7 +1183,10 @@ def group_quantile(
11811183

11821184
if non_na_sz == 0:
11831185
for k in range(nqs):
1184-
out[i, k] = NaN
1186+
if uses_result_mask:
1187+
result_mask[i, k] = 1
1188+
else:
1189+
out[i, k] = NaN
11851190
else:
11861191
for k in range(nqs):
11871192
q_val = qs[k]

pandas/_libs/testing.pyx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ cpdef assert_almost_equal(a, b,
9191
Py_ssize_t i, na, nb
9292
double fa, fb
9393
bint is_unequal = False, a_is_ndarray, b_is_ndarray
94+
str first_diff = ''
9495

9596
if lobj is None:
9697
lobj = a
@@ -159,12 +160,14 @@ cpdef assert_almost_equal(a, b,
159160
except AssertionError:
160161
is_unequal = True
161162
diff += 1
163+
if not first_diff:
164+
first_diff = f"At positional index {i}, first diff: {a[i]} != {b[i]}"
162165

163166
if is_unequal:
164167
from pandas._testing import raise_assert_detail
165168
msg = (f"{obj} values are different "
166169
f"({np.round(diff * 100.0 / na, 5)} %)")
167-
raise_assert_detail(obj, msg, lobj, robj, index_values=index_values)
170+
raise_assert_detail(obj, msg, lobj, robj, first_diff=first_diff, index_values=index_values)
168171

169172
return True
170173

pandas/_testing/asserters.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,9 @@ def assert_timedelta_array_equal(
639639
assert_attr_equal("freq", left, right, obj=obj)
640640

641641

642-
def raise_assert_detail(obj, message, left, right, diff=None, index_values=None):
642+
def raise_assert_detail(
643+
obj, message, left, right, diff=None, first_diff=None, index_values=None
644+
):
643645
__tracebackhide__ = True
644646

645647
msg = f"""{obj} are different
@@ -674,6 +676,9 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)
674676
if diff is not None:
675677
msg += f"\n[diff]: {diff}"
676678

679+
if first_diff is not None:
680+
msg += f"\n{first_diff}"
681+
677682
raise AssertionError(msg)
678683

679684

pandas/conftest.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1569,6 +1569,31 @@ def any_real_numpy_dtype(request):
15691569
return request.param
15701570

15711571

1572+
@pytest.fixture(
1573+
params=tm.ALL_REAL_NUMPY_DTYPES + tm.ALL_INT_EA_DTYPES + tm.FLOAT_EA_DTYPES
1574+
)
1575+
def any_real_numeric_dtype(request):
1576+
"""
1577+
Parameterized fixture for any (purely) real numeric dtype.
1578+
1579+
* int
1580+
* 'int8'
1581+
* 'uint8'
1582+
* 'int16'
1583+
* 'uint16'
1584+
* 'int32'
1585+
* 'uint32'
1586+
* 'int64'
1587+
* 'uint64'
1588+
* float
1589+
* 'float32'
1590+
* 'float64'
1591+
1592+
and associated ea dtypes.
1593+
"""
1594+
return request.param
1595+
1596+
15721597
@pytest.fixture(params=tm.ALL_NUMPY_DTYPES)
15731598
def any_numpy_dtype(request):
15741599
"""
@@ -1606,6 +1631,45 @@ def any_numpy_dtype(request):
16061631
return request.param
16071632

16081633

1634+
@pytest.fixture(
1635+
params=tm.ALL_REAL_NUMPY_DTYPES
1636+
+ tm.COMPLEX_DTYPES
1637+
+ tm.ALL_INT_EA_DTYPES
1638+
+ tm.FLOAT_EA_DTYPES
1639+
)
1640+
def any_numeric_dtype(request):
1641+
"""
1642+
Parameterized fixture for all numeric dtypes.
1643+
1644+
* int
1645+
* 'int8'
1646+
* 'uint8'
1647+
* 'int16'
1648+
* 'uint16'
1649+
* 'int32'
1650+
* 'uint32'
1651+
* 'int64'
1652+
* 'uint64'
1653+
* float
1654+
* 'float32'
1655+
* 'float64'
1656+
* complex
1657+
* 'complex64'
1658+
* 'complex128'
1659+
* 'UInt8'
1660+
* 'Int8'
1661+
* 'UInt16'
1662+
* 'Int16'
1663+
* 'UInt32'
1664+
* 'Int32'
1665+
* 'UInt64'
1666+
* 'Int64'
1667+
* 'Float32'
1668+
* 'Float64'
1669+
"""
1670+
return request.param
1671+
1672+
16091673
# categoricals are handled separately
16101674
_any_skipna_inferred_dtype = [
16111675
("string", ["a", np.nan, "c"]),

0 commit comments

Comments
 (0)