Skip to content

Commit f769f6b

Browse files
ARROW-18173: [Python] Drop older versions of Pandas (<1.0) (apache#14631)
This PR drops support for older versions of pandas (< 1.0); only pandas >= 1.0.0 is supported from now on. Changes were made in:

- [x] the official documentation (pandas version support)
- [x] the CI jobs supporting older pandas versions
- [x] https://github.com/apache/arrow/blob/master/python/pyarrow/pandas-shim.pxi
- [x] tests that are specifically testing features on older versions of pandas

Lead-authored-by: Alenka Frim <frim.alenka@gmail.com>
Co-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
1 parent 3cc982e commit f769f6b

10 files changed

Lines changed: 92 additions & 177 deletions

File tree

.github/workflows/python.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
name:
5555
- conda-python-docs
5656
- conda-python-3.8-nopandas
57-
- conda-python-3.7-pandas-0.23
57+
- conda-python-3.7-pandas-1.0
5858
- conda-python-3.9-pandas-latest
5959
include:
6060
- name: conda-python-docs
@@ -67,12 +67,12 @@ jobs:
6767
image: conda-python
6868
title: AMD64 Conda Python 3.8 Without Pandas
6969
python: 3.8
70-
- name: conda-python-3.7-pandas-0.23
70+
- name: conda-python-3.7-pandas-1.0
7171
cache: conda-python-3.7
7272
image: conda-python-pandas
73-
title: AMD64 Conda Python 3.7 Pandas 0.23
73+
title: AMD64 Conda Python 3.7 Pandas 1.0
7474
python: 3.7
75-
pandas: 0.23
75+
pandas: 1.0
7676
numpy: 1.16
7777
- name: conda-python-3.9-pandas-latest
7878
cache: conda-python-3.9

docs/source/python/install.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,18 @@ Installing from source
6161
----------------------
6262

6363
See :ref:`python-development`.
64+
65+
Dependencies
66+
------------
67+
68+
Required dependency
69+
70+
* **NumPy 1.16.6** or higher.
71+
72+
Optional dependencies
73+
74+
* **pandas 1.0** or higher,
75+
* **cffi**.
76+
77+
Additional packages PyArrow is compatible with are :ref:`fsspec <filesystem-fsspec>`
78+
and the **pytz**, **dateutil** or **tzdata** packages for timezones.

python/pyarrow/feather.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,6 @@
2424
import pyarrow.lib as ext
2525
from pyarrow import _feather
2626
from pyarrow._feather import FeatherError # noqa: F401
27-
from pyarrow.vendored.version import Version
28-
29-
30-
def _check_pandas_version():
31-
if _pandas_api.loose_version < Version('0.17.0'):
32-
raise ImportError("feather requires pandas >= 0.17.0")
3327

3428

3529
class FeatherDataset:
@@ -96,7 +90,6 @@ def read_pandas(self, columns=None, use_threads=True):
9690
pandas.DataFrame
9791
Content of the file as a pandas DataFrame (of columns)
9892
"""
99-
_check_pandas_version()
10093
return self.read_table(columns=columns).to_pandas(
10194
use_threads=use_threads)
10295

@@ -145,7 +138,6 @@ def write_feather(df, dest, compression=None, compression_level=None,
145138
limited legacy format
146139
"""
147140
if _pandas_api.have_pandas:
148-
_check_pandas_version()
149141
if (_pandas_api.has_sparse and
150142
isinstance(df, _pandas_api.pd.SparseDataFrame)):
151143
df = df.to_dense()
@@ -230,7 +222,6 @@ def read_feather(source, columns=None, use_threads=True,
230222
-------
231223
df : pandas.DataFrame
232224
"""
233-
_check_pandas_version()
234225
return (read_table(
235226
source, columns=columns, memory_map=memory_map,
236227
use_threads=use_threads).to_pandas(use_threads=use_threads, **kwargs))

python/pyarrow/pandas-shim.pxi

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,16 @@ cdef class _PandasAPIShim(object):
5959
self._version = pd.__version__
6060
self._loose_version = Version(pd.__version__)
6161

62-
if self._loose_version < Version('0.23.0'):
62+
if self._loose_version < Version('1.0.0'):
6363
self._have_pandas = False
6464
if raise_:
6565
raise ImportError(
66-
"pyarrow requires pandas 0.23.0 or above, pandas {} is "
66+
"pyarrow requires pandas 1.0.0 or above, pandas {} is "
6767
"installed".format(self._version)
6868
)
6969
else:
7070
warnings.warn(
71-
"pyarrow requires pandas 0.23.0 or above, pandas {} is "
71+
"pyarrow requires pandas 1.0.0 or above, pandas {} is "
7272
"installed. Therefore, pandas-specific integration is not "
7373
"used.".format(self._version), stacklevel=2)
7474
return
@@ -83,22 +83,12 @@ cdef class _PandasAPIShim(object):
8383
self._series, self._index, self._categorical_type,
8484
self._extension_array)
8585
self._extension_dtype = pd.api.extensions.ExtensionDtype
86-
if self._loose_version >= Version('0.24.0'):
87-
self._is_extension_array_dtype = \
88-
pd.api.types.is_extension_array_dtype
89-
else:
90-
self._is_extension_array_dtype = None
91-
86+
self._is_extension_array_dtype = (
87+
pd.api.types.is_extension_array_dtype)
9288
self._types_api = pd.api.types
9389
self._datetimetz_type = pd.api.types.DatetimeTZDtype
9490
self._have_pandas = True
95-
96-
if self._loose_version > Version('0.25'):
97-
self.has_sparse = False
98-
else:
99-
self.has_sparse = True
100-
101-
self._pd024 = self._loose_version >= Version('0.24')
91+
self.has_sparse = False
10292

10393
cdef inline _check_import(self, bint raise_=True):
10494
if self._tried_importing_pandas:
@@ -232,10 +222,7 @@ cdef class _PandasAPIShim(object):
232222
self._check_import()
233223
if isinstance(obj.dtype, (self.pd.api.types.IntervalDtype,
234224
self.pd.api.types.PeriodDtype)):
235-
if self._pd024:
236-
# only since pandas 0.24, interval and period are stored as
237-
# such in Series
238-
return obj.array
225+
return obj.array
239226
return obj.values
240227

241228
def assert_frame_equal(self, *args, **kwargs):

python/pyarrow/pandas_compat.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,9 +1089,8 @@ def _pandas_type_to_numpy_type(pandas_type):
10891089

10901090

10911091
def _get_multiindex_codes(mi):
1092-
# compat for pandas < 0.24 (MI labels renamed to codes).
10931092
if isinstance(mi, _pandas_api.pd.MultiIndex):
1094-
return mi.codes if hasattr(mi, 'codes') else mi.labels
1093+
return mi.codes
10951094
else:
10961095
return None
10971096

python/pyarrow/tests/parquet/test_dataset.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -250,13 +250,11 @@ def test_filters_equivalency(tempdir, use_legacy_dataset):
250250
result_df = table.to_pandas().reset_index(drop=True)
251251

252252
# Check that all rows in the DF fulfill the filter
253-
# Pandas 0.23.x has problems with indexing constant memoryviews in
254-
# categoricals. Thus we need to make an explicit copy here with np.array.
255-
df_filter_1 = (np.array(result_df['integer']) == 1) \
256-
& (np.array(result_df['string']) != 'b') \
257-
& (np.array(result_df['boolean']) == 'True')
253+
df_filter_1 = (result_df['integer'] == 1) \
254+
& (result_df['string'] != 'b') \
255+
& (result_df['boolean'] == 'True')
258256
df_filter_2 = (np.array(result_df['integer']) == 0) \
259-
& (np.array(result_df['boolean']) == 'False')
257+
& (result_df['boolean'] == 'False')
260258
assert df_filter_1.sum() > 0
261259
assert df_filter_2.sum() > 0
262260
assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum())

python/pyarrow/tests/parquet/test_pandas.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from pyarrow.tests.parquet.common import (
2727
parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
2828
from pyarrow.util import guid
29-
from pyarrow.vendored.version import Version
3029

3130
try:
3231
import pyarrow.parquet as pq
@@ -561,10 +560,6 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset):
561560
def test_write_to_dataset_pandas_preserve_extensiondtypes(
562561
tempdir, use_legacy_dataset
563562
):
564-
# ARROW-8251 - preserve pandas extension dtypes in roundtrip
565-
if Version(pd.__version__) < Version("1.0.0"):
566-
pytest.skip("__arrow_array__ added to pandas in 1.0.0")
567-
568563
df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
569564
df['col'] = df['col'].astype("Int64")
570565
table = pa.table(df)

python/pyarrow/tests/test_compute.py

Lines changed: 31 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,14 +1812,6 @@ def test_strptime():
18121812
@pytest.mark.skipif(sys.platform == 'win32',
18131813
reason="Timezone database is not available on Windows yet")
18141814
def test_strftime():
1815-
from pyarrow.vendored.version import Version
1816-
1817-
def _fix_timestamp(s):
1818-
if Version(pd.__version__) < Version("1.0.0"):
1819-
return s.to_series().replace("NaT", pd.NaT)
1820-
else:
1821-
return s
1822-
18231815
times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
18241816
timezones = ["CET", "UTC", "Europe/Ljubljana"]
18251817

@@ -1834,50 +1826,51 @@ def _fix_timestamp(s):
18341826
for fmt in formats:
18351827
options = pc.StrftimeOptions(fmt)
18361828
result = pc.strftime(tsa, options=options)
1837-
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1829+
expected = pa.array(ts.strftime(fmt))
18381830
assert result.equals(expected)
18391831

18401832
fmt = "%Y-%m-%dT%H:%M:%S"
18411833

18421834
# Default format
18431835
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
18441836
result = pc.strftime(tsa, options=pc.StrftimeOptions())
1845-
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1837+
expected = pa.array(ts.strftime(fmt))
18461838
assert result.equals(expected)
18471839

18481840
# Default format plus timezone
18491841
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
18501842
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
1851-
expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
1843+
expected = pa.array(ts.strftime(fmt + "%Z"))
18521844
assert result.equals(expected)
18531845

18541846
# Pandas %S is equivalent to %S in arrow for unit="s"
18551847
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
18561848
options = pc.StrftimeOptions("%S")
18571849
result = pc.strftime(tsa, options=options)
1858-
expected = pa.array(_fix_timestamp(ts.strftime("%S")))
1850+
expected = pa.array(ts.strftime("%S"))
18591851
assert result.equals(expected)
18601852

18611853
# Pandas %S.%f is equivalent to %S in arrow for unit="us"
18621854
tsa = pa.array(ts, type=pa.timestamp("us", timezone))
18631855
options = pc.StrftimeOptions("%S")
18641856
result = pc.strftime(tsa, options=options)
1865-
expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
1857+
expected = pa.array(ts.strftime("%S.%f"))
18661858
assert result.equals(expected)
18671859

18681860
# Test setting locale
18691861
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
18701862
options = pc.StrftimeOptions(fmt, locale="C")
18711863
result = pc.strftime(tsa, options=options)
1872-
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1864+
expected = pa.array(ts.strftime(fmt))
18731865
assert result.equals(expected)
18741866

18751867
# Test timestamps without timezone
18761868
fmt = "%Y-%m-%dT%H:%M:%S"
18771869
ts = pd.to_datetime(times)
18781870
tsa = pa.array(ts, type=pa.timestamp("s"))
18791871
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
1880-
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
1872+
expected = pa.array(ts.strftime(fmt))
1873+
18811874
# Positional format
18821875
assert pc.strftime(tsa, fmt) == result
18831876

@@ -1956,8 +1949,6 @@ def _check_datetime_components(timestamps, timezone=None):
19561949

19571950
@pytest.mark.pandas
19581951
def test_extract_datetime_components():
1959-
from pyarrow.vendored.version import Version
1960-
19611952
timestamps = ["1970-01-01T00:00:59.123456789",
19621953
"2000-02-29T23:23:23.999999999",
19631954
"2033-05-18T03:33:20.000000000",
@@ -1983,8 +1974,6 @@ def test_extract_datetime_components():
19831974
if sys.platform == 'win32':
19841975
# TODO: We should test on windows once ARROW-13168 is resolved.
19851976
pytest.skip('Timezone database is not available on Windows yet')
1986-
elif Version(pd.__version__) < Version('1.0.0'):
1987-
pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
19881977
else:
19891978
for timezone in timezones:
19901979
_check_datetime_components(timestamps, timezone)
@@ -1995,8 +1984,6 @@ def test_extract_datetime_components():
19951984
@pytest.mark.skipif(sys.platform == 'win32',
19961985
reason="Timezone database is not available on Windows yet")
19971986
def test_assume_timezone():
1998-
from pyarrow.vendored.version import Version
1999-
20001987
ts_type = pa.timestamp("ns")
20011988
timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
20021989
"2000-02-29T23:23:23.999999999",
@@ -2040,31 +2027,29 @@ def test_assume_timezone():
20402027

20412028
timezone = "Europe/Brussels"
20422029

2043-
# nonexistent parameter was introduced in Pandas 0.24.0
2044-
if Version(pd.__version__) >= Version("0.24.0"):
2045-
options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
2046-
options_nonexistent_earliest = pc.AssumeTimezoneOptions(
2047-
timezone, ambiguous="raise", nonexistent="earliest")
2048-
options_nonexistent_latest = pc.AssumeTimezoneOptions(
2049-
timezone, ambiguous="raise", nonexistent="latest")
2050-
2051-
with pytest.raises(ValueError,
2052-
match="Timestamp doesn't exist in "
2053-
f"timezone '{timezone}'"):
2054-
pc.assume_timezone(nonexistent_array,
2055-
options=options_nonexistent_raise)
2056-
2057-
expected = pa.array(nonexistent.tz_localize(
2058-
timezone, nonexistent="shift_forward"))
2059-
result = pc.assume_timezone(
2060-
nonexistent_array, options=options_nonexistent_latest)
2061-
expected.equals(result)
2062-
2063-
expected = pa.array(nonexistent.tz_localize(
2064-
timezone, nonexistent="shift_backward"))
2065-
result = pc.assume_timezone(
2066-
nonexistent_array, options=options_nonexistent_earliest)
2067-
expected.equals(result)
2030+
options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
2031+
options_nonexistent_earliest = pc.AssumeTimezoneOptions(
2032+
timezone, ambiguous="raise", nonexistent="earliest")
2033+
options_nonexistent_latest = pc.AssumeTimezoneOptions(
2034+
timezone, ambiguous="raise", nonexistent="latest")
2035+
2036+
with pytest.raises(ValueError,
2037+
match="Timestamp doesn't exist in "
2038+
f"timezone '{timezone}'"):
2039+
pc.assume_timezone(nonexistent_array,
2040+
options=options_nonexistent_raise)
2041+
2042+
expected = pa.array(nonexistent.tz_localize(
2043+
timezone, nonexistent="shift_forward"))
2044+
result = pc.assume_timezone(
2045+
nonexistent_array, options=options_nonexistent_latest)
2046+
expected.equals(result)
2047+
2048+
expected = pa.array(nonexistent.tz_localize(
2049+
timezone, nonexistent="shift_backward"))
2050+
result = pc.assume_timezone(
2051+
nonexistent_array, options=options_nonexistent_earliest)
2052+
expected.equals(result)
20682053

20692054
options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
20702055
options_ambiguous_latest = pc.AssumeTimezoneOptions(
@@ -2199,11 +2184,6 @@ def _check_temporal_rounding(ts, values, unit):
21992184
"second", "minute", "hour", "day"))
22002185
@pytest.mark.pandas
22012186
def test_round_temporal(unit):
2202-
from pyarrow.vendored.version import Version
2203-
2204-
if Version(pd.__version__) < Version('1.0.0'):
2205-
pytest.skip('Pandas < 1.0 rounds differently.')
2206-
22072187
values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750)
22082188
timestamps = [
22092189
"1923-07-07 08:52:35.203790336",

0 commit comments

Comments
 (0)