ARROW-18173: [Python] Drop older versions of Pandas (<1.0) (apache#14631)

AlenkaF · jorisvandenbossche · web-flow · commit f769f6b32373 · 2022-11-22T11:54:38.000+01:00
This PR tries to make changes to drop older versions of pandas and support versions >= 1.0.0. The changes will have to be done in: - [x] the official documentation (pandas version support) - [x] the CI jobs supporting older pandas versions - [x] https://github.com/apache/arrow/blob/master/python/pyarrow/pandas-shim.pxi - [x] tests that are specifically testing features on older versions of pandas Lead-authored-by: Alenka Frim <frim.alenka@gmail.com> Co-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com> Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -54,7 +54,7 @@ jobs:
         name:
           - conda-python-docs
           - conda-python-3.8-nopandas
-          - conda-python-3.7-pandas-0.23
+          - conda-python-3.7-pandas-1.0
           - conda-python-3.9-pandas-latest
         include:
           - name: conda-python-docs
@@ -67,12 +67,12 @@ jobs:
             image: conda-python
             title: AMD64 Conda Python 3.8 Without Pandas
             python: 3.8
-          - name: conda-python-3.7-pandas-0.23
+          - name: conda-python-3.7-pandas-1.0
             cache: conda-python-3.7
             image: conda-python-pandas
-            title: AMD64 Conda Python 3.7 Pandas 0.23
+            title: AMD64 Conda Python 3.7 Pandas 1.0
             python: 3.7
-            pandas: 0.23
+            pandas: 1.0
             numpy: 1.16
           - name: conda-python-3.9-pandas-latest
             cache: conda-python-3.9
diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst
@@ -61,3 +61,18 @@ Installing from source
 ----------------------
 
 See :ref:`python-development`.
+
+Dependencies
+------------
+
+Required dependency
+
+* **NumPy 1.16.6** or higher.
+
+Optional dependencies
+
+* **pandas 1.0** or higher,
+* **cffi**.
+
+Additional packages PyArrow is compatible with are :ref:`fsspec <filesystem-fsspec>`
+and **pytz**, **dateutil** or **tzdata** package for timezones.
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
@@ -24,12 +24,6 @@
 import pyarrow.lib as ext
 from pyarrow import _feather
 from pyarrow._feather import FeatherError  # noqa: F401
-from pyarrow.vendored.version import Version
-
-
-def _check_pandas_version():
-    if _pandas_api.loose_version < Version('0.17.0'):
-        raise ImportError("feather requires pandas >= 0.17.0")
 
 
 class FeatherDataset:
@@ -96,7 +90,6 @@ def read_pandas(self, columns=None, use_threads=True):
         pandas.DataFrame
             Content of the file as a pandas DataFrame (of columns)
         """
-        _check_pandas_version()
         return self.read_table(columns=columns).to_pandas(
             use_threads=use_threads)
 
@@ -145,7 +138,6 @@ def write_feather(df, dest, compression=None, compression_level=None,
         limited legacy format
     """
     if _pandas_api.have_pandas:
-        _check_pandas_version()
         if (_pandas_api.has_sparse and
                 isinstance(df, _pandas_api.pd.SparseDataFrame)):
             df = df.to_dense()
@@ -230,7 +222,6 @@ def read_feather(source, columns=None, use_threads=True,
     -------
     df : pandas.DataFrame
     """
-    _check_pandas_version()
     return (read_table(
         source, columns=columns, memory_map=memory_map,
         use_threads=use_threads).to_pandas(use_threads=use_threads, **kwargs))
diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
@@ -59,16 +59,16 @@ cdef class _PandasAPIShim(object):
         self._version = pd.__version__
         self._loose_version = Version(pd.__version__)
 
-        if self._loose_version < Version('0.23.0'):
+        if self._loose_version < Version('1.0.0'):
             self._have_pandas = False
             if raise_:
                 raise ImportError(
-                    "pyarrow requires pandas 0.23.0 or above, pandas {} is "
+                    "pyarrow requires pandas 1.0.0 or above, pandas {} is "
                     "installed".format(self._version)
                 )
             else:
                 warnings.warn(
-                    "pyarrow requires pandas 0.23.0 or above, pandas {} is "
+                    "pyarrow requires pandas 1.0.0 or above, pandas {} is "
                     "installed. Therefore, pandas-specific integration is not "
                     "used.".format(self._version), stacklevel=2)
                 return
@@ -83,22 +83,12 @@ cdef class _PandasAPIShim(object):
             self._series, self._index, self._categorical_type,
             self._extension_array)
         self._extension_dtype = pd.api.extensions.ExtensionDtype
-        if self._loose_version >= Version('0.24.0'):
-            self._is_extension_array_dtype = \
-                pd.api.types.is_extension_array_dtype
-        else:
-            self._is_extension_array_dtype = None
-
+        self._is_extension_array_dtype = (
+            pd.api.types.is_extension_array_dtype)
         self._types_api = pd.api.types
         self._datetimetz_type = pd.api.types.DatetimeTZDtype
         self._have_pandas = True
-
-        if self._loose_version > Version('0.25'):
-            self.has_sparse = False
-        else:
-            self.has_sparse = True
-
-        self._pd024 = self._loose_version >= Version('0.24')
+        self.has_sparse = False
 
     cdef inline _check_import(self, bint raise_=True):
         if self._tried_importing_pandas:
@@ -232,10 +222,7 @@ cdef class _PandasAPIShim(object):
         self._check_import()
         if isinstance(obj.dtype, (self.pd.api.types.IntervalDtype,
                                   self.pd.api.types.PeriodDtype)):
-            if self._pd024:
-                # only since pandas 0.24, interval and period are stored as
-                # such in Series
-                return obj.array
+            return obj.array
         return obj.values
 
     def assert_frame_equal(self, *args, **kwargs):
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -1089,9 +1089,8 @@ def _pandas_type_to_numpy_type(pandas_type):
 
 
 def _get_multiindex_codes(mi):
-    # compat for pandas < 0.24 (MI labels renamed to codes).
     if isinstance(mi, _pandas_api.pd.MultiIndex):
-        return mi.codes if hasattr(mi, 'codes') else mi.labels
+        return mi.codes
     else:
         return None
 
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
@@ -250,13 +250,11 @@ def test_filters_equivalency(tempdir, use_legacy_dataset):
     result_df = table.to_pandas().reset_index(drop=True)
 
     # Check that all rows in the DF fulfill the filter
-    # Pandas 0.23.x has problems with indexing constant memoryviews in
-    # categoricals. Thus we need to make an explicit copy here with np.array.
-    df_filter_1 = (np.array(result_df['integer']) == 1) \
-        & (np.array(result_df['string']) != 'b') \
-        & (np.array(result_df['boolean']) == 'True')
+    df_filter_1 = (result_df['integer'] == 1) \
+        & (result_df['string'] != 'b') \
+        & (result_df['boolean'] == 'True')
     df_filter_2 = (np.array(result_df['integer']) == 0) \
-        & (np.array(result_df['boolean']) == 'False')
+        & (result_df['boolean'] == 'False')
     assert df_filter_1.sum() > 0
     assert df_filter_2.sum() > 0
     assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum())
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
@@ -26,7 +26,6 @@
 from pyarrow.tests.parquet.common import (
     parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
 from pyarrow.util import guid
-from pyarrow.vendored.version import Version
 
 try:
     import pyarrow.parquet as pq
@@ -561,10 +560,6 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset):
 def test_write_to_dataset_pandas_preserve_extensiondtypes(
     tempdir, use_legacy_dataset
 ):
-    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
-    if Version(pd.__version__) < Version("1.0.0"):
-        pytest.skip("__arrow_array__ added to pandas in 1.0.0")
-
     df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
     df['col'] = df['col'].astype("Int64")
     table = pa.table(df)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
@@ -1812,14 +1812,6 @@ def test_strptime():
 @pytest.mark.skipif(sys.platform == 'win32',
                     reason="Timezone database is not available on Windows yet")
 def test_strftime():
-    from pyarrow.vendored.version import Version
-
-    def _fix_timestamp(s):
-        if Version(pd.__version__) < Version("1.0.0"):
-            return s.to_series().replace("NaT", pd.NaT)
-        else:
-            return s
-
     times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
     timezones = ["CET", "UTC", "Europe/Ljubljana"]
 
@@ -1834,50 +1826,51 @@ def _fix_timestamp(s):
             for fmt in formats:
                 options = pc.StrftimeOptions(fmt)
                 result = pc.strftime(tsa, options=options)
-                expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+                expected = pa.array(ts.strftime(fmt))
                 assert result.equals(expected)
 
         fmt = "%Y-%m-%dT%H:%M:%S"
 
         # Default format
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         result = pc.strftime(tsa, options=pc.StrftimeOptions())
-        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+        expected = pa.array(ts.strftime(fmt))
         assert result.equals(expected)
 
         # Default format plus timezone
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
-        expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
+        expected = pa.array(ts.strftime(fmt + "%Z"))
         assert result.equals(expected)
 
         # Pandas %S is equivalent to %S in arrow for unit="s"
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         options = pc.StrftimeOptions("%S")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(_fix_timestamp(ts.strftime("%S")))
+        expected = pa.array(ts.strftime("%S"))
         assert result.equals(expected)
 
         # Pandas %S.%f is equivalent to %S in arrow for unit="us"
         tsa = pa.array(ts, type=pa.timestamp("us", timezone))
         options = pc.StrftimeOptions("%S")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
+        expected = pa.array(ts.strftime("%S.%f"))
         assert result.equals(expected)
 
         # Test setting locale
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         options = pc.StrftimeOptions(fmt, locale="C")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+        expected = pa.array(ts.strftime(fmt))
         assert result.equals(expected)
 
     # Test timestamps without timezone
     fmt = "%Y-%m-%dT%H:%M:%S"
     ts = pd.to_datetime(times)
     tsa = pa.array(ts, type=pa.timestamp("s"))
     result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
-    expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
+    expected = pa.array(ts.strftime(fmt))
+
     # Positional format
     assert pc.strftime(tsa, fmt) == result
 
@@ -1956,8 +1949,6 @@ def _check_datetime_components(timestamps, timezone=None):
 
 @pytest.mark.pandas
 def test_extract_datetime_components():
-    from pyarrow.vendored.version import Version
-
     timestamps = ["1970-01-01T00:00:59.123456789",
                   "2000-02-29T23:23:23.999999999",
                   "2033-05-18T03:33:20.000000000",
@@ -1983,8 +1974,6 @@ def test_extract_datetime_components():
     if sys.platform == 'win32':
         # TODO: We should test on windows once ARROW-13168 is resolved.
         pytest.skip('Timezone database is not available on Windows yet')
-    elif Version(pd.__version__) < Version('1.0.0'):
-        pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
     else:
         for timezone in timezones:
             _check_datetime_components(timestamps, timezone)
@@ -1995,8 +1984,6 @@ def test_extract_datetime_components():
 @pytest.mark.skipif(sys.platform == 'win32',
                     reason="Timezone database is not available on Windows yet")
 def test_assume_timezone():
-    from pyarrow.vendored.version import Version
-
     ts_type = pa.timestamp("ns")
     timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
                                  "2000-02-29T23:23:23.999999999",
@@ -2040,31 +2027,29 @@ def test_assume_timezone():
 
     timezone = "Europe/Brussels"
 
-    # nonexistent parameter was introduced in Pandas 0.24.0
-    if Version(pd.__version__) >= Version("0.24.0"):
-        options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
-        options_nonexistent_earliest = pc.AssumeTimezoneOptions(
-            timezone, ambiguous="raise", nonexistent="earliest")
-        options_nonexistent_latest = pc.AssumeTimezoneOptions(
-            timezone, ambiguous="raise", nonexistent="latest")
-
-        with pytest.raises(ValueError,
-                           match="Timestamp doesn't exist in "
-                                 f"timezone '{timezone}'"):
-            pc.assume_timezone(nonexistent_array,
-                               options=options_nonexistent_raise)
-
-        expected = pa.array(nonexistent.tz_localize(
-            timezone, nonexistent="shift_forward"))
-        result = pc.assume_timezone(
-            nonexistent_array, options=options_nonexistent_latest)
-        expected.equals(result)
-
-        expected = pa.array(nonexistent.tz_localize(
-            timezone, nonexistent="shift_backward"))
-        result = pc.assume_timezone(
-            nonexistent_array, options=options_nonexistent_earliest)
-        expected.equals(result)
+    options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
+    options_nonexistent_earliest = pc.AssumeTimezoneOptions(
+        timezone, ambiguous="raise", nonexistent="earliest")
+    options_nonexistent_latest = pc.AssumeTimezoneOptions(
+        timezone, ambiguous="raise", nonexistent="latest")
+
+    with pytest.raises(ValueError,
+                       match="Timestamp doesn't exist in "
+                       f"timezone '{timezone}'"):
+        pc.assume_timezone(nonexistent_array,
+                           options=options_nonexistent_raise)
+
+    expected = pa.array(nonexistent.tz_localize(
+        timezone, nonexistent="shift_forward"))
+    result = pc.assume_timezone(
+        nonexistent_array, options=options_nonexistent_latest)
+    expected.equals(result)
+
+    expected = pa.array(nonexistent.tz_localize(
+        timezone, nonexistent="shift_backward"))
+    result = pc.assume_timezone(
+        nonexistent_array, options=options_nonexistent_earliest)
+    expected.equals(result)
 
     options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
     options_ambiguous_latest = pc.AssumeTimezoneOptions(
@@ -2199,11 +2184,6 @@ def _check_temporal_rounding(ts, values, unit):
                                   "second", "minute", "hour", "day"))
 @pytest.mark.pandas
 def test_round_temporal(unit):
-    from pyarrow.vendored.version import Version
-
-    if Version(pd.__version__) < Version('1.0.0'):
-        pytest.skip('Pandas < 1.0 rounds differently.')
-
     values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750)
     timestamps = [
         "1923-07-07 08:52:35.203790336",
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py