Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH: Add CoW optimization to interpolate
  • Loading branch information
phofl committed Feb 8, 2023
commit d13216120bdf2a8a5e15c20ea7c554cc70b51d9b
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ Copy-on-Write improvements
- :meth:`DataFrame.to_period` / :meth:`Series.to_period`
- :meth:`DataFrame.truncate`
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
- :meth:`DataFrame.ffill` / :meth:`Series.ffill`
- :meth:`DataFrame.bfill` / :meth:`Series.bfill`
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- :func:`concat`

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,10 @@ def _maybe_downcast(
# but ATM it breaks too much existing code.
# split and convert the blocks

return extend_blocks([blk.convert() for blk in blocks])
copy = True if not using_cow else False
return extend_blocks(
[blk.convert(using_cow=using_cow, copy=copy) for blk in blocks]
)

if downcast is None:
return blocks
Expand Down
164 changes: 164 additions & 0 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
NaT,
Series,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
def test_interpolate_no_op(using_copy_on_write, method):
    """Interpolating a frame that has no missing values leaves the input intact.

    The result shares its buffer with the input exactly when
    ``using_copy_on_write`` is set; writing to the result must then detach it.
    """
    frame = DataFrame({"a": [1, 2]})
    expected = frame.copy()

    interpolated = frame.interpolate(method=method)

    # Shared memory is expected if and only if Copy-on-Write is active.
    shared = np.shares_memory(get_array(interpolated, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # Mutate the result; the source frame must not observe the write.
    interpolated.iloc[0, 0] = 100

    if using_copy_on_write:
        still_shared = np.shares_memory(
            get_array(interpolated, "a"), get_array(frame, "a")
        )
        assert not still_shared
    tm.assert_frame_equal(frame, expected)


@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"])
def test_interp_fill_functions(using_copy_on_write, func):
    """The fill helpers should behave like ``interpolate`` on a no-op input.

    Each parametrized method is looked up by name on the frame and called
    without arguments on data that contains no missing values.
    """
    frame = DataFrame({"a": [1, 2]})
    expected = frame.copy()

    fill = getattr(frame, func)
    filled = fill()

    # Shared memory is expected if and only if Copy-on-Write is active.
    shared = np.shares_memory(get_array(filled, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # Mutate the result; the source frame must not observe the write.
    filled.iloc[0, 0] = 100

    if using_copy_on_write:
        still_shared = np.shares_memory(
            get_array(filled, "a"), get_array(frame, "a")
        )
        assert not still_shared
    tm.assert_frame_equal(frame, expected)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_triggers_copy(using_copy_on_write, vals, func):
    """Filling data that contains a missing value must not alias the input."""
    frame = DataFrame({"a": vals})
    filled = getattr(frame, func)()

    filled_values = get_array(filled, "a")
    source_values = get_array(frame, "a")
    assert not np.shares_memory(filled_values, source_values)

    if using_copy_on_write:
        # The freshly copied result must not be tracked as referenced.
        assert filled._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals):
    """In-place interpolation of an unreferenced frame keeps its buffer."""
    frame = DataFrame({"a": vals})
    values_before = get_array(frame, "a")
    frame.interpolate(method="linear", inplace=True)

    values_after = get_array(frame, "a")
    assert np.shares_memory(values_before, values_after)

    if using_copy_on_write:
        # The frame must still report no outstanding references afterwards.
        assert frame._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_with_refs(using_copy_on_write, vals):
    """In-place interpolation while another object views the same data.

    NOTE(review): ``vals`` is parametrized but never used; the frame is always
    built from ``[1, np.nan, 2]``, so both parametrizations run identical data.
    Confirm whether the datetime case was meant to be covered here.
    """
    frame = DataFrame({"a": [1, np.nan, 2]})
    snapshot = frame.copy()
    values_before = get_array(frame, "a")
    alias = frame[:]
    frame.interpolate(method="linear", inplace=True)

    if not using_copy_on_write:
        # Without Copy-on-Write the operation works on the original buffer.
        assert np.shares_memory(values_before, get_array(frame, "a"))
        return

    # Under Copy-on-Write a copy must have been made, the alias must keep the
    # original values, and neither object may hold a reference afterwards.
    assert not np.shares_memory(values_before, get_array(frame, "a"))
    tm.assert_frame_equal(snapshot, alias)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)


def test_interpolate_cleaned_fill_method(using_copy_on_write):
    """Exercise ``interpolate(method="asfreq")`` on an object column.

    Per the original note, this targets the case where the method is set to
    ``None`` internally.
    """
    frame = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
    expected = frame.copy()

    interpolated = frame.interpolate(method="asfreq")

    # Shared memory is expected if and only if Copy-on-Write is active.
    shared = np.shares_memory(get_array(interpolated, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # Mutate the result; the source frame must not observe the write.
    interpolated.iloc[0, 0] = Timestamp("2021-12-31")

    if using_copy_on_write:
        still_shared = np.shares_memory(
            get_array(interpolated, "a"), get_array(frame, "a")
        )
        assert not still_shared
    tm.assert_frame_equal(frame, expected)


def test_interpolate_object_convert_no_op(using_copy_on_write):
    """In-place pad on an all-string object column should keep its buffer."""
    frame = DataFrame({"a": ["a", "b", "c"], "b": 1})
    values_before = get_array(frame, "a")
    frame.interpolate(method="pad", inplace=True)

    # Only checked under Copy-on-Write: per the original note, the
    # non-CoW path makes a copy here even though it should not.
    if not using_copy_on_write:
        return
    assert frame._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(frame, "a"))


def test_interpolate_object_convert_copies(using_copy_on_write):
    """In-place pad on an object column holding integers replaces its buffer."""
    frame = DataFrame({"a": Series([1, 2], dtype=object), "b": 1})
    values_before = get_array(frame, "a")
    frame.interpolate(method="pad", inplace=True)

    # Assertions apply to the Copy-on-Write mode only.
    if not using_copy_on_write:
        return
    assert frame._mgr._has_no_reference(0)
    assert not np.shares_memory(values_before, get_array(frame, "a"))


def test_interpolate_downcast(using_copy_on_write):
    """In-place pad with ``downcast="infer"`` must keep column ``a``'s buffer.

    Parameters
    ----------
    using_copy_on_write : bool
        Fixture telling whether Copy-on-Write mode is active.
    """
    # NOTE(review): presumably the 2.5 entry keeps the column from being
    # downcast to an integer dtype, so no new array is needed -- confirm.
    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    arr_a = get_array(df, "a")
    df.interpolate(method="pad", inplace=True, downcast="infer")

    # Reference tracking is asserted only in Copy-on-Write mode, matching the
    # other tests in this module (the fixture was previously requested but
    # never consulted).
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(arr_a, get_array(df, "a"))


def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write):
    """In-place pad with ``downcast="infer"`` while another object views the data."""
    frame = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    snapshot = frame.copy()
    values_before = get_array(frame, "a")
    alias = frame[:]
    frame.interpolate(method="pad", inplace=True, downcast="infer")

    if not using_copy_on_write:
        # Without Copy-on-Write the alias observes the in-place change.
        tm.assert_frame_equal(frame, alias)
        return

    # Under Copy-on-Write the frame gets its own buffer and the alias keeps
    # the original values.
    assert frame._mgr._has_no_reference(0)
    assert not np.shares_memory(values_before, get_array(frame, "a"))
    tm.assert_frame_equal(snapshot, alias)
108 changes: 0 additions & 108 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
DataFrame,
Index,
MultiIndex,
NaT,
Period,
Series,
Timestamp,
Expand Down Expand Up @@ -1253,113 +1252,6 @@ def test_putmask(using_copy_on_write):
assert view.iloc[0, 0] == 5


@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
def test_interpolate_no_op(using_copy_on_write, method):
    """Interpolation of data without missing values must not touch the input."""
    source = DataFrame({"a": [1, 2]})
    pristine = source.copy()

    out = source.interpolate(method=method)

    # The buffers alias each other only in Copy-on-Write mode.
    aliased = np.shares_memory(get_array(out, "a"), get_array(source, "a"))
    assert aliased == bool(using_copy_on_write)

    # A write to the result has to leave the source untouched.
    out.iloc[0, 0] = 100

    if using_copy_on_write:
        aliased_after_write = np.shares_memory(
            get_array(out, "a"), get_array(source, "a")
        )
        assert not aliased_after_write
    tm.assert_frame_equal(source, pristine)


@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"])
def test_fill_functions(using_copy_on_write, func):
    """The fill helpers are expected to follow the same path as ``interpolate``."""
    source = DataFrame({"a": [1, 2]})
    pristine = source.copy()

    filler = getattr(source, func)
    out = filler()

    # The buffers alias each other only in Copy-on-Write mode.
    aliased = np.shares_memory(get_array(out, "a"), get_array(source, "a"))
    assert aliased == bool(using_copy_on_write)

    # A write to the result has to leave the source untouched.
    out.iloc[0, 0] = 100

    if using_copy_on_write:
        aliased_after_write = np.shares_memory(
            get_array(out, "a"), get_array(source, "a")
        )
        assert not aliased_after_write
    tm.assert_frame_equal(source, pristine)


@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"])
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_triggers_copy(using_copy_on_write, vals, func):
    """Filling data with a missing entry must produce a separate buffer."""
    source = DataFrame({"a": vals})
    out = getattr(source, func)()

    out_values = get_array(out, "a")
    source_values = get_array(source, "a")
    assert not np.shares_memory(out_values, source_values)

    if using_copy_on_write:
        # A fresh copy must not be tracked as referenced.
        assert out._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_no_reference(using_copy_on_write, vals):
    """In-place interpolation without other references reuses the buffer."""
    source = DataFrame({"a": vals})
    buffer_before = get_array(source, "a")
    source.interpolate(method="linear", inplace=True)

    buffer_after = get_array(source, "a")
    assert np.shares_memory(buffer_before, buffer_after)

    if using_copy_on_write:
        # No reference may be left on the block after the operation.
        assert source._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_with_refs(using_copy_on_write, vals):
    """In-place interpolation while a second object views the same data.

    NOTE(review): the parametrized ``vals`` is not used; the frame is always
    built from ``[1, np.nan, 2]``. Confirm whether that is intentional.
    """
    source = DataFrame({"a": [1, np.nan, 2]})
    pristine = source.copy()
    buffer_before = get_array(source, "a")
    alias = source[:]
    source.interpolate(method="linear", inplace=True)

    if not using_copy_on_write:
        # Without Copy-on-Write the original buffer is modified directly.
        assert np.shares_memory(buffer_before, get_array(source, "a"))
        return

    # Copy-on-Write: a copy is made, the alias keeps the old values, and no
    # references remain on either object.
    assert not np.shares_memory(buffer_before, get_array(source, "a"))
    tm.assert_frame_equal(pristine, alias)
    assert source._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)


def test_interpolate_cleaned_fill_method(using_copy_on_write):
    """Cover ``interpolate(method="asfreq")``, the "method is set to None" case."""
    source = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
    pristine = source.copy()

    out = source.interpolate(method="asfreq")

    # The buffers alias each other only in Copy-on-Write mode.
    aliased = np.shares_memory(get_array(out, "a"), get_array(source, "a"))
    assert aliased == bool(using_copy_on_write)

    # A write to the result has to leave the source untouched.
    out.iloc[0, 0] = Timestamp("2021-12-31")

    if using_copy_on_write:
        aliased_after_write = np.shares_memory(
            get_array(out, "a"), get_array(source, "a")
        )
        assert not aliased_after_write
    tm.assert_frame_equal(source, pristine)


def test_asfreq_noop(using_copy_on_write):
df = DataFrame(
{"a": [0.0, None, 2.0, 3.0]},
Expand Down