Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245) #60257

Open
wants to merge 2 commits into
base: 2.3.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 9 additions & 19 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
TYPE_CHECKING,
Callable,
ContextManager,
cast,
)
import warnings

Expand All @@ -23,8 +22,6 @@

from pandas.compat import pa_version_under10p1

from pandas.core.dtypes.common import is_string_dtype

import pandas as pd
from pandas import (
ArrowDtype,
Expand Down Expand Up @@ -83,8 +80,8 @@
with_csv_dialect,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
ExtensionArray,
NumpyExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
Expand All @@ -96,7 +93,6 @@
NpDtype,
)

from pandas.core.arrays import ArrowExtensionArray

UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
Expand Down Expand Up @@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
if isinstance(left, ArrowExtensionArray):
if isinstance(right, ArrowExtensionArray):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left_pa_data = left._pa_array
right_pa_data = right._pa_array
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
return left_buf1.address == right_buf1.address
else:
# if we have one one ArrowExtensionArray and one other array, assume
# they can only share memory if they share the same numpy buffer
return np.shares_memory(left, right)

if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
# By convention, we'll say these share memory if they share *either*
Expand Down
22 changes: 12 additions & 10 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td
Expand Down Expand Up @@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes_infer_objects(using_copy_on_write):
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
Expand All @@ -256,30 +253,35 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
)

if using_copy_on_write:
assert np.shares_memory(get_array(ser), get_array(result))
assert tm.shares_memory(get_array(ser), get_array(result))
else:
assert not np.shares_memory(get_array(ser), get_array(result))

result.iloc[0] = "x"
tm.assert_series_equal(ser, ser_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes(using_copy_on_write):
def test_convert_dtypes(using_copy_on_write, using_infer_string):
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
df_orig = df.copy()
df2 = df.convert_dtypes()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
if using_infer_string and HAS_PYARROW:
# TODO the default nullable string dtype still uses python storage
# this should be changed to pyarrow if installed
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d"))

df2.iloc[0, 0] = "x"
df2.iloc[0, 1] = 10
tm.assert_frame_equal(df, df_orig)
1 change: 0 additions & 1 deletion pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))


# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
Expand Down
38 changes: 21 additions & 17 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import SettingWithCopyWarning

Expand Down Expand Up @@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_infer_objects(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
def test_infer_objects(using_copy_on_write, using_infer_string):
df = DataFrame(
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
)
df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
else:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
Expand All @@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_infer_objects_no_reference(using_copy_on_write):
def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
),
"e": "b",
"e": Series(["z", "w"], dtype=object),
}
)
df = df.infer_objects()
Expand All @@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
df.iloc[0, 3] = Timestamp("2018-12-31")
if using_copy_on_write:
assert np.shares_memory(arr_a, get_array(df, "a"))
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
# note that the underlying memory of arr_b has been copied anyway
# because of the assignment, but the EA is updated inplace so still
# appears the share memory
assert tm.shares_memory(arr_b, get_array(df, "b"))
else:
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
assert np.shares_memory(arr_d, get_array(df, "d"))


def test_infer_objects_reference(using_copy_on_write):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
Expand Down Expand Up @@ -1184,15 +1189,14 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(using_copy_on_write, warn_copy_on_write, decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
df_orig = df.copy()
df2 = df.round(decimals=decimals)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
# TODO: Make inplace by using out parameter of ndarray.round?
if decimals >= 0:
# Ensure lazy copy if no-op
Expand Down
21 changes: 7 additions & 14 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

from pandas import (
Categorical,
DataFrame,
Expand All @@ -14,7 +10,6 @@
from pandas.tests.copy_view.util import get_array


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"replace_kwargs",
[
Expand All @@ -31,15 +26,15 @@
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
Expand All @@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
with tm.assert_cow_warning(warn_copy_on_write):
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
if using_copy_on_write:
assert not np.shares_memory(arr, get_array(df, "a"))
assert not tm.shares_memory(arr, get_array(df, "a"))
assert df._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(arr, get_array(df, "a"))


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_replace_regex_inplace(using_copy_on_write):
df = DataFrame({"a": ["aaa", "bbb"]})
arr = get_array(df, "a")
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))

df_orig = df.copy()
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
tm.assert_frame_equal(df_orig, df)
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))


def test_replace_regex_inplace_no_op(using_copy_on_write):
Expand Down Expand Up @@ -356,14 +350,13 @@ def test_replace_empty_list(using_copy_on_write):
assert not df2._mgr._has_no_reference(0)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
df = DataFrame({"a": ["a", "b", "c"]})
df = DataFrame({"a": ["a", "b", "c"]}, dtype=object)
arr = get_array(df, "a")
df.replace(["c"], value, inplace=True)
if using_copy_on_write or value is None:
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))
else:
# This could be inplace
assert not np.shares_memory(arr, get_array(df, "a"))
Expand Down
Loading