Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,10 +658,14 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
return None


def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
"""Get the supertype of the two types."""
if dtype1 == dtype2:
return dtype1
def lcd_type(*dtypes: Dtype) -> Dtype:
if len(dtypes) < 1:
raise ValueError("at least one dypes should be provided")
if len(dtypes) == 1:
return dtypes[0]
unique_dtypes = set(dtypes)
if len(unique_dtypes) == 1:
return unique_dtypes.pop()
# Implicit conversion currently only supported for numeric types
hierarchy: list[Dtype] = [
pd.BooleanDtype(),
Expand All @@ -670,9 +674,9 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
pd.ArrowDtype(pa.decimal256(76, 38)),
pd.Float64Dtype(),
]
if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
if any([dtype not in hierarchy for dtype in dtypes]):
return None
lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
lcd_index = max([hierarchy.index(dtype) for dtype in dtypes])
return hierarchy[lcd_index]


Expand Down
126 changes: 109 additions & 17 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,12 +2390,27 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods
def test_dataframe_agg_single_string(scalars_dfs):
numeric_cols = ["int64_col", "int64_too", "float64_col"]
scalars_df, scalars_pandas_df = scalars_dfs

bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
pd_result = scalars_pandas_df[numeric_cols].agg("sum")

# Pandas may produce narrower numeric types, but bigframes always produces Float64
pd_result = pd_result.astype("Float64")
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
assert bf_result.dtype == "Float64"
pd.testing.assert_series_equal(
pd_result, bf_result, check_dtype=False, check_index_type=False
)


def test_dataframe_agg_int_single_string(scalars_dfs):
numeric_cols = ["int64_col", "int64_too", "bool_col"]
scalars_df, scalars_pandas_df = scalars_dfs

bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
pd_result = scalars_pandas_df[numeric_cols].agg("sum")

assert bf_result.dtype == "Int64"
pd.testing.assert_series_equal(
pd_result, bf_result, check_dtype=False, check_index_type=False
)


def test_dataframe_agg_multi_string(scalars_dfs):
Expand Down Expand Up @@ -2431,6 +2446,27 @@ def test_dataframe_agg_multi_string(scalars_dfs):
).all()


def test_dataframe_agg_int_multi_string(scalars_dfs):
numeric_cols = ["int64_col", "int64_too", "bool_col"]
aggregations = [
"sum",
"nunique",
"count",
]
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas()
pd_result = scalars_pandas_df[numeric_cols].agg(aggregations)

for dtype in bf_result.dtypes:
assert dtype == "Int64"

# Pandas may produce narrower numeric types
# Pandas has object index type
pd.testing.assert_frame_equal(
pd_result, bf_result, check_dtype=False, check_index_type=False
)


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
Expand Down Expand Up @@ -2959,6 +2995,58 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99


@pytest.mark.parametrize(
("col", "op"),
[
# Int aggregates
pytest.param("int64_col", lambda x: x.sum(), id="int-sum"),
pytest.param("int64_col", lambda x: x.min(), id="int-min"),
pytest.param("int64_col", lambda x: x.max(), id="int-max"),
pytest.param("int64_col", lambda x: x.count(), id="int-count"),
pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"),
# Float aggregates
pytest.param("float64_col", lambda x: x.count(), id="float-count"),
pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"),
# Bool aggregates
pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"),
pytest.param("bool_col", lambda x: x.count(), id="bool-count"),
pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"),
# String aggregates
pytest.param("string_col", lambda x: x.count(), id="string-count"),
pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"),
],
)
def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op):
bf_result = op(scalars_df_index[[col]]).to_pandas()
pd_result = op(scalars_pandas_df_index[[col]])

# Check dtype separately
assert bf_result.dtype == "Int64"

# Pandas may produce narrower numeric types
# Pandas has object index type
assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@pytest.mark.parametrize(
("col", "op"),
[
pytest.param("bool_col", lambda x: x.min(), id="bool-min"),
pytest.param("bool_col", lambda x: x.max(), id="bool-max"),
],
)
def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op):
bf_result = op(scalars_df_index[[col]]).to_pandas()
pd_result = op(scalars_pandas_df_index[[col]])

# Check dtype separately
assert bf_result.dtype == "boolean"

# Pandas may produce narrower numeric types
# Pandas has object index type
assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@pytest.mark.parametrize(
("ordered"),
[
Expand All @@ -2967,34 +3055,38 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
],
)
@pytest.mark.parametrize(
("op"),
("op", "bf_dtype"),
[
(lambda x: x.sum(numeric_only=True)),
(lambda x: x.mean(numeric_only=True)),
(lambda x: x.min(numeric_only=True)),
(lambda x: x.max(numeric_only=True)),
(lambda x: x.std(numeric_only=True)),
(lambda x: x.var(numeric_only=True)),
(lambda x: x.count(numeric_only=False)),
(lambda x: x.nunique()),
(lambda x: x.sum(numeric_only=True), "Float64"),
(lambda x: x.mean(numeric_only=True), "Float64"),
(lambda x: x.min(numeric_only=True), "Float64"),
(lambda x: x.max(numeric_only=True), "Float64"),
(lambda x: x.std(numeric_only=True), "Float64"),
(lambda x: x.var(numeric_only=True), "Float64"),
(lambda x: x.count(numeric_only=False), "Int64"),
(lambda x: x.nunique(), "Int64"),
],
ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"],
)
def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered):
def test_dataframe_aggregates(
scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered
):
col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"]
bf_series = op(scalars_df_index[col_names])
pd_series = op(scalars_pandas_df_index[col_names])
bf_result = bf_series.to_pandas(ordered=ordered)
pd_result = op(scalars_pandas_df_index[col_names])

# Check dtype separately
assert bf_result.dtype == bf_dtype

# Pandas may produce narrower numeric types, but bigframes always produces Float64
# Pandas has object index type
pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow"))
assert_series_equal(
pd_series,
pd_result,
bf_result,
check_dtype=False,
check_index_type=False,
ignore_order=not ordered,
check_dtype=False,
)


Expand Down