45 changes: 22 additions & 23 deletions bigframes/core/block_transforms.py
@@ -67,40 +67,39 @@ def indicate_duplicates(
     if keep not in ["first", "last", False]:
         raise ValueError("keep must be one of 'first', 'last', or False")
 
+    rownums = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.RowNumberOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+    count = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.SizeOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+
     if keep == "first":
         # Count how many copies occur up to current copy of value
         # Discard this value if there are copies BEFORE
-        window_spec = windows.cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        predicate = ops.gt_op.as_expr(rownums, ex.const(0))
     elif keep == "last":
         # Count how many copies occur up to current copy of values
         # Discard this value if there are copies AFTER
-        window_spec = windows.inverse_cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        predicate = ops.lt_op.as_expr(rownums, ops.sub_op.as_expr(count, ex.const(1)))
     else:  # keep == False
         # Count how many copies of the value occur in entire series.
         # Discard this value if there are copies ANYWHERE
-        window_spec = windows.unbound(grouping_keys=tuple(columns))
-    block, dummy = block.create_constant(1)
-    # use row number as will work even with partial ordering
-    block, val_count_col_id = block.apply_window_op(
-        dummy,
-        agg_ops.sum_op,
-        window_spec=window_spec,
-    )
-    block, duplicate_indicator = block.project_expr(
-        ops.gt_op.as_expr(val_count_col_id, ex.const(1))
+        predicate = ops.gt_op.as_expr(count, ex.const(1))
+
+    block = block.project_block_exprs(
+        [predicate],
+        labels=[None],
     )
     return (
-        block.drop_columns(
-            (
-                dummy,
-                val_count_col_id,
-            )
-        ),
-        duplicate_indicator,
+        block,
+        block.value_columns[-1],
    )


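The rewrite above expresses all three `keep` modes as one boolean predicate over two group-scoped window values: a 0-based row number and the group size. As a minimal pandas sketch of that predicate logic (an illustration only; bigframes compiles these expressions to SQL window functions, not to pandas):

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "a", "b"]})
    rownums = df.groupby("key").cumcount()              # ROW_NUMBER() per group, 0-based
    count = df.groupby("key")["key"].transform("size")  # COUNT(*) per group

    dup_first = rownums > 0         # keep="first": a copy exists BEFORE this row
    dup_last = rownums < count - 1  # keep="last": a copy exists AFTER this row
    dup_any = count > 1             # keep=False: copies exist ANYWHERE

None of the three predicates needs a cumulative (strictly ordered) window, which is what lets the enforce_ordered guards be dropped in the files below.
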
3 changes: 3 additions & 0 deletions bigframes/core/compile/polars/compiler.py
@@ -547,6 +547,9 @@ def compile_agg_op(
         return pl.col(*inputs).first()
     if isinstance(op, agg_ops.LastOp):
         return pl.col(*inputs).last()
+    if isinstance(op, agg_ops.RowNumberOp):
+        # pl.row_index is not yet stable enough to use here, and only supports polars>=1.32
+        return pl.int_range(pl.len(), dtype=pl.Int64)
     if isinstance(op, agg_ops.ShiftOp):
         return pl.col(*inputs).shift(op.periods)
     if isinstance(op, agg_ops.DiffOp):
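As a quick standalone check of the fallback above (an assumed example, not part of the PR): inside a window, pl.int_range(pl.len()) behaves like a 0-based row number.

    import polars as pl

    df = pl.DataFrame({"grp": ["a", "a", "b", "a"]})
    # int_range over a partition counts 0..n-1 within each group
    out = df.with_columns(rownum=pl.int_range(pl.len(), dtype=pl.Int64).over("grp"))
    print(out["rownum"].to_list())  # [0, 1, 0, 2]
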
2 changes: 0 additions & 2 deletions bigframes/core/indexes/base.py
@@ -626,8 +626,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
         return Index(result)
 
     def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates")
         block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
         return Index(block)

4 changes: 0 additions & 4 deletions bigframes/dataframe.py
@@ -4989,8 +4989,6 @@ def drop_duplicates(
         *,
         keep: str = "first",
     ) -> DataFrame:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
         if subset is None:
             column_ids = self._block.value_columns
         elif utils.is_list_like(subset):
@@ -5004,8 +5002,6 @@ def drop_duplicates(
         return DataFrame(block)
 
     def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
         if subset is None:
             column_ids = self._block.value_columns
         else:
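The guards deleted here (together with the matching ones in Index.drop_duplicates above and the Series methods below) previously forced a strictly ordered session whenever keep was not False. A sketch of the intended user-facing effect, with placeholder project/table names:

    import bigframes.pandas as bpd

    # Under partial ordering, keep="first"/"last" previously raised an
    # ordering error; with the window-based rewrite these calls are
    # expected to execute.
    bpd.options.bigquery.ordering_mode = "partial"
    df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table
    deduped = df.drop_duplicates(subset="bool_col", keep="first")
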
4 changes: 0 additions & 4 deletions bigframes/series.py
@@ -2227,8 +2227,6 @@ def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None)
         return self.reindex(other.index, validate=validate)
 
     def drop_duplicates(self, *, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
         block = block_ops.drop_duplicates(self._block, (self._value_column,), keep)
         return Series(block)
 
@@ -2249,8 +2247,6 @@ def unique(self, keep_order=True) -> Series:
         return Series(block.select_columns(result).reset_index())
 
     def duplicated(self, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
         block, indicator = block_ops.indicate_duplicates(
             self._block, (self._value_column,), keep
         )
24 changes: 24 additions & 0 deletions tests/system/large/test_dataframe.py
@@ -40,3 +40,27 @@ def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
         check_index_type=False,
         check_column_type=False,
     )
+
+
+@pytest.mark.parametrize(
+    ("keep",),
+    [
+        ("first",),
+        ("last",),
+        (False,),
+    ],
+)
+def test_drop_duplicates_unordered(
+    scalars_df_unordered, scalars_pandas_df_default_index, keep
+):
+    uniq_scalar_rows = scalars_df_unordered.drop_duplicates(
+        subset="bool_col", keep=keep
+    )
+    uniq_pd_rows = scalars_pandas_df_default_index.drop_duplicates(
+        subset="bool_col", keep=keep
+    )
+
+    assert len(uniq_scalar_rows) == len(uniq_pd_rows)
+    assert len(uniq_scalar_rows.groupby("bool_col")) == len(
+        uniq_pd_rows.groupby("bool_col")
+    )
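
Because which duplicate survives under keep="first"/"last" is nondeterministic in an unordered frame, the test compares row counts and per-group cardinalities rather than exact frames. One further invariant that could be asserted (a sketch, not part of the PR, assuming groupby().size() here matches pandas semantics):

    # Every surviving "bool_col" value should appear exactly once,
    # regardless of which copy was kept.
    sizes = uniq_scalar_rows.groupby("bool_col").size().to_pandas()
    assert (sizes == 1).all()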