Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 23 additions & 26 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,19 @@ def indicate_duplicates(
if keep == "first":
# Count how many copies occur up to current copy of value
# Discard this value if there are copies BEFORE
window_spec = windows.WindowSpec(
window_spec = windows.cumulative_rows(
grouping_keys=tuple(columns),
following=0,
)
elif keep == "last":
# Count how many copies occur from the current copy of the value onward
# Discard this value if there are copies AFTER
window_spec = windows.WindowSpec(
window_spec = windows.inverse_cumulative_rows(
grouping_keys=tuple(columns),
preceding=0,
)
else: # keep == False
# Count how many copies of the value occur in entire series.
# Discard this value if there are copies ANYWHERE
window_spec = windows.WindowSpec(grouping_keys=tuple(columns))
window_spec = windows.unbound(grouping_keys=tuple(columns))
block, dummy = block.create_constant(1)
block, val_count_col_id = block.apply_window_op(
dummy,
Expand Down Expand Up @@ -114,7 +112,7 @@ def quantile(
dropna: bool = False,
) -> blocks.Block:
# TODO: handle windowing and more interpolation methods
window = core.WindowSpec(
window = windows.unbound(
grouping_keys=tuple(grouping_column_ids),
)
quantile_cols = []
Expand Down Expand Up @@ -212,8 +210,8 @@ def _interpolate_column(
if interpolate_method not in ["linear", "nearest", "ffill"]:
raise ValueError("interpolate method not supported")
window_ordering = (ordering.OrderingExpression(ex.free_var(x_values)),)
backwards_window = windows.WindowSpec(following=0, ordering=window_ordering)
forwards_window = windows.WindowSpec(preceding=0, ordering=window_ordering)
backwards_window = windows.rows(following=0, ordering=window_ordering)
forwards_window = windows.rows(preceding=0, ordering=window_ordering)

# Note, this method may
block, notnull = block.apply_unary_op(column, ops.notnull_op)
Expand Down Expand Up @@ -364,7 +362,7 @@ def value_counts(
)
count_id = agg_ids[0]
if normalize:
unbound_window = windows.WindowSpec()
unbound_window = windows.unbound()
block, total_count_id = block.apply_window_op(
count_id, agg_ops.sum_op, unbound_window
)
Expand All @@ -388,7 +386,7 @@ def value_counts(

def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block:
column_labels = block.column_labels
window_spec = windows.WindowSpec(
window_spec = windows.rows(
preceding=periods if periods > 0 else None,
following=-periods if periods < 0 else None,
)
Expand Down Expand Up @@ -430,23 +428,22 @@ def rank(
ops.isnull_op,
)
nullity_col_ids.append(nullity_col_id)
window = windows.WindowSpec(
# BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first.
ordering=(
ordering.OrderingExpression(
ex.free_var(col),
ordering.OrderingDirection.ASC
if ascending
else ordering.OrderingDirection.DESC,
na_last=(na_option in ["bottom", "keep"]),
),
window_ordering = (
ordering.OrderingExpression(
ex.free_var(col),
ordering.OrderingDirection.ASC
if ascending
else ordering.OrderingDirection.DESC,
na_last=(na_option in ["bottom", "keep"]),
),
)
# Count_op ignores nulls, so if na_option is "top" or "bottom", we instead count the nullity columns, where nulls have been mapped to bools
block, rownum_id = block.apply_window_op(
col if na_option == "keep" else nullity_col_id,
agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op,
window_spec=window,
window_spec=windows.unbound(ordering=window_ordering)
if method == "dense"
else windows.rows(following=0, ordering=window_ordering),
skip_reproject_unsafe=(col != columns[-1]),
)
rownum_col_ids.append(rownum_id)
Expand All @@ -464,7 +461,7 @@ def rank(
block, result_id = block.apply_window_op(
rownum_col_ids[i],
agg_op,
window_spec=windows.WindowSpec(grouping_keys=(columns[i],)),
window_spec=windows.unbound(grouping_keys=(columns[i],)),
skip_reproject_unsafe=(i < (len(columns) - 1)),
)
post_agg_rownum_col_ids.append(result_id)
Expand Down Expand Up @@ -528,7 +525,7 @@ def nsmallest(
block, counter = block.apply_window_op(
column_ids[0],
agg_ops.rank_op,
window_spec=windows.WindowSpec(ordering=tuple(order_refs)),
window_spec=windows.unbound(ordering=tuple(order_refs)),
)
block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n)))
block = block.filter_by_id(condition)
Expand Down Expand Up @@ -558,7 +555,7 @@ def nlargest(
block, counter = block.apply_window_op(
column_ids[0],
agg_ops.rank_op,
window_spec=windows.WindowSpec(ordering=tuple(order_refs)),
window_spec=windows.unbound(ordering=tuple(order_refs)),
)
block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n)))
block = block.filter_by_id(condition)
Expand Down Expand Up @@ -653,7 +650,7 @@ def _mean_delta_to_power(
grouping_column_ids: typing.Sequence[str],
) -> typing.Tuple[blocks.Block, typing.Sequence[str]]:
"""Calculate (x-mean(x))^n. Useful for calculating moment statistics such as skew and kurtosis."""
window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids))
window = windows.unbound(grouping_keys=tuple(grouping_column_ids))
block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window)
delta_ids = []
for val_id, mean_val_id in zip(column_ids, mean_ids):
Expand Down Expand Up @@ -845,7 +842,7 @@ def _idx_extrema(
for idx_col in original_block.index_columns
],
]
window_spec = windows.WindowSpec(ordering=tuple(order_refs))
window_spec = windows.unbound(ordering=tuple(order_refs))
idx_col = original_block.index_columns[0]
block, result_col = block.apply_window_op(
idx_col, agg_ops.first_op, window_spec
Expand Down
7 changes: 4 additions & 3 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import bigframes.core.tree_properties as tree_properties
import bigframes.core.utils
import bigframes.core.utils as utils
import bigframes.core.window_spec as window_specs
import bigframes.dtypes
import bigframes.features
import bigframes.operations as ops
Expand Down Expand Up @@ -807,7 +808,7 @@ def multi_apply_window_op(
self,
columns: typing.Sequence[str],
op: agg_ops.WindowOp,
window_spec: core.WindowSpec,
window_spec: window_specs.WindowSpec,
*,
skip_null_groups: bool = False,
never_skip_nulls: bool = False,
Expand Down Expand Up @@ -866,7 +867,7 @@ def apply_window_op(
self,
column: str,
op: agg_ops.WindowOp,
window_spec: core.WindowSpec,
window_spec: window_specs.WindowSpec,
*,
result_label: Label = None,
skip_null_groups: bool = False,
Expand Down Expand Up @@ -2020,7 +2021,7 @@ def _is_monotonic(
return self._stats_cache[column_name][op_name]

period = 1
window = bigframes.core.WindowSpec(
window = window_specs.rows(
preceding=period,
following=None,
)
Expand Down
42 changes: 30 additions & 12 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
OrderingExpression,
)
import bigframes.core.schema as schemata
from bigframes.core.window_spec import WindowSpec
from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec
import bigframes.dtypes
import bigframes.operations.aggregations as agg_ops

Expand Down Expand Up @@ -735,7 +735,9 @@ def project_window_op(
skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection
"""
column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name))
window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties)
window = self._ibis_window_from_spec(
window_spec, require_total_order=op.uses_total_row_ordering
)
bindings = {col: self._get_ibis_column(col) for col in self.column_ids}

window_op = agg_compiler.compile_analytic(
Expand Down Expand Up @@ -1162,7 +1164,9 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn:
def _compile_expression(self, expr: ex.Expression):
return op_compiler.compile_expression(expr, self._ibis_bindings)

def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False):
def _ibis_window_from_spec(
self, window_spec: WindowSpec, require_total_order: bool
):
group_by: typing.List[ibis_types.Value] = (
[
typing.cast(
Expand All @@ -1175,26 +1179,40 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
)
if self._reduced_predicate is not None:
group_by.append(self._reduced_predicate)

# Construct ordering. There are basically 3 main cases
# 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed
# 2. Order-independent op (aggregation, cut, rank) with range window - use ordering clause, ties allowed
# 3. Order-dependent op (navigation functions, array_agg) or rows bounds - use total row order to break ties.
if window_spec.ordering:
order_by = _convert_ordering_to_table_values(
{**self._column_names, **self._hidden_ordering_column_names},
window_spec.ordering,
)
if not allow_ties:
# Most operator need an unambiguous ordering, so the table's total ordering is appended
if require_total_order or isinstance(window_spec.bounds, RowsWindowBounds):
# Some operators need an unambiguous ordering, so the table's total ordering is appended
order_by = tuple([*order_by, *self._ibis_order])
elif (window_spec.following is not None) or (window_spec.preceding is not None):
elif isinstance(window_spec.bounds, RowsWindowBounds):
# If window spec has following or preceding bounds, we need to apply an unambiguous ordering.
order_by = tuple(self._ibis_order)
else:
# Unbound grouping window. Suitable for aggregations but not for analytic function application.
order_by = None
return ibis.window(
preceding=window_spec.preceding,
following=window_spec.following,
order_by=order_by,
group_by=group_by,
)

bounds = window_spec.bounds
window = ibis.window(order_by=order_by, group_by=group_by)
if bounds is not None:
if isinstance(bounds, RangeWindowBounds):
window = window.preceding_following(
bounds.preceding, bounds.following, how="range"
)
if isinstance(bounds, RowsWindowBounds):
window = window.preceding_following(
bounds.preceding, bounds.following, how="rows"
)
else:
raise ValueError(f"unrecognized window bounds {bounds}")
return window

class Builder:
def __init__(
Expand Down
27 changes: 13 additions & 14 deletions bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.core.window as windows
import bigframes.core.window_spec as window_specs
import bigframes.dataframe as df
import bigframes.dtypes as dtypes
import bigframes.operations.aggregations as agg_ops
Expand Down Expand Up @@ -217,15 +218,15 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame:
return self._apply_window_op(agg_ops.product_op, numeric_only=True)

def shift(self, periods=1) -> series.Series:
window = core.WindowSpec(
window = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=periods if periods > 0 else None,
following=-periods if periods < 0 else None,
)
return self._apply_window_op(agg_ops.ShiftOp(periods), window=window)

def diff(self, periods=1) -> series.Series:
window = core.WindowSpec(
window = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=periods if periods > 0 else None,
following=-periods if periods < 0 else None,
Expand All @@ -234,7 +235,7 @@ def diff(self, periods=1) -> series.Series:

def rolling(self, window: int, min_periods=None) -> windows.Window:
# To get n size window, need current row and n-1 preceding rows.
window_spec = core.WindowSpec(
window_spec = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=window - 1,
following=0,
Expand All @@ -248,9 +249,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
)

def expanding(self, min_periods: int = 1) -> windows.Window:
window_spec = core.WindowSpec(
window_spec = window_specs.cumulative_rows(
grouping_keys=tuple(self._by_col_ids),
following=0,
min_periods=min_periods,
)
block = self._block.order_by(
Expand Down Expand Up @@ -424,8 +424,8 @@ def _apply_window_op(
numeric_only: bool = False,
):
"""Apply window op to groupby. Defaults to grouped cumulative window."""
window_spec = window or core.WindowSpec(
grouping_keys=tuple(self._by_col_ids), following=0
window_spec = window or window_specs.cumulative_rows(
grouping_keys=tuple(self._by_col_ids)
)
columns = self._aggregated_columns(numeric_only=numeric_only)
block, result_ids = self._block.multi_apply_window_op(
Expand Down Expand Up @@ -594,15 +594,15 @@ def cumcount(self, *args, **kwargs) -> series.Series:

def shift(self, periods=1) -> series.Series:
"""Shift index by desired number of periods."""
window = core.WindowSpec(
window = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=periods if periods > 0 else None,
following=-periods if periods < 0 else None,
)
return self._apply_window_op(agg_ops.ShiftOp(periods), window=window)

def diff(self, periods=1) -> series.Series:
window = core.WindowSpec(
window = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=periods if periods > 0 else None,
following=-periods if periods < 0 else None,
Expand All @@ -611,7 +611,7 @@ def diff(self, periods=1) -> series.Series:

def rolling(self, window: int, min_periods=None) -> windows.Window:
# To get n size window, need current row and n-1 preceding rows.
window_spec = core.WindowSpec(
window_spec = window_specs.rows(
grouping_keys=tuple(self._by_col_ids),
preceding=window - 1,
following=0,
Expand All @@ -629,9 +629,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
)

def expanding(self, min_periods: int = 1) -> windows.Window:
window_spec = core.WindowSpec(
window_spec = window_specs.cumulative_rows(
grouping_keys=tuple(self._by_col_ids),
following=0,
min_periods=min_periods,
)
block = self._block.order_by(
Expand Down Expand Up @@ -661,8 +660,8 @@ def _apply_window_op(
window: typing.Optional[core.WindowSpec] = None,
):
"""Apply window op to groupby. Defaults to grouped cumulative window."""
window_spec = window or core.WindowSpec(
grouping_keys=tuple(self._by_col_ids), following=0
window_spec = window or window_specs.cumulative_rows(
grouping_keys=tuple(self._by_col_ids)
)

label = self._value_name if not discard_name else None
Expand Down
6 changes: 3 additions & 3 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
import pandas as pd

import bigframes.constants as constants
import bigframes.core as core
import bigframes.core.expression as ex
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.core.window_spec as window_specs
import bigframes.dataframe
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
Expand Down Expand Up @@ -159,7 +159,7 @@ def cut(
)

return x._apply_window_op(
agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec()
agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound()
)


Expand Down Expand Up @@ -189,7 +189,7 @@ def qcut(
block, result = block.apply_window_op(
x._value_column,
agg_ops.QcutOp(q), # type: ignore
window_spec=core.WindowSpec(
window_spec=window_specs.unbound(
grouping_keys=(nullity_id,),
ordering=(order.ascending_over(x._value_column),),
),
Expand Down
Loading