forked from googleapis/python-bigquery-dataframes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtile.py
More file actions
129 lines (112 loc) · 4.51 KB
/
Copy pathtile.py
File metadata and controls
129 lines (112 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing
from typing import Iterable, Optional, Union
import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
import pandas as pd
import bigframes.core.expression as ex
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.core.window_spec as window_specs
import bigframes.dataframe
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.series
def cut(
    x: bigframes.series.Series,
    bins: Union[
        int,
        pd.IntervalIndex,
        Iterable,
    ],
    *,
    labels: Union[Iterable[str], bool, None] = None,
) -> bigframes.series.Series:
    # NOTE: `cut.__doc__` is assigned from the vendored pandas module right
    # after this definition, so the contract is summarized in comments rather
    # than in a docstring that would be overwritten.
    #
    # Bin the values of `x` by applying a `CutOp` over an unbound window.
    #
    # Args:
    #   x: series whose values are binned.
    #   bins: a positive int, a `pd.IntervalIndex`, an iterable of
    #     `(left, right)` tuples, or an iterable of at least two numeric
    #     break points. One-shot iterators/generators are accepted.
    #   labels: only `None` or `False` are supported.
    #
    # Raises:
    #   ValueError: non-positive int `bins`, empty iterable, fewer than two
    #     numeric breaks, unsupported element type, or overlapping intervals.
    #   NotImplementedError: `labels` is anything other than `None`/`False`.
    if isinstance(bins, int) and bins <= 0:
        raise ValueError("`bins` should be a positive integer.")

    if isinstance(bins, Iterable):
        if isinstance(bins, pd.IntervalIndex):
            as_index: pd.IntervalIndex = bins
            bins = tuple(
                (interval.left.item(), interval.right.item()) for interval in bins
            )
        else:
            # Materialize exactly once: `bins` may be a one-shot iterator, and
            # re-listing it would yield an empty list on the second pass.
            bins_list = list(bins)
            if not bins_list:
                raise ValueError("`bins` iterable should have at least one item")

            first = bins_list[0]
            if isinstance(first, tuple):
                as_index = pd.IntervalIndex.from_tuples(bins_list)
                bins = tuple(bins_list)
            elif pd.api.types.is_number(first):
                if len(bins_list) < 2:
                    raise ValueError(
                        "`bins` iterable of numeric breaks should have"
                        " at least two items"
                    )
                as_index = pd.IntervalIndex.from_breaks(bins_list)
                # Keep the breaks' own type when they all share it; otherwise
                # normalize mixed numerics to float.
                first_type = type(first)
                single_type = all(isinstance(n, first_type) for n in bins_list)
                numeric_type = first_type if single_type else float
                # Consecutive breaks form the (left, right) edges of each bin.
                bins = tuple(
                    (numeric_type(left), numeric_type(right))
                    for left, right in zip(bins_list, bins_list[1:])
                )
            else:
                raise ValueError("`bins` iterable should contain tuples or numerics")

        if as_index.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    if labels is not None and labels is not False:
        raise NotImplementedError(
            "The 'labels' parameter must be either False or None. "
            "Please provide a valid value for 'labels'."
        )

    return x._apply_window_op(
        agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound()
    )
# Reuse the docstring of the vendored pandas `cut` as this function's docstring.
cut.__doc__ = vendored_pandas_tile.cut.__doc__
def qcut(
    x: bigframes.series.Series,
    q: typing.Union[int, typing.Sequence[float]],
    *,
    labels: Optional[bool] = None,
    duplicates: typing.Literal["drop", "error"] = "error",
) -> bigframes.series.Series:
    # NOTE: `qcut.__doc__` is assigned from the vendored pandas module right
    # after this definition, so the contract is summarized in comments.
    #
    # Apply a `QcutOp` window operation to the values of `x`, ordered
    # ascending and partitioned by whether each value is non-null.
    #
    # Args:
    #   x: series to discretize.
    #   q: a positive int, or a list-like (converted to a tuple).
    #   labels: only `False` is supported.
    #   duplicates: only "drop" is supported.
    #
    # Raises:
    #   ValueError: `q` is a non-positive int.
    #   NotImplementedError: `labels` is not `False`, or `duplicates` is not
    #     "drop".
    if isinstance(q, int):
        if q <= 0:
            raise ValueError("`q` should be a positive integer.")

    if utils.is_list_like(q):
        q = tuple(q)

    if labels is not False:
        raise NotImplementedError(
            f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )

    if duplicates != "drop":
        raise NotImplementedError(
            f"Only duplicates='drop' is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )

    value_col = x._value_column
    source_block = x._block
    series_label = source_block.col_id_to_label[value_col]

    # Flag non-null rows; the flag is used as the window's grouping key so
    # nulls and non-nulls are processed in separate partitions.
    with_flag, notnull_col = source_block.apply_unary_op(value_col, ops.notnull_op)

    quantile_op = agg_ops.QcutOp(q)  # type: ignore
    partitioned_window = window_specs.unbound(
        grouping_keys=(notnull_col,),
        ordering=(order.ascending_over(value_col),),
    )
    with_buckets, bucket_col = with_flag.apply_window_op(
        value_col,
        quantile_op,
        window_spec=partitioned_window,
    )

    # NOTE(review): presumably keeps the bucket where the input is non-null
    # and yields NULL otherwise -- confirm against `where_op` semantics.
    masked_expr = ops.where_op.as_expr(bucket_col, notnull_col, ex.const(None))
    final_block, output_col = with_buckets.project_expr(masked_expr, label=series_label)

    return bigframes.series.Series(final_block.select_column(output_col))
# Reuse the docstring of the vendored pandas `qcut` as this function's docstring.
qcut.__doc__ = vendored_pandas_tile.qcut.__doc__