forked from googleapis/python-bigquery-dataframes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path__init__.py
More file actions
472 lines (420 loc) · 17.5 KB
/
Copy path__init__.py
File metadata and controls
472 lines (420 loc) · 17.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass
import functools
import io
import typing
from typing import Iterable, Sequence
import ibis.expr.types as ibis_types
import pandas
import pyarrow as pa
import pyarrow.feather as pa_feather
import bigframes.core.compile as compiling
import bigframes.core.expression as ex
import bigframes.core.guid
import bigframes.core.join_def as join_def
import bigframes.core.local_data as local_data
import bigframes.core.nodes as nodes
from bigframes.core.ordering import OrderingExpression
import bigframes.core.ordering as orderings
import bigframes.core.rewrite
import bigframes.core.schema as schemata
import bigframes.core.utils
from bigframes.core.window_spec import WindowSpec
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.session._io.bigquery
if typing.TYPE_CHECKING:
from bigframes.session import Session
# Column id string reserved for an ordering id column.
# NOTE(review): not referenced anywhere in this module; presumably imported by
# other bigframes modules -- confirm before renaming or removing.
ORDER_ID_COLUMN = "bigframes_ordering_id"
# Column id string reserved for a predicate column.
# NOTE(review): also unused in this module -- confirm external usage.
PREDICATE_COLUMN = "bigframes_predicate"
@dataclass(frozen=True)
class ArrayValue:
    """
    ArrayValue is an immutable type representing a 2D array with per-column types.

    Every operation returns a new ``ArrayValue`` wrapping a new node; the
    instance the operation is called on is never modified.
    """

    # Node describing how this array is produced. Operations below wrap it as
    # the child of a new node.
    node: nodes.BigFrameNode

    @classmethod
    def from_ibis(
        cls,
        session: Session,
        table: ibis_types.Table,
        columns: Sequence[ibis_types.Value],
        hidden_ordering_columns: Sequence[ibis_types.Value],
        ordering: orderings.ExpressionOrdering,
    ) -> ArrayValue:
        """Create an ArrayValue backed by a ``ReadGbqNode`` over an ibis table.

        Args:
            session: Session recorded on the node as ``table_session``.
            table: The ibis table expression to read from.
            columns: Value columns; each one is passed through
                ``bigframes.dtypes.ibis_value_to_canonical_type``.
            hidden_ordering_columns: Ordering columns stored on the node
                separately from the value columns.
            ordering: The ordering associated with the table.
        """
        canonical_columns = tuple(
            bigframes.dtypes.ibis_value_to_canonical_type(column)
            for column in columns
        )
        node = nodes.ReadGbqNode(
            table=table,
            table_session=session,
            columns=canonical_columns,
            hidden_ordering_columns=tuple(hidden_ordering_columns),
            ordering=ordering,
        )
        return cls(node)

    @classmethod
    def from_pyarrow(cls, arrow_table: pa.Table, session: Session) -> ArrayValue:
        """Create an ArrayValue backed by a ``ReadLocalNode`` holding local data.

        The table is first adapted with ``local_data.adapt_pa_table`` and then
        serialized to Feather bytes in memory; the node stores those bytes
        together with the schema derived from the adapted table.
        """
        adapted_table = local_data.adapt_pa_table(arrow_table)
        schema = local_data.arrow_schema_to_bigframes(adapted_table.schema)

        feather_buffer = io.BytesIO()
        pa_feather.write_feather(adapted_table, feather_buffer)
        node = nodes.ReadLocalNode(
            feather_buffer.getvalue(),
            data_schema=schema,
            session=session,
        )
        return cls(node)

    @property
    def column_ids(self) -> typing.Sequence[str]:
        """Ids of the value columns, taken from ``self.schema.names``."""
        return self.schema.names

    @property
    def session(self) -> Session:
        """Session of the underlying node, or the global session if it has none."""
        required_session = self.node.session
        if required_session is not None:
            return required_session
        # Imported lazily and only on the fallback path, so the common case
        # does not touch the top-level ``bigframes`` package at all.
        from bigframes import get_global_session

        return get_global_session()

    @functools.cached_property
    def schema(self) -> schemata.ArraySchema:
        """Schema reported by the underlying node (cached per instance)."""
        return self.node.schema

    @functools.cached_property
    def _compiled_schema(self) -> schemata.ArraySchema:
        """Schema rebuilt from the compiled unordered IR (cached per instance)."""
        compiled = self._compile_unordered()
        items = tuple(
            schemata.SchemaItem(col_id, compiled.get_column_type(col_id))
            for col_id in compiled.column_ids
        )
        return schemata.ArraySchema(items)

    def _try_evaluate_local(self):
        """Use only for unit testing paths - not fully featured. Will throw exception if fails."""
        import ibis

        backend = ibis.pandas.connect({})
        return backend.execute(
            self._compile_ordered()._to_ibis_expr(ordering_mode="unordered")
        )

    def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
        """Return the dtype that ``self.schema`` reports for column ``key``."""
        return self.schema.get_type(key)

    def _compile_ordered(self) -> compiling.OrderedIR:
        """Compile the node tree with ``compiling.compile_ordered_ir``."""
        return compiling.compile_ordered_ir(self.node)

    def _compile_unordered(self) -> compiling.UnorderedIR:
        """Compile the node tree with ``compiling.compile_unordered_ir``."""
        return compiling.compile_unordered_ir(self.node)

    def row_count(self) -> ArrayValue:
        """Get number of rows in ArrayValue as a single-entry ArrayValue."""
        return ArrayValue(nodes.RowCountNode(child=self.node))

    # Operations
    def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
        """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.

        Args:
            predicate_id: Id of the column used as the predicate.
            keep_null: When True, null predicate values are filled with
                ``True`` before filtering.
        """
        predicate: ex.Expression = ex.free_var(predicate_id)
        if keep_null:
            predicate = ops.fillna_op.as_expr(predicate, ex.const(True))
        return self.filter(predicate)

    def filter(self, predicate: ex.Expression) -> ArrayValue:
        """Wrap this array in a ``FilterNode`` using ``predicate``."""
        return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate))

    def order_by(self, by: Sequence[OrderingExpression]) -> ArrayValue:
        """Wrap this array in an ``OrderByNode`` with the given ordering expressions."""
        return ArrayValue(nodes.OrderByNode(child=self.node, by=tuple(by)))

    def reversed(self) -> ArrayValue:
        """Wrap this array in a ``ReversedNode``."""
        return ArrayValue(nodes.ReversedNode(child=self.node))

    def promote_offsets(self, col_id: str) -> ArrayValue:
        """
        Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
        """
        return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id))

    def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
        """Append together multiple ArrayValue objects."""
        children = (self.node, *(val.node for val in other))
        return ArrayValue(nodes.ConcatNode(children=children))

    def project_to_id(self, expression: ex.Expression, output_id: str) -> ArrayValue:
        """Project ``expression`` into the column ``output_id``.

        If ``output_id`` is already a column, its definition is replaced in
        place and the column order is kept. Otherwise the new column is
        appended after the existing ones. All other columns pass through
        unchanged, and the resulting projection is passed through
        ``merge_projections``.
        """
        if output_id in self.column_ids:  # Mutate case
            exprs = [
                ((expression if (col_id == output_id) else ex.free_var(col_id)), col_id)
                for col_id in self.column_ids
            ]
        else:  # append case
            exprs = [
                *((ex.free_var(col_id), col_id) for col_id in self.column_ids),
                (expression, output_id),
            ]
        return ArrayValue(
            nodes.ProjectionNode(
                child=self.node,
                assignments=tuple(exprs),
            )
        ).merge_projections()

    def assign(self, source_id: str, destination_id: str) -> ArrayValue:
        """Make column ``destination_id`` a reference to column ``source_id``.

        Replaces ``destination_id`` when it already exists, otherwise appends it.
        """
        # Same mutate/append logic as project_to_id, with a column reference
        # as the projected expression.
        return self.project_to_id(ex.free_var(source_id), destination_id)

    def assign_constant(
        self,
        destination_id: str,
        value: typing.Any,
        dtype: typing.Optional[bigframes.dtypes.Dtype],
    ) -> ArrayValue:
        """Set column ``destination_id`` to the constant ``value``.

        Replaces ``destination_id`` when it already exists, otherwise appends it.

        Args:
            destination_id: Id of the column to create or overwrite.
            value: The constant to assign.
            dtype: Dtype of the constant. When ``value`` is NA and no dtype is
                given, ``bigframes.dtypes.DEFAULT_DTYPE`` is used.
        """
        if pandas.isna(value):
            # Need to assign a data type when value is NaN.
            dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE
        return self.project_to_id(ex.const(value, dtype), destination_id)

    def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
        """Project exactly the given columns, in the order given."""
        selections = tuple((ex.free_var(col_id), col_id) for col_id in column_ids)
        return ArrayValue(
            nodes.ProjectionNode(
                child=self.node,
                assignments=selections,
            )
        ).merge_projections()

    def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
        """Project every column except those listed in ``columns``.

        Ids in ``columns`` that are not present are ignored. Remaining columns
        keep their current order.
        """
        # A bare string is itself an Iterable[str]; treat it as one column id
        # rather than matching on substrings/characters.
        if isinstance(columns, str):
            columns = (columns,)
        # Materialize once: a one-shot iterator would otherwise be exhausted by
        # the first membership test, silently keeping columns that should go.
        dropped = frozenset(columns)
        new_projection = tuple(
            (ex.free_var(col_id), col_id)
            for col_id in self.column_ids
            if col_id not in dropped
        )
        return ArrayValue(
            nodes.ProjectionNode(
                child=self.node,
                assignments=new_projection,
            )
        ).merge_projections()

    def aggregate(
        self,
        aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]],
        by_column_ids: typing.Sequence[str] = (),
        dropna: bool = True,
    ) -> ArrayValue:
        """
        Apply aggregations to the expression.

        Arguments:
            aggregations: (aggregation, output_column_id) tuples
            by_column_ids: column ids of the aggregation keys, these are preserved through the transform
            dropna: whether null keys should be dropped
        """
        return ArrayValue(
            nodes.AggregateNode(
                child=self.node,
                aggregations=tuple(aggregations),
                by_column_ids=tuple(by_column_ids),
                dropna=dropna,
            )
        )

    def project_window_op(
        self,
        column_name: str,
        op: agg_ops.UnaryWindowOp,
        window_spec: WindowSpec,
        output_name=None,
        *,
        never_skip_nulls=False,
        skip_reproject_unsafe: bool = False,
    ) -> ArrayValue:
        """
        Creates a new expression based on this expression with unary operation applied to one column.

        Args:
            column_name: the id of the input column present in the expression
            op: the windowable operator to apply to the input column
            window_spec: a specification of the window over which to apply the operator
            output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided
            never_skip_nulls: will disable null skipping for operators that would otherwise do so
            skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection
        """
        return ArrayValue(
            nodes.WindowOpNode(
                child=self.node,
                column_name=column_name,
                op=op,
                window_spec=window_spec,
                output_name=output_name,
                never_skip_nulls=never_skip_nulls,
                skip_reproject_unsafe=skip_reproject_unsafe,
            )
        )

    def _reproject_to_table(self) -> ArrayValue:
        """
        Internal operators that projects the internal representation into a
        new ibis table expression where each value column is a direct
        reference to a column in that table expression. Needed after
        some operations such as window operations that cannot be used
        recursively in projections.
        """
        return ArrayValue(
            nodes.ReprojectOpNode(
                child=self.node,
            )
        )

    def unpivot(
        self,
        row_labels: typing.Sequence[typing.Hashable],
        unpivot_columns: typing.Sequence[
            typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]]
        ],
        *,
        passthrough_columns: typing.Sequence[str] = (),
        index_col_ids: typing.Sequence[str] = ("index",),
        join_side: typing.Literal["left", "right"] = "left",
    ) -> ArrayValue:
        """
        Unpivot ArrayValue columns.

        Args:
            row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument.
            unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None.
            passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
            index_col_ids: The column ids to be used for the row labels.
            join_side: Side of the cross join on which this array is placed; the labels array takes the other side.

        Returns:
            ArrayValue: The unpivoted ArrayValue
        """
        # There will be N labels, used to disambiguate which of N source columns produced each output row
        explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_")
        labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids)
        labels_array = labels_array.promote_offsets(explode_offsets_id)

        # Unpivot creates N output rows for each input row, labels disambiguate these N rows
        joined_array = self._cross_join_w_labels(labels_array, join_side)

        # Build the output rows as a case statement that selects between the N input columns
        unpivot_exprs = []
        # Supports producing multiple stacked output columns for stacking only part of hierarchical index
        for col_id, input_ids in unpivot_columns:
            # row explode offset used to choose the input column
            # we use offset instead of label as labels are not necessarily unique
            cases = tuple(
                (
                    ops.eq_op.as_expr(explode_offsets_id, ex.const(i)),
                    ex.free_var(id_or_null)
                    if (id_or_null is not None)
                    else ex.const(None),
                )
                for i, id_or_null in enumerate(input_ids)
            )
            col_expr = ops.case_when_op.as_expr(*cases)
            unpivot_exprs.append((col_expr, col_id))

        label_exprs = tuple((ex.free_var(label_id), label_id) for label_id in index_col_ids)
        # passthrough columns are unchanged, just repeated N times each
        passthrough_exprs = tuple(
            (ex.free_var(passthrough_id), passthrough_id)
            for passthrough_id in passthrough_columns
        )
        return ArrayValue(
            nodes.ProjectionNode(
                child=joined_array.node,
                assignments=(*label_exprs, *unpivot_exprs, *passthrough_exprs),
            )
        )

    def _cross_join_w_labels(
        self, labels_array: ArrayValue, join_side: typing.Literal["left", "right"]
    ) -> ArrayValue:
        """
        Convert each row in self to N rows, one for each label in labels array.

        ``join_side`` selects which side of the cross join ``self`` occupies;
        ``labels_array`` is placed on the opposite side. All columns of both
        inputs keep their ids in the result.
        """
        table_join_side = (
            join_def.JoinSide.LEFT if join_side == "left" else join_def.JoinSide.RIGHT
        )
        labels_join_side = table_join_side.inverse()
        labels_mappings = tuple(
            join_def.JoinColumnMapping(labels_join_side, col_id, col_id)
            for col_id in labels_array.schema.names
        )
        table_mappings = tuple(
            join_def.JoinColumnMapping(table_join_side, col_id, col_id)
            for col_id in self.schema.names
        )
        join = join_def.JoinDefinition(
            conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross"
        )
        if join_side == "left":
            return self.join(labels_array, join_def=join)
        return labels_array.join(self, join_def=join)

    def _create_unpivot_labels_array(
        self,
        former_column_labels: typing.Sequence[typing.Hashable],
        col_ids: typing.Sequence[str],
    ) -> ArrayValue:
        """Create an ArrayValue from a list of label tuples.

        Each label becomes one row. A label that is not a tuple is treated as
        a 1-tuple. Position ``i`` of the label is stored under ``col_ids[i]``,
        so every label must have at least ``len(col_ids)`` entries.
        """
        rows = []
        for row_label in former_column_labels:
            label_parts = row_label if isinstance(row_label, tuple) else (row_label,)
            # Indexing (rather than zip) keeps the IndexError for labels that
            # are shorter than col_ids instead of silently truncating.
            rows.append({col_id: label_parts[i] for i, col_id in enumerate(col_ids)})
        return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session)

    def join(
        self,
        other: ArrayValue,
        join_def: join_def.JoinDefinition,
        allow_row_identity_join: bool = False,
    ) -> ArrayValue:
        """Join this array (left) with ``other`` (right) according to ``join_def``.

        When ``allow_row_identity_join`` is True the join node is first handed
        to ``bigframes.core.rewrite.maybe_rewrite_join`` and whatever node that
        returns is wrapped instead.
        """
        join_node = nodes.JoinNode(
            left_child=self.node,
            right_child=other.node,
            join=join_def,
            allow_row_identity_join=allow_row_identity_join,
        )
        if allow_row_identity_join:
            return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node))
        return ArrayValue(join_node)

    def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
        """Wrap this array in an ``ExplodeNode`` over the given array-like columns.

        Asserts that ``column_ids`` is non-empty and that every listed column
        has an array-like dtype.
        """
        assert len(column_ids) > 0, "explode requires at least one column id"
        for column_id in column_ids:
            assert bigframes.dtypes.is_array_like(
                self.get_column_type(column_id)
            ), f"column {column_id!r} is not array-like"
        return ArrayValue(
            nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
        )

    def _uniform_sampling(self, fraction: float) -> ArrayValue:
        """Sampling the table on given fraction.

        .. warning::
            The row numbers of result is non-deterministic, avoid to use.
        """
        return ArrayValue(nodes.RandomSampleNode(self.node, fraction))

    def merge_projections(self) -> ArrayValue:
        """Return an ArrayValue over ``maybe_squash_projection(self.node)``."""
        new_node = bigframes.core.rewrite.maybe_squash_projection(self.node)
        return ArrayValue(new_node)