288 changes: 69 additions & 219 deletions bigframes/session/__init__.py
@@ -18,7 +18,6 @@

import copy
import datetime
import itertools
import logging
import os
import re
@@ -43,7 +42,6 @@
# Even though the ibis.backends.bigquery import is unused, it's needed
# to register new and replacement ops with the Ibis BigQuery backend.
import bigframes_vendored.ibis.backends.bigquery # noqa
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet
import bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers
@@ -62,7 +60,6 @@
import google.cloud.storage as storage # type: ignore
import ibis
import ibis.backends.bigquery as ibis_bigquery
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.types as ibis_types
import numpy as np
import pandas
@@ -80,7 +77,6 @@
import bigframes.core as core
import bigframes.core.blocks as blocks
import bigframes.core.compile
import bigframes.core.guid as guid
import bigframes.core.nodes as nodes
from bigframes.core.ordering import IntegerEncoding
import bigframes.core.ordering as order
@@ -92,6 +88,7 @@
from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf
from bigframes.functions.remote_function import remote_function as bigframes_rf
import bigframes.session._io.bigquery as bigframes_io
import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table
import bigframes.session.clients
import bigframes.version

@@ -692,59 +689,6 @@ def read_gbq_table(
use_cache=use_cache,
)

def _get_snapshot_sql_and_primary_key(
self,
table: google.cloud.bigquery.table.Table,
*,
api_name: str,
use_cache: bool = True,
) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]:
"""Create a read-only Ibis table expression representing a table.

If we can get a total ordering from the table, such as via primary key
column(s), then return those too so that ordering generation can be
avoided.
"""
(
snapshot_timestamp,
table,
) = bigframes_io.get_snapshot_datetime_and_table_metadata(
self.bqclient,
table_ref=table.reference,
api_name=api_name,
cache=self._df_snapshot,
use_cache=use_cache,
)

if table.location.casefold() != self._location.casefold():
raise ValueError(
f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
)

# If there are primary keys defined, the query engine assumes these
# columns are unique, even if the constraint is not enforced. We make
# the same assumption and use these columns as the total ordering keys.
primary_keys = None
if (
(table_constraints := getattr(table, "table_constraints", None)) is not None
and (primary_key := table_constraints.primary_key) is not None
# This is False for both None and an empty list.
# We want primary_keys = None if no primary keys are set.
and (columns := primary_key.columns)
):
primary_keys = columns

try:
table_expression = self.ibis_client.sql(
bigframes_io.create_snapshot_sql(table.reference, snapshot_timestamp)
)
except google.api_core.exceptions.Forbidden as ex:
if "Drive credentials" in ex.message:
ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."
raise

return table_expression, primary_keys
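
For context, the snapshot SQL built by bigframes_io.create_snapshot_sql is a BigQuery time-travel query. A minimal sketch of its expected shape (the real helper's formatting may differ):

import datetime

from google.cloud import bigquery


def create_snapshot_sql_sketch(
    table_ref: bigquery.TableReference,
    snapshot_timestamp: datetime.datetime,
) -> str:
    # FOR SYSTEM_TIME AS OF pins reads to a single point in time, so
    # repeated queries against the same DataFrame see identical data even
    # if the underlying table changes.
    return (
        f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`."
        f"`{table_ref.table_id}` FOR SYSTEM_TIME AS OF "
        f"TIMESTAMP('{snapshot_timestamp.isoformat()}')"
    )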

def _read_gbq_table(
self,
query: str,
@@ -757,95 +701,104 @@ def _read_gbq_table(
) -> dataframe.DataFrame:
import bigframes.dataframe as dataframe

# ---------------------------------
# Validate and transform parameters
# ---------------------------------

if max_results and max_results <= 0:
raise ValueError("`max_results` should be a positive number.")
raise ValueError(
f"`max_results` should be a positive number, got {max_results}."
)

table_ref = bigquery.table.TableReference.from_string(
query, default_project=self.bqclient.project
)

table = self.bqclient.get_table(table_ref)
(table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key(
table, api_name=api_name, use_cache=use_cache
# ---------------------------------
# Fetch table metadata and validate
# ---------------------------------

(time_travel_timestamp, table,) = bf_read_gbq_table.get_table_metadata(
self.bqclient,
table_ref=table_ref,
api_name=api_name,
cache=self._df_snapshot,
use_cache=use_cache,
)
total_ordering_cols = primary_keys

if not index_col and primary_keys is not None:
index_col = primary_keys
if table.location.casefold() != self._location.casefold():
raise ValueError(
f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
)

# -----------------------------------------
# Create Ibis table expression and validate
# -----------------------------------------

# Use a time travel to make sure the DataFrame is deterministic, even
# if the underlying table changes.
table_expression = bf_read_gbq_table.get_ibis_time_travel_table(
self.ibis_client,
table_ref,
time_travel_timestamp,
)

for key in columns:
if key not in table_expression.columns:
raise ValueError(
f"Column '{key}' of `columns` not found in this table."
)

if isinstance(index_col, str):
index_cols: List[str] = [index_col]
else:
index_cols = list(index_col)
# ---------------------------------------
# Create a non-default index and validate
# ---------------------------------------

# TODO(b/337925142): Move index_cols creation to before we create the
# Ibis table expression so we don't have a "SELECT *" subquery in the
# query that checks for index uniqueness.

index_cols, is_index_unique = bf_read_gbq_table.get_index_cols_and_uniqueness(
bqclient=self.bqclient,
ibis_client=self.ibis_client,
table=table,
table_expression=table_expression,
index_col=index_col,
api_name=api_name,
)

for key in index_cols:
if key not in table_expression.columns:
raise ValueError(
f"Column `{key}` of `index_col` not found in this table."
)

# TODO(b/337925142): We should push down column filters when we get the time
# travel table to avoid "SELECT *" subqueries.
if columns:
table_expression = table_expression.select([*index_cols, *columns])

# If the index is unique and sortable, then we don't need to generate
# an ordering column.
ordering = None
if total_ordering_cols is not None:
# Note: currently, a table has a total ordering only when the
# primary key(s) are set on a table. The query engine assumes such
# columns are unique, even if not enforced.
ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
order.ascending_over(column_id) for column_id in total_ordering_cols
),
total_ordering_columns=frozenset(total_ordering_cols),
)
column_values = [table_expression[col] for col in table_expression.columns]
array_value = core.ArrayValue.from_ibis(
self,
table_expression,
columns=column_values,
hidden_ordering_columns=[],
ordering=ordering,
)
# ----------------------------
# Create ordering and validate
# ----------------------------

elif len(index_cols) != 0:
# We have index columns; let's see if those are actually total ordering columns
ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
[order.ascending_over(column_id) for column_id in index_cols]
),
total_ordering_columns=frozenset(index_cols),
)
is_total_ordering = self._check_index_uniqueness(
table_expression, index_cols
if is_index_unique:
array_value = bf_read_gbq_table.to_array_value_with_total_ordering(
session=self,
table_expression=table_expression,
total_ordering_cols=index_cols,
)
if is_total_ordering:
column_values = [
table_expression[col] for col in table_expression.columns
]
array_value = core.ArrayValue.from_ibis(
self,
table_expression,
columns=column_values,
hidden_ordering_columns=[],
ordering=ordering,
)
else:
array_value = self._create_total_ordering(
table_expression, table_rows=table.num_rows
)
else:
array_value = self._create_total_ordering(
table_expression, table_rows=table.num_rows
# Note: Even though we're adding a default ordering here, that's
# just so we have a deterministic total ordering. If the user
# specified a non-unique index, we still sort by that later.
array_value = bf_read_gbq_table.to_array_value_with_default_ordering(
session=self, table=table_expression, table_rows=table.num_rows
)

# ----------------------------------------------------
# Create Block & default index if len(index_cols) == 0
# ----------------------------------------------------

value_columns = [col for col in array_value.column_ids if col not in index_cols]
block = blocks.Block(
array_value,
@@ -862,27 +815,6 @@ def _read_gbq_table(
df.sort_index()
return df
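
The user-visible effect of this rewrite: when the source table defines primary key(s), read_gbq_table uses them as both the default index and a known-unique total ordering, so no hidden ordering columns need to be generated. A usage sketch (table and column names are hypothetical):

import bigframes.pandas as bpd

# If `orders` declares a primary key, those column(s) become the DataFrame
# index and serve as the total ordering.
df = bpd.read_gbq_table("my-project.my_dataset.orders")

# An explicit, possibly non-unique index still works: uniqueness is
# checked, and a default deterministic ordering is added if needed.
df2 = bpd.read_gbq_table(
    "my-project.my_dataset.orders", index_col="customer_id"
)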

def _check_index_uniqueness(
self, table: ibis_types.Table, index_cols: List[str]
) -> bool:
distinct_table = table.select(*index_cols).distinct()
is_unique_sql = f"""WITH full_table AS (
{self.ibis_client.compile(table)}
),
distinct_table AS (
{self.ibis_client.compile(distinct_table)}
)

SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
(SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
"""
results, _ = self._start_query(is_unique_sql)
row = next(iter(results))

total_count = row["total_count"]
distinct_count = row["distinct_count"]
return total_count == distinct_count
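
For reference, this check reduces to comparing a total row count against a distinct row count over the candidate index columns. Hand-written for a two-column index, an equivalent query could look like the following (a sketch; table and column names are placeholders):

# The index columns are unique iff the number of distinct combinations
# equals the total row count.
is_unique_sql = """
SELECT
    COUNT(*) = COUNT(DISTINCT TO_JSON_STRING(STRUCT(col_a, col_b)))
        AS is_unique
FROM `my-project.my_dataset.my_table`
"""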

def _read_bigquery_load_job(
self,
filepath_or_buffer: str | IO["bytes"],
Expand Down Expand Up @@ -1462,66 +1394,6 @@ def _create_empty_temp_table(
)
return bigquery.TableReference.from_string(table)

def _create_total_ordering(
self,
table: ibis_types.Table,
table_rows: Optional[int],
) -> core.ArrayValue:
# Since this might also be used as the index, don't use the default
# "ordering ID" name.

# For small tables, 64 bits is enough to make collisions unlikely; 128 bits makes them vanishingly rare even for very large tables
# Assume table is large if table row count is unknown
use_double_hash = (
(table_rows is None) or (table_rows == 0) or (table_rows > 100000)
)

ordering_hash_part = guid.generate_guid("bigframes_ordering_")
ordering_hash_part2 = guid.generate_guid("bigframes_ordering_")
ordering_rand_part = guid.generate_guid("bigframes_ordering_")

# All inputs into hash must be non-null or resulting hash will be null
str_values = list(
map(lambda col: _convert_to_nonnull_string(table[col]), table.columns)
)
full_row_str = (
str_values[0].concat(*str_values[1:])
if len(str_values) > 1
else str_values[0]
)
full_row_hash = full_row_str.hash().name(ordering_hash_part)
# By modifying value slightly, we get another hash uncorrelated with the first
full_row_hash_p2 = (full_row_str + "_").hash().name(ordering_hash_part2)
# Used to disambiguate between identical rows (which will have identical hash)
random_value = ibis.random().name(ordering_rand_part)

order_values = (
[full_row_hash, full_row_hash_p2, random_value]
if use_double_hash
else [full_row_hash, random_value]
)

original_column_ids = table.columns
table_with_ordering = table.select(
itertools.chain(original_column_ids, order_values)
)

ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
order.ascending_over(col.get_name()) for col in order_values
),
total_ordering_columns=frozenset(col.get_name() for col in order_values),
)
columns = [table_with_ordering[col] for col in original_column_ids]
hidden_columns = [table_with_ordering[col.get_name()] for col in order_values]
return core.ArrayValue.from_ibis(
self,
table_with_ordering,
columns,
hidden_ordering_columns=hidden_columns,
ordering=ordering,
)
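
The 64-vs-128-bit comment above is a birthday-bound argument: hashing n rows into 2**bits buckets collides with probability roughly n*(n-1)/2**(bits+1), treating the two hashes as independent (an approximation, since the second hash reuses the same row string). A quick check of the numbers:

# Approximate birthday-bound collision probability for n rows and a
# hash of the given width.
def collision_probability(n: int, bits: int) -> float:
    return n * (n - 1) / 2 ** (bits + 1)


print(collision_probability(100_000, 64))  # ~2.7e-10: one hash suffices
print(collision_probability(10**9, 64))    # ~0.027: risky for large tables
print(collision_probability(10**9, 128))   # ~1.5e-21: effectively never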

def _ibis_to_temp_table(
self,
table: ibis_types.Table,
@@ -2056,28 +1928,6 @@ def _can_cluster_bq(field: bigquery.SchemaField):
)


def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue:
col_type = column.type()
if (
col_type.is_numeric()
or col_type.is_boolean()
or col_type.is_binary()
or col_type.is_temporal()
):
result = column.cast(ibis_dtypes.String(nullable=True))
elif col_type.is_geospatial():
result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text()
elif col_type.is_string():
result = column
else:
# TO_JSON_STRING works with all data types, but isn't the most efficient
# Needed for JSON, STRUCT and ARRAY datatypes
result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore
# Escape backslashes and use backslash as delineator
escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore
return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)
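
The escaping here keeps column boundaries stable in the concatenated hash input: backslashes inside values are doubled, and each value is prefixed with a lone backslash acting as a delimiter, so values cannot shift across column boundaries. Rows whose hash inputs still tie are disambiguated by the random ordering column above. A small illustration:

def encode(value: str) -> str:
    # Same scheme as _convert_to_nonnull_string: double up backslashes,
    # then prepend a backslash as the field delimiter.
    return "\\" + value.replace("\\", "\\\\")


# Without a delimiter, ("ab", "") and ("a", "b") would both produce the
# hash input "ab"; with it, they stay distinct.
print(encode("ab") + encode(""))  # \ab\
print(encode("a") + encode("b"))  # \a\b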


def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
"""
For backwards-compatibility, convert any previously client-side only