288 changes: 69 additions & 219 deletions bigframes/session/__init__.py
@@ -18,7 +18,6 @@

import copy
import datetime
import itertools
import logging
import os
import re
@@ -43,7 +42,6 @@
# Even though the ibis.backends.bigquery import is unused, it's needed
# to register new and replacement ops with the Ibis BigQuery backend.
import bigframes_vendored.ibis.backends.bigquery # noqa
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet
import bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers
@@ -62,7 +60,6 @@
import google.cloud.storage as storage # type: ignore
import ibis
import ibis.backends.bigquery as ibis_bigquery
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.types as ibis_types
import numpy as np
import pandas
@@ -80,7 +77,6 @@
import bigframes.core as core
import bigframes.core.blocks as blocks
import bigframes.core.compile
import bigframes.core.guid as guid
import bigframes.core.nodes as nodes
from bigframes.core.ordering import IntegerEncoding
import bigframes.core.ordering as order
@@ -92,6 +88,7 @@
from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf
from bigframes.functions.remote_function import remote_function as bigframes_rf
import bigframes.session._io.bigquery as bigframes_io
import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table
import bigframes.session.clients
import bigframes.version

@@ -692,59 +689,6 @@ def read_gbq_table(
use_cache=use_cache,
)

def _get_snapshot_sql_and_primary_key(
self,
table: google.cloud.bigquery.table.Table,
*,
api_name: str,
use_cache: bool = True,
) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]:
"""Create a read-only Ibis table expression representing a table.

If we can get a total ordering from the table, such as via primary key
column(s), then return those too so that ordering generation can be
avoided.
"""
(
snapshot_timestamp,
table,
) = bigframes_io.get_snapshot_datetime_and_table_metadata(
self.bqclient,
table_ref=table.reference,
api_name=api_name,
cache=self._df_snapshot,
use_cache=use_cache,
)

if table.location.casefold() != self._location.casefold():
raise ValueError(
f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
)

# If there are primary keys defined, the query engine assumes these
# columns are unique, even if the constraint is not enforced. We make
# the same assumption and use these columns as the total ordering keys.
primary_keys = None
if (
(table_constraints := getattr(table, "table_constraints", None)) is not None
and (primary_key := table_constraints.primary_key) is not None
# This is False for both None and an empty list.
# We want primary_keys = None if no primary keys are set.
and (columns := primary_key.columns)
):
primary_keys = columns

try:
table_expression = self.ibis_client.sql(
bigframes_io.create_snapshot_sql(table.reference, snapshot_timestamp)
)
except google.api_core.exceptions.Forbidden as ex:
if "Drive credentials" in ex.message:
ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."
raise

return table_expression, primary_keys
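
For context, the snapshot SQL built by bigframes_io.create_snapshot_sql is a BigQuery time-travel query. A minimal sketch of its expected shape (the real helper's formatting may differ):

import datetime

from google.cloud import bigquery


def create_snapshot_sql_sketch(
    table_ref: bigquery.TableReference,
    snapshot_timestamp: datetime.datetime,
) -> str:
    # FOR SYSTEM_TIME AS OF pins reads to a single point in time, so
    # repeated queries against the same DataFrame see identical data even
    # if the underlying table changes.
    return (
        f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`."
        f"`{table_ref.table_id}` FOR SYSTEM_TIME AS OF "
        f"TIMESTAMP('{snapshot_timestamp.isoformat()}')"
    )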

def _read_gbq_table(
self,
query: str,
@@ -757,95 +701,104 @@ def _read_gbq_table(
) -> dataframe.DataFrame:
import bigframes.dataframe as dataframe

# ---------------------------------
# Validate and transform parameters
# ---------------------------------

if max_results and max_results <= 0:
raise ValueError("`max_results` should be a positive number.")
raise ValueError(
f"`max_results` should be a positive number, got {max_results}."
)

table_ref = bigquery.table.TableReference.from_string(
query, default_project=self.bqclient.project
)

table = self.bqclient.get_table(table_ref)
(table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key(
table, api_name=api_name, use_cache=use_cache
# ---------------------------------
# Fetch table metadata and validate
# ---------------------------------

(time_travel_timestamp, table,) = bf_read_gbq_table.get_table_metadata(
self.bqclient,
table_ref=table_ref,
api_name=api_name,
cache=self._df_snapshot,
use_cache=use_cache,
)
total_ordering_cols = primary_keys

if not index_col and primary_keys is not None:
index_col = primary_keys
if table.location.casefold() != self._location.casefold():
raise ValueError(
f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
)

# -----------------------------------------
# Create Ibis table expression and validate
# -----------------------------------------

# Use a time travel to make sure the DataFrame is deterministic, even
# if the underlying table changes.
table_expression = bf_read_gbq_table.get_ibis_time_travel_table(
self.ibis_client,
table_ref,
time_travel_timestamp,
)

for key in columns:
if key not in table_expression.columns:
raise ValueError(
f"Column '{key}' of `columns` not found in this table."
)

if isinstance(index_col, str):
index_cols: List[str] = [index_col]
else:
index_cols = list(index_col)
# ---------------------------------------
# Create a non-default index and validate
# ---------------------------------------

# TODO(b/337925142): Move index_cols creation to before we create the
# Ibis table expression so we don't have a "SELECT *" subquery in the
# query that checks for index uniqueness.

index_cols, is_index_unique = bf_read_gbq_table.get_index_cols_and_uniqueness(
bqclient=self.bqclient,
ibis_client=self.ibis_client,
table=table,
table_expression=table_expression,
index_col=index_col,
api_name=api_name,
)

for key in index_cols:
if key not in table_expression.columns:
raise ValueError(
f"Column `{key}` of `index_col` not found in this table."
)

# TODO(b/337925142): We should push down column filters when we get the time
# travel table to avoid "SELECT *" subqueries.
if columns:
table_expression = table_expression.select([*index_cols, *columns])

# If the index is unique and sortable, then we don't need to generate
# an ordering column.
ordering = None
if total_ordering_cols is not None:
# Note: currently, a table has a total ordering only when the
# primary key(s) are set on a table. The query engine assumes such
# columns are unique, even if not enforced.
ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
order.ascending_over(column_id) for column_id in total_ordering_cols
),
total_ordering_columns=frozenset(total_ordering_cols),
)
column_values = [table_expression[col] for col in table_expression.columns]
array_value = core.ArrayValue.from_ibis(
self,
table_expression,
columns=column_values,
hidden_ordering_columns=[],
ordering=ordering,
)
# ----------------------------
# Create ordering and validate
# ----------------------------

elif len(index_cols) != 0:
# We have index columns; let's see if those are actually total ordering columns
ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
[order.ascending_over(column_id) for column_id in index_cols]
),
total_ordering_columns=frozenset(index_cols),
)
is_total_ordering = self._check_index_uniqueness(
table_expression, index_cols
if is_index_unique:
array_value = bf_read_gbq_table.to_array_value_with_total_ordering(
session=self,
table_expression=table_expression,
total_ordering_cols=index_cols,
)
if is_total_ordering:
column_values = [
table_expression[col] for col in table_expression.columns
]
array_value = core.ArrayValue.from_ibis(
self,
table_expression,
columns=column_values,
hidden_ordering_columns=[],
ordering=ordering,
)
else:
array_value = self._create_total_ordering(
table_expression, table_rows=table.num_rows
)
else:
array_value = self._create_total_ordering(
table_expression, table_rows=table.num_rows
# Note: Even though we're adding a default ordering here, that's
# just so we have a deterministic total ordering. If the user
# specified a non-unique index, we still sort by that later.
array_value = bf_read_gbq_table.to_array_value_with_default_ordering(
session=self, table=table_expression, table_rows=table.num_rows
)

# ----------------------------------------------------
# Create Block & default index if len(index_cols) == 0
# ----------------------------------------------------

value_columns = [col for col in array_value.column_ids if col not in index_cols]
block = blocks.Block(
array_value,
@@ -862,27 +815,6 @@ def _read_gbq_table(
df.sort_index()
return df
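
The user-visible effect of this rewrite: when the source table defines primary key(s), read_gbq_table uses them as both the default index and a known-unique total ordering, so no hidden ordering columns need to be generated. A usage sketch (table and column names are hypothetical):

import bigframes.pandas as bpd

# If `orders` declares a primary key, those column(s) become the DataFrame
# index and serve as the total ordering.
df = bpd.read_gbq_table("my-project.my_dataset.orders")

# An explicit, possibly non-unique index still works: uniqueness is
# checked, and a default deterministic ordering is added if needed.
df2 = bpd.read_gbq_table(
    "my-project.my_dataset.orders", index_col="customer_id"
)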

def _check_index_uniqueness(
self, table: ibis_types.Table, index_cols: List[str]
) -> bool:
distinct_table = table.select(*index_cols).distinct()
is_unique_sql = f"""WITH full_table AS (
{self.ibis_client.compile(table)}
),
distinct_table AS (
{self.ibis_client.compile(distinct_table)}
)

SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
(SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
"""
results, _ = self._start_query(is_unique_sql)
row = next(iter(results))

total_count = row["total_count"]
distinct_count = row["distinct_count"]
return total_count == distinct_count
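
For reference, this check reduces to comparing a total row count against a distinct row count over the candidate index columns. Hand-written for a two-column index, an equivalent query could look like the following (a sketch; table and column names are placeholders):

# The index columns are unique iff the number of distinct combinations
# equals the total row count.
is_unique_sql = """
SELECT
    COUNT(*) = COUNT(DISTINCT TO_JSON_STRING(STRUCT(col_a, col_b)))
        AS is_unique
FROM `my-project.my_dataset.my_table`
"""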

def _read_bigquery_load_job(
self,
filepath_or_buffer: str | IO["bytes"],
Expand Down Expand Up @@ -1462,66 +1394,6 @@ def _create_empty_temp_table(
)
return bigquery.TableReference.from_string(table)

def _create_total_ordering(
self,
table: ibis_types.Table,
table_rows: Optional[int],
) -> core.ArrayValue:
# Since this might also be used as the index, don't use the default
# "ordering ID" name.

# For small tables, 64 bits is enough to make collisions unlikely; 128 bits makes them vanishingly rare even for very large tables
# Assume table is large if table row count is unknown
use_double_hash = (
(table_rows is None) or (table_rows == 0) or (table_rows > 100000)
)

ordering_hash_part = guid.generate_guid("bigframes_ordering_")
ordering_hash_part2 = guid.generate_guid("bigframes_ordering_")
ordering_rand_part = guid.generate_guid("bigframes_ordering_")

# All inputs into hash must be non-null or resulting hash will be null
str_values = list(
map(lambda col: _convert_to_nonnull_string(table[col]), table.columns)
)
full_row_str = (
str_values[0].concat(*str_values[1:])
if len(str_values) > 1
else str_values[0]
)
full_row_hash = full_row_str.hash().name(ordering_hash_part)
# By modifying value slightly, we get another hash uncorrelated with the first
full_row_hash_p2 = (full_row_str + "_").hash().name(ordering_hash_part2)
# Used to disambiguate between identical rows (which will have identical hash)
random_value = ibis.random().name(ordering_rand_part)

order_values = (
[full_row_hash, full_row_hash_p2, random_value]
if use_double_hash
else [full_row_hash, random_value]
)

original_column_ids = table.columns
table_with_ordering = table.select(
itertools.chain(original_column_ids, order_values)
)

ordering = order.ExpressionOrdering(
ordering_value_columns=tuple(
order.ascending_over(col.get_name()) for col in order_values
),
total_ordering_columns=frozenset(col.get_name() for col in order_values),
)
columns = [table_with_ordering[col] for col in original_column_ids]
hidden_columns = [table_with_ordering[col.get_name()] for col in order_values]
return core.ArrayValue.from_ibis(
self,
table_with_ordering,
columns,
hidden_ordering_columns=hidden_columns,
ordering=ordering,
)
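
The 64-vs-128-bit comment above is a birthday-bound argument: hashing n rows into 2**bits buckets collides with probability roughly n*(n-1)/2**(bits+1), treating the two hashes as independent (an approximation, since the second hash reuses the same row string). A quick check of the numbers:

# Approximate birthday-bound collision probability for n rows and a
# hash of the given width.
def collision_probability(n: int, bits: int) -> float:
    return n * (n - 1) / 2 ** (bits + 1)


print(collision_probability(100_000, 64))  # ~2.7e-10: one hash suffices
print(collision_probability(10**9, 64))    # ~0.027: risky for large tables
print(collision_probability(10**9, 128))   # ~1.5e-21: effectively never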

def _ibis_to_temp_table(
self,
table: ibis_types.Table,
@@ -2056,28 +1928,6 @@ def _can_cluster_bq(field: bigquery.SchemaField):
)


def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue:
col_type = column.type()
if (
col_type.is_numeric()
or col_type.is_boolean()
or col_type.is_binary()
or col_type.is_temporal()
):
result = column.cast(ibis_dtypes.String(nullable=True))
elif col_type.is_geospatial():
result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text()
elif col_type.is_string():
result = column
else:
# TO_JSON_STRING works with all data types, but isn't the most efficient
# Needed for JSON, STRUCT and ARRAY datatypes
result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore
# Escape backslashes and use backslash as delineator
escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore
return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)
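
The escaping here keeps column boundaries stable in the concatenated hash input: backslashes inside values are doubled, and each value is prefixed with a lone backslash acting as a delimiter, so values cannot shift across column boundaries. Rows whose hash inputs still tie are disambiguated by the random ordering column above. A small illustration:

def encode(value: str) -> str:
    # Same scheme as _convert_to_nonnull_string: double up backslashes,
    # then prepend a backslash as the field delimiter.
    return "\\" + value.replace("\\", "\\\\")


# Without a delimiter, ("ab", "") and ("a", "b") would both produce the
# hash input "ab"; with it, they stay distinct.
print(encode("ab") + encode(""))  # \ab\
print(encode("a") + encode("b"))  # \a\b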


def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
"""
For backwards-compatibility, convert any previously client-side only