Merged. Changes from all commits.
15 changes: 13 additions & 2 deletions bigframes/core/blocks.py
@@ -102,15 +102,24 @@ class PandasBatches(Iterator[pd.DataFrame]):
"""Interface for mutable objects with state represented by a block value object."""

def __init__(
self, pandas_batches: Iterator[pd.DataFrame], total_rows: Optional[int] = 0
self,
pandas_batches: Iterator[pd.DataFrame],
total_rows: Optional[int] = 0,
*,
total_bytes_processed: Optional[int] = 0,
Review thread on the `total_bytes_processed` default:

Collaborator (author): I wonder if this should default to None, to indicate cases where query execution didn't happen?

Contributor: I think a default of None is probably safest. Ideally, 0 should be a confident statement of zero bytes billed.

Contributor: I think 0 should be used whenever we are sure we didn't burn any query resources, though.

(A sketch of the None-vs-0 distinction discussed here follows this file's diff.)
     ):
         self._dataframes: Iterator[pd.DataFrame] = pandas_batches
         self._total_rows: Optional[int] = total_rows
+        self._total_bytes_processed: Optional[int] = total_bytes_processed

     @property
     def total_rows(self) -> Optional[int]:
         return self._total_rows

+    @property
+    def total_bytes_processed(self) -> Optional[int]:
+        return self._total_bytes_processed
+
     def __next__(self) -> pd.DataFrame:
         return next(self._dataframes)

@@ -721,7 +730,9 @@ def to_pandas_batches(
         if (total_rows is not None) and (max_results is not None):
             total_rows = min(total_rows, max_results)

-        return PandasBatches(dfs, total_rows)
+        return PandasBatches(
+            dfs, total_rows, total_bytes_processed=execute_result.total_bytes_processed
+        )

     def _copy_index_to_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
         """Set the index on pandas DataFrame to match this block."""
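For context on the review thread above, here is a minimal sketch of the None-vs-0 semantics the reviewers propose. The `describe_bytes_processed` helper is hypothetical, not part of bigframes; it only illustrates how a consumer of `PandasBatches.total_bytes_processed` could treat the two values differently:

```python
from typing import Optional


def describe_bytes_processed(total_bytes_processed: Optional[int]) -> str:
    """Hypothetical helper illustrating the proposed semantics."""
    if total_bytes_processed is None:
        # None: no query was executed (or the executor could not report a
        # value), so the cost is unknown rather than known to be zero.
        return "bytes processed unknown (no query execution)"
    if total_bytes_processed == 0:
        # 0: a confident statement that zero bytes were billed, e.g. a
        # cached result or a metadata-only operation.
        return "0 bytes processed"
    return f"{total_bytes_processed:,} bytes processed"
```

A caller could feed `batches.total_bytes_processed` from `to_pandas_batches()` straight into such a helper; the tests later in this PR rely on exactly this distinction (a sorted query reports a positive value, while a result that required no additional query reports None).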
6 changes: 5 additions & 1 deletion bigframes/session/bq_caching_executor.py
@@ -323,7 +323,10 @@ def _export_gbq(
         self.bqclient.update_table(table, ["schema"])

         return executor.ExecuteResult(
-            row_iter.to_arrow_iterable(), array_value.schema, query_job
+            row_iter.to_arrow_iterable(),
+            array_value.schema,
+            query_job,
+            total_bytes_processed=row_iter.total_bytes_processed,
         )

     def dry_run(

@@ -671,6 +674,7 @@ def _execute_plan_gbq(
             query_job=query_job,
             total_bytes=size_bytes,
             total_rows=iterator.total_rows,
+            total_bytes_processed=iterator.total_bytes_processed,
         )


1 change: 1 addition & 0 deletions bigframes/session/direct_gbq_execution.py
@@ -63,6 +63,7 @@ def execute(
             schema=plan.schema,
             query_job=query_job,
             total_rows=iterator.total_rows,
+            total_bytes_processed=iterator.total_bytes_processed,
         )

     def _run_execute_query(
1 change: 1 addition & 0 deletions bigframes/session/executor.py
@@ -45,6 +45,7 @@ class ExecuteResult:
     query_job: Optional[bigquery.QueryJob] = None
     total_bytes: Optional[int] = None
     total_rows: Optional[int] = None
+    total_bytes_processed: Optional[int] = None

     @property
     def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
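The executor changes above all follow the same plumbing pattern: read `total_bytes_processed` off the BigQuery `RowIterator` (the PR uses this property on `row_iter` and `iterator` directly) and thread it into `ExecuteResult`. A minimal standalone sketch of that pattern, with simplified stand-in names (`MiniExecuteResult`, `run_sql`) rather than the actual bigframes classes:

```python
from dataclasses import dataclass
from typing import Iterator, Optional

import pandas as pd
from google.cloud import bigquery


@dataclass
class MiniExecuteResult:
    # Mirrors the new ExecuteResult field: None means "unknown", not "free".
    batches: Iterator[pd.DataFrame]
    total_rows: Optional[int] = None
    total_bytes_processed: Optional[int] = None


def run_sql(client: bigquery.Client, sql: str) -> MiniExecuteResult:
    # RowIterator exposes total_rows and total_bytes_processed once the
    # query completes; either may be None depending on the job.
    row_iter = client.query_and_wait(sql)
    return MiniExecuteResult(
        batches=row_iter.to_dataframe_iterable(),
        total_rows=row_iter.total_rows,
        total_bytes_processed=row_iter.total_bytes_processed,
    )
```

Keeping the field optional end to end lets downstream code distinguish "no query ran" from "a query ran and processed N bytes", which is exactly what the tests below assert.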
9 changes: 5 additions & 4 deletions tests/system/large/test_dataframe_io.py
@@ -48,11 +48,12 @@ def test_to_pandas_batches_override_global_option(
 ):
     with bigframes.option_context(LARGE_TABLE_OPTION, False):
         df = session.read_gbq(WIKIPEDIA_TABLE)
-        pages = list(
-            df.to_pandas_batches(
-                page_size=500, max_results=1500, allow_large_results=True
-            )
+        batches = df.sort_values("id").to_pandas_batches(
+            page_size=500, max_results=1500, allow_large_results=True
         )
+        assert batches.total_rows > 0
+        assert batches.total_bytes_processed > 0
+        pages = list(batches)
         assert all((len(page) <= 500) for page in pages)
         assert sum(len(page) for page in pages) == 1500

3 changes: 3 additions & 0 deletions tests/system/small/session/test_read_gbq_colab.py
@@ -48,6 +48,9 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session):
     batches = df.to_pandas_batches(
         page_size=100,
     )
+    assert batches.total_rows > 0
+    assert batches.total_bytes_processed is None  # No additional query.
+
     executions_after = maybe_ordered_session._metrics.execution_count

     num_batches = 0
7 changes: 7 additions & 0 deletions tests/system/small/test_dataframe_io.py
@@ -339,6 +339,13 @@ def test_to_arrow_override_global_option(scalars_df_index):
     assert scalars_df_index._query_job.destination.table_id == table_id


+def test_to_pandas_batches_populates_total_bytes_processed(scalars_df_default_index):
+    batches = scalars_df_default_index.sort_values(
+        "int64_col"
+    ).to_pandas_batches()  # Do a sort to force query execution.
+    assert batches.total_bytes_processed > 0
+
+
 def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
     """Verify to_pandas_batches() APIs returns the expected dtypes."""
     expected = scalars_df_default_index.dtypes