179 changes: 14 additions & 165 deletions bigframes/_config/__init__.py
@@ -17,175 +17,24 @@
DataFrames from this package.
"""

from __future__ import annotations

import copy
from dataclasses import dataclass, field
import threading
from typing import Optional

import bigframes_vendored.pandas._config.config as pandas_config

import bigframes._config.bigquery_options as bigquery_options
import bigframes._config.compute_options as compute_options
import bigframes._config.display_options as display_options
import bigframes._config.experiment_options as experiment_options
import bigframes._config.sampling_options as sampling_options


@dataclass
class ThreadLocalConfig(threading.local):
# If unset, global settings will be used
bigquery_options: Optional[bigquery_options.BigQueryOptions] = None
# Note: use default factory instead of default instance so each thread initializes to default values
display_options: display_options.DisplayOptions = field(
default_factory=display_options.DisplayOptions
)
sampling_options: sampling_options.SamplingOptions = field(
default_factory=sampling_options.SamplingOptions
)
compute_options: compute_options.ComputeOptions = field(
default_factory=compute_options.ComputeOptions
)
experiment_options: experiment_options.ExperimentOptions = field(
default_factory=experiment_options.ExperimentOptions
)


class Options:
"""Global options affecting BigQuery DataFrames behavior."""

def __init__(self):
self.reset()

def reset(self) -> Options:
"""Reset the option settings to defaults.

Returns:
bigframes._config.Options: Options object with default values.
"""
self._local = ThreadLocalConfig()

# BigQuery options are special because they can only be set once per
# session, so we need an indicator as to whether we are using the
# thread-local session or the global session.
self._bigquery_options = bigquery_options.BigQueryOptions()
return self

def _init_bigquery_thread_local(self):
"""Initialize thread-local options, based on current global options."""

# Already thread-local, so don't reset any options that have been set
# already. No locks needed since this only modifies thread-local
# variables.
if self._local.bigquery_options is not None:
return

self._local.bigquery_options = copy.deepcopy(self._bigquery_options)
self._local.bigquery_options._session_started = False

@property
def bigquery(self) -> bigquery_options.BigQueryOptions:
"""Options to use with the BigQuery engine.

Returns:
bigframes._config.bigquery_options.BigQueryOptions:
Options for BigQuery engine.
"""
if self._local.bigquery_options is not None:
# The only way we can get here is if someone called
# _init_bigquery_thread_local.
return self._local.bigquery_options

return self._bigquery_options

@property
def display(self) -> display_options.DisplayOptions:
"""Options controlling object representation.

Returns:
bigframes._config.display_options.DisplayOptions:
Options for controlling object representation.
"""
return self._local.display_options

@property
def sampling(self) -> sampling_options.SamplingOptions:
"""Options controlling downsampling when downloading data
to memory.

The data can be downloaded into memory explicitly
(e.g., to_pandas, to_numpy, values) or implicitly (e.g.,
matplotlib plotting). This option can be overridden by
parameters in specific functions.

Returns:
bigframes._config.sampling_options.SamplingOptions:
Options for controlling downsampling.
"""
return self._local.sampling_options

@property
def compute(self) -> compute_options.ComputeOptions:
"""Thread-local options controlling object computation.

Returns:
bigframes._config.compute_options.ComputeOptions:
Thread-local options for controlling object computation
"""
return self._local.compute_options

@property
def experiments(self) -> experiment_options.ExperimentOptions:
"""Options controlling experiments

Returns:
bigframes._config.experiment_options.ExperimentOptions:
Thread-local options for controlling experiments
"""
return self._local.experiment_options

@property
def is_bigquery_thread_local(self) -> bool:
"""Indicator that we're using a thread-local session.

A thread-local session can be started by using
`with bigframes.option_context("bigquery.some_option", "some-value"):`.

Returns:
bool:
A boolean value, where a value is True if a thread-local session
is in use; otherwise False.
"""
return self._local.bigquery_options is not None

@property
def _allow_large_results(self) -> bool:
"""The effective 'allow_large_results' setting.

This value is `self.compute.allow_large_results` if set (not `None`),
otherwise it defaults to `self.bigquery.allow_large_results`.

Returns:
bool:
Whether large query results are permitted.
- `True`: The BigQuery result size limit (e.g., 10 GB) is removed.
- `False`: Results are restricted to this limit (potentially faster).
BigQuery will raise an error if this limit is exceeded.
"""
if self.compute.allow_large_results is None:
return self.bigquery.allow_large_results
return self.compute.allow_large_results


options = Options()
"""Global options for default session."""

option_context = pandas_config.option_context
from bigframes._config.bigquery_options import BigQueryOptions
from bigframes._config.compute_options import ComputeOptions
from bigframes._config.display_options import DisplayOptions
from bigframes._config.experiment_options import ExperimentOptions
from bigframes._config.global_options import option_context, Options
import bigframes._config.global_options as global_options
from bigframes._config.sampling_options import SamplingOptions

options = global_options.options
"""Global options for the default session."""

__all__ = (
"Options",
"options",
"option_context",
"BigQueryOptions",
"ComputeOptions",
"DisplayOptions",
"ExperimentOptions",
"SamplingOptions",
)
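
This refactor moves `Options` and `option_context` into `bigframes._config.global_options` while keeping the public surface of `bigframes._config` unchanged. A minimal sketch of the thread-local behavior the moved code implements, assuming the existing `bigquery.location` option (the option name is illustrative; any `bigquery.*` option takes the same path):

```python
import bigframes

# No thread-local BigQuery options exist until something calls
# _init_bigquery_thread_local, e.g. an option_context touching a
# "bigquery.*" option.
assert not bigframes.options.is_bigquery_thread_local

with bigframes.option_context("bigquery.location", "US"):
    # BigQueryOptions were deep-copied into thread-local storage, so
    # this change is invisible to other threads.
    assert bigframes.options.is_bigquery_thread_local

# On exit, option_context restores the option's previous value.
```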
144 changes: 94 additions & 50 deletions bigframes/_config/compute_options.py
@@ -29,7 +29,7 @@ class ComputeOptions:
>>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

>>> bpd.options.compute.maximum_bytes_billed = 500
>>> # df.to_pandas() # this should fail
>>> df.to_pandas() # this should fail # doctest: +SKIP
google.api_core.exceptions.InternalServerError: 500 Query exceeded limit for bytes billed: 500. 10485760 or higher required.

>>> bpd.options.compute.maximum_bytes_billed = None # reset option
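
The updated doctest above marks the failing call with `# doctest: +SKIP` so the suite no longer executes a query that is guaranteed to error. As a hedged sketch of handling that failure in user code (the exception type and message are taken from the doctest output above):

```python
import google.api_core.exceptions

import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
bpd.options.compute.maximum_bytes_billed = 500  # below the required minimum

try:
    df.to_pandas()
except google.api_core.exceptions.InternalServerError as exc:
    # "Query exceeded limit for bytes billed: 500. 10485760 or higher
    # required."
    print(exc)
finally:
    bpd.options.compute.maximum_bytes_billed = None  # reset option
```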
@@ -53,68 +53,112 @@ class ComputeOptions:
>>> del bpd.options.compute.extra_query_labels["test1"]
>>> bpd.options.compute.extra_query_labels
{'test2': 'abc', 'test3': False}

Attributes:
ai_ops_confirmation_threshold (int | None):
Guards against unexpected processing of a large number of rows by semantic operators.
If the number of rows exceeds the threshold, the user will be asked to confirm
their operations to resume. The default value is 0. Set the value to None
to turn off the guard.

ai_ops_threshold_autofail (bool):
Guards against unexpected processing of a large number of rows by semantic operators.
When set to True, the operation automatically fails without asking for user inputs.

allow_large_results (bool | None):
Specifies whether query results can exceed 10 GB. Defaults to False. Setting this
to False (the default) restricts results to 10 GB for potentially faster execution;
BigQuery will raise an error if this limit is exceeded. Setting to True removes
this result size limit.

enable_multi_query_execution (bool | None):
If enabled, large queries may be factored into multiple smaller queries
in order to avoid generating queries that are too complex for the query
engine to handle. However, this can increase cost and latency.

extra_query_labels (Dict[str, Any] | None):
Stores additional custom labels for query configuration.

maximum_bytes_billed (int | None):
Limits the bytes billed for query jobs. Queries that will have
bytes billed beyond this limit will fail (without incurring a
charge). If unspecified, this will be set to your project default.
See `maximum_bytes_billed`: https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed.

maximum_result_rows (int | None):
Limits the number of rows in an execution result. When converting
a BigQuery DataFrames object to a pandas DataFrame or Series (e.g.,
using ``.to_pandas()``, ``.peek()``, ``.__repr__()``, direct
iteration), the data is downloaded from BigQuery to the client
machine. This option restricts the number of rows that can be
downloaded. If the number of rows to be downloaded exceeds this
limit, a ``bigframes.exceptions.MaximumResultRowsExceeded``
exception is raised.

semantic_ops_confirmation_threshold (int | None):
.. deprecated:: 1.42.0
Semantic operators are deprecated. Please use AI operators instead

semantic_ops_threshold_autofail (bool):
.. deprecated:: 1.42.0
Semantic operators are deprecated. Please use AI operators instead
"""

ai_ops_confirmation_threshold: Optional[int] = 0
"""
Guards against unexpected processing of a large number of rows by semantic operators.

If the number of rows exceeds the threshold, the user is asked to confirm
the operation before it resumes. The default value is 0. Set the value to
None to turn off the guard.

Returns:
Optional[int]: Number of rows.
"""

ai_ops_threshold_autofail: bool = False
"""
Guards against unexpected processing of a large number of rows by semantic operators.

When set to True, the operation automatically fails without asking for user input.

Returns:
bool: True if the guard is enabled.
"""

allow_large_results: Optional[bool] = None
"""
Specifies whether query results can exceed 10 GB.

Defaults to None, in which case the value of ``bigquery.allow_large_results``
is used. Setting this to False restricts results to 10 GB for potentially
faster execution; BigQuery will raise an error if this limit is exceeded.
Setting to True removes this result size limit.

Returns:
bool | None: True if results larger than 10 GB are allowed.
"""
enable_multi_query_execution: bool = False
"""
If enabled, large queries may be factored into multiple smaller queries.

This avoids generating queries that are too complex for the query engine
to handle. However, it can increase cost and latency.

Returns:
bool: True if enabled.
"""

extra_query_labels: Dict[str, Any] = dataclasses.field(
default_factory=dict, init=False
)
"""
Stores additional custom labels for query configuration.

Returns:
Dict[str, Any]: Additional labels.
"""

maximum_bytes_billed: Optional[int] = None
"""
Limits the bytes billed for query jobs.

Queries that will have bytes billed beyond this limit will fail (without
incurring a charge). If unspecified, this will be set to your project
default. See `maximum_bytes_billed`:
https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJobConfig#google_cloud_bigquery_job_QueryJobConfig_maximum_bytes_billed.

Returns:
int | None: Number of bytes, if set.
"""

maximum_result_rows: Optional[int] = None
"""
Limits the number of rows in an execution result.

When converting a BigQuery DataFrames object to a pandas DataFrame or Series
(e.g., using ``.to_pandas()``, ``.peek()``, ``.__repr__()``, direct
iteration), the data is downloaded from BigQuery to the client machine. This
option restricts the number of rows that can be downloaded. If the number
of rows to be downloaded exceeds this limit, a
``bigframes.exceptions.MaximumResultRowsExceeded`` exception is raised.

Returns:
int | None: Number of rows, if set.
"""

semantic_ops_confirmation_threshold: Optional[int] = 0
"""
Deprecated.

.. deprecated:: 1.42.0
Semantic operators are deprecated. Please use the functions in
:mod:`bigframes.bigquery.ai` instead.

"""

semantic_ops_threshold_autofail: bool = False
"""
Deprecated.

.. deprecated:: 1.42.0
Semantic operators are deprecated. Please use the functions in
:mod:`bigframes.bigquery.ai` instead.

"""

def assign_extra_query_labels(self, **kwargs: Any) -> None:
"""
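
Taken together, the per-field docstrings above all describe knobs on `bpd.options.compute`. A short usage sketch under stated assumptions (the label keys/values and the row cap are illustrative, not defaults):

```python
import bigframes.pandas as bpd

# Cap client-side downloads; exceeding the cap raises
# bigframes.exceptions.MaximumResultRowsExceeded.
bpd.options.compute.maximum_result_rows = 10_000

# Attach custom labels to the underlying BigQuery jobs, e.g. for cost
# attribution, via the method whose definition begins above.
bpd.options.compute.assign_extra_query_labels(team="data-eng", env="dev")

# Leaving allow_large_results as None defers to the session-wide
# bpd.options.bigquery.allow_large_results setting.
bpd.options.compute.allow_large_results = None
```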