14 changes: 13 additions & 1 deletion bigframes/functions/remote_function.py
@@ -348,6 +348,7 @@ def create_cloud_function(
package_requirements=None,
timeout_seconds=600,
max_instance_count=None,
vpc_connector=None,
):
"""Create a cloud function from the given user defined function."""

@@ -426,6 +427,8 @@ def create_cloud_function(
function.service_config.timeout_seconds = timeout_seconds
if max_instance_count is not None:
function.service_config.max_instance_count = max_instance_count
if vpc_connector is not None:
function.service_config.vpc_connector = vpc_connector
function.service_config.service_account_email = (
self._cloud_function_service_account
)
@@ -474,6 +477,7 @@ def provision_bq_remote_function(
max_batching_rows,
cloud_function_timeout,
cloud_function_max_instance_count,
cloud_function_vpc_connector,
):
"""Provision a BigQuery remote function."""
# If reuse of any existing function with the same name (indicated by the
@@ -500,6 +504,7 @@ def provision_bq_remote_function(
package_requirements,
cloud_function_timeout,
cloud_function_max_instance_count,
cloud_function_vpc_connector,
)
else:
logger.info(f"Cloud function {cloud_function_name} already exists.")
@@ -655,6 +660,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
"""Decorator to turn a user defined function into a BigQuery remote function.

@@ -798,7 +804,12 @@ def remote_function(
control the spike in the billing. Higher setting can help
support processing larger scale data. When not specified, cloud
function's default setting applies. For more details see
https://cloud.google.com/functions/docs/configuring/max-instances
https://cloud.google.com/functions/docs/configuring/max-instances.
cloud_function_vpc_connector (str, Optional):
The VPC connector you would like to configure for your cloud
function. This is useful if your code needs access to data or
            services on a VPC network. For more details, see
https://cloud.google.com/functions/docs/networking/connecting-vpc.
"""
if isinstance(input_types, type):
input_types = [input_types]
@@ -928,6 +939,7 @@ def wrapper(f):
max_batching_rows,
cloud_function_timeout,
cloud_function_max_instances,
cloud_function_vpc_connector,
)

# TODO: Move ibis logic to compiler step
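For orientation, here is a minimal sketch of how the cloud_function_vpc_connector parameter added above might be used once this change is available. The project and connector names are hypothetical, and the connector is assumed to already exist in the same region as the cloud function (see the gcloud set-up notes in the test further down).

import bigframes.pandas as bpd

# Hypothetical connector, created ahead of time with something like:
#   gcloud compute networks vpc-access connectors create my-connector \
#       --project=my-project --region=us-central1 --range=10.8.0.0/28
GCF_VPC_CONNECTOR = "my-connector"

@bpd.remote_function(
    [int],
    int,
    reuse=False,
    cloud_function_vpc_connector=GCF_VPC_CONNECTOR,
)
def square_num(x):
    # Runs inside the deployed cloud function, which can now reach data and
    # services on the VPC network through the configured connector.
    if x is None:
        return x
    return x * x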
2 changes: 2 additions & 0 deletions bigframes/pandas/__init__.py
@@ -653,6 +653,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
return global_session.with_default_session(
bigframes.session.Session.remote_function,
@@ -669,6 +670,7 @@ def remote_function(
max_batching_rows=max_batching_rows,
cloud_function_timeout=cloud_function_timeout,
cloud_function_max_instances=cloud_function_max_instances,
cloud_function_vpc_connector=cloud_function_vpc_connector,
)


14 changes: 13 additions & 1 deletion bigframes/session/__init__.py
@@ -934,6 +934,11 @@ def read_pandas(
The pandas DataFrame will be persisted as a temporary BigQuery table, which can be
automatically recycled after the Session is closed.

.. note::
Data is inlined in the query SQL if it is small enough (roughly 5MB
or less in memory). Larger size data is loaded to a BigQuery table
instead.

**Examples:**

>>> import bigframes.pandas as bpd
@@ -1467,6 +1472,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
"""Decorator to turn a user defined function into a BigQuery remote function. Check out
the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.
@@ -1588,7 +1594,12 @@ def remote_function(
control the spike in the billing. Higher setting can help
support processing larger scale data. When not specified, cloud
function's default setting applies. For more details see
https://cloud.google.com/functions/docs/configuring/max-instances
https://cloud.google.com/functions/docs/configuring/max-instances.
cloud_function_vpc_connector (str, Optional):
The VPC connector you would like to configure for your cloud
function. This is useful if your code needs access to data or
                services on a VPC network. For more details, see
https://cloud.google.com/functions/docs/networking/connecting-vpc.
Returns:
callable: A remote function object pointing to the cloud assets created
in the background to support the remote execution. The cloud assets can be
@@ -1613,6 +1624,7 @@ def remote_function(
max_batching_rows=max_batching_rows,
cloud_function_timeout=cloud_function_timeout,
cloud_function_max_instances=cloud_function_max_instances,
cloud_function_vpc_connector=cloud_function_vpc_connector,
)

def read_gbq_function(
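As a quick illustration of the read_pandas inlining note added above, a rough sketch of the two paths; the frame sizes are only meant to land on either side of the roughly 5MB threshold the note mentions.

import pandas as pd
import bigframes.pandas as bpd

# Small frame: well under ~5MB in memory, so it is inlined into the query SQL.
small = bpd.read_pandas(pd.DataFrame({"x": range(10)}))

# Much larger frame: loaded to a temporary BigQuery table behind the scenes
# instead of being inlined.
large = bpd.read_pandas(pd.DataFrame({"x": range(2_000_000)}))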
75 changes: 74 additions & 1 deletion tests/system/large/test_remote_function.py
@@ -20,7 +20,7 @@
import tempfile
import textwrap

from google.api_core.exceptions import BadRequest, NotFound
from google.api_core.exceptions import BadRequest, InvalidArgument, NotFound
from google.cloud import bigquery, storage
import pandas
import pytest
@@ -1331,6 +1331,79 @@ def square_num(x):
)


@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_via_session_vpc(scalars_dfs):
# TODO(shobs): Automate the following set-up during testing in the test project.
#
    # For upfront convenience, the following setup has been statically created
    # in the project bigframes-dev-perf via the cloud console:
#
# 1. Create a vpc connector as per
# https://cloud.google.com/vpc/docs/configure-serverless-vpc-access#gcloud
#
# $ gcloud compute networks vpc-access connectors create bigframes-vpc --project=bigframes-dev-perf --region=us-central1 --range 10.8.0.0/28
# Create request issued for: [bigframes-vpc]
# Waiting for operation [projects/bigframes-dev-perf/locations/us-central1/operations/f9f90df6-7cf4-4420-8c2f-b3952775dcfb] to complete...done.
# Created connector [bigframes-vpc].
#
# $ gcloud compute networks vpc-access connectors list --project=bigframes-dev-perf --region=us-central1
# CONNECTOR_ID REGION NETWORK IP_CIDR_RANGE SUBNET SUBNET_PROJECT MACHINE_TYPE MIN_INSTANCES MAX_INSTANCES MIN_THROUGHPUT MAX_THROUGHPUT STATE
# bigframes-vpc us-central1 default 10.8.0.0/28 e2-micro 2 10 200 1000 READY

project = "bigframes-dev-perf"
gcf_vpc_connector = "bigframes-vpc"

rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project))

try:

def square_num(x):
if x is None:
return x
return x * x

square_num_remote = rf_session.remote_function(
[int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector
)(square_num)

scalars_df, scalars_pandas_df = scalars_dfs

bf_int64_col = scalars_df["int64_col"]
bf_result_col = bf_int64_col.apply(square_num_remote)
bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas()

pd_int64_col = scalars_pandas_df["int64_col"]
pd_result_col = pd_int64_col.apply(square_num)
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)

assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)

# Assert that the GCF is created with the intended vpc connector
gcf = rf_session.cloudfunctionsclient.get_function(
name=square_num_remote.bigframes_cloud_function
)
assert gcf.service_config.vpc_connector == gcf_vpc_connector
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
rf_session.bqclient, rf_session.cloudfunctionsclient, square_num_remote
)


def test_remote_function_via_session_vpc_invalid(session):
with pytest.raises(
InvalidArgument, match="400.*Serverless VPC Access connector is not found"
):

@session.remote_function(
[int], int, reuse=False, cloud_function_vpc_connector="does-not-exist"
)
def square_num(x):
if x is None:
return x
return x * x


@pytest.mark.parametrize(
("max_batching_rows"),
[
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/pandas/io/parquet.py
@@ -19,6 +19,11 @@ def read_parquet(
Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-"bigquery" engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
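To illustrate the read_parquet note above, a hedged sketch of the two paths; the file paths are hypothetical, and the "bigquery" engine value is assumed from the wording of the docstring.

import bigframes.pandas as bpd

# Non-"bigquery" engine: the file is read client-side; small data is inlined
# into the query SQL, larger data is loaded to a BigQuery table instead.
df_small = bpd.read_parquet("data/small.parquet")

# "bigquery" engine: BigQuery loads the file directly, so client-side
# inlining never comes into play.
df_large = bpd.read_parquet("gs://my-bucket/large.parquet", engine="bigquery")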
10 changes: 10 additions & 0 deletions third_party/bigframes_vendored/pandas/io/parsers/readers.py
@@ -62,6 +62,11 @@ def read_csv(
file. Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-bigquery engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
@@ -167,6 +172,11 @@ def read_json(
file. Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-bigquery engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
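The same behavior is documented above for read_csv and read_json; a minimal sketch under the same assumptions, with hypothetical paths.

import bigframes.pandas as bpd

# Default (non-"bigquery") engine: the CSV is parsed client-side; small data
# is inlined into the query SQL, larger data goes to a BigQuery table.
df_local = bpd.read_csv("data/small.csv")

# "bigquery" engine: the file is loaded by a BigQuery load job instead, so
# nothing is inlined regardless of size.
df_bq = bpd.read_csv("gs://my-bucket/large.csv", engine="bigquery")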
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/pandas/io/pickle.py
@@ -25,6 +25,11 @@ def read_pickle(
If the content of the pickle file is a Series and its name attribute is None,
the name will be set to '0' by default.

.. note::
Data is inlined in the query SQL if it is small enough (roughly 5MB
or less in memory). Larger size data is loaded to a BigQuery table
instead.

**Examples:**

>>> import bigframes.pandas as bpd