14 changes: 13 additions & 1 deletion bigframes/functions/remote_function.py
@@ -348,6 +348,7 @@ def create_cloud_function(
package_requirements=None,
timeout_seconds=600,
max_instance_count=None,
vpc_connector=None,
):
"""Create a cloud function from the given user defined function."""

@@ -426,6 +427,8 @@ def create_cloud_function(
function.service_config.timeout_seconds = timeout_seconds
if max_instance_count is not None:
function.service_config.max_instance_count = max_instance_count
if vpc_connector is not None:
function.service_config.vpc_connector = vpc_connector
function.service_config.service_account_email = (
self._cloud_function_service_account
)
@@ -474,6 +477,7 @@ def provision_bq_remote_function(
max_batching_rows,
cloud_function_timeout,
cloud_function_max_instance_count,
cloud_function_vpc_connector,
):
"""Provision a BigQuery remote function."""
# If reuse of any existing function with the same name (indicated by the
@@ -500,6 +504,7 @@ def provision_bq_remote_function(
package_requirements,
cloud_function_timeout,
cloud_function_max_instance_count,
cloud_function_vpc_connector,
)
else:
logger.info(f"Cloud function {cloud_function_name} already exists.")
@@ -655,6 +660,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
"""Decorator to turn a user defined function into a BigQuery remote function.

@@ -798,7 +804,12 @@ def remote_function(
control the spike in the billing. Higher setting can help
support processing larger scale data. When not specified, cloud
function's default setting applies. For more details see
https://cloud.google.com/functions/docs/configuring/max-instances
https://cloud.google.com/functions/docs/configuring/max-instances.
cloud_function_vpc_connector (str, Optional):
The VPC connector you would like to configure for your cloud
function. This is useful if your code needs access to data or
            services on a VPC network. For more details, see
https://cloud.google.com/functions/docs/networking/connecting-vpc.
"""
if isinstance(input_types, type):
input_types = [input_types]
@@ -928,6 +939,7 @@ def wrapper(f):
max_batching_rows,
cloud_function_timeout,
cloud_function_max_instances,
cloud_function_vpc_connector,
)

# TODO: Move ibis logic to compiler step
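For orientation, here is a minimal sketch of how the cloud_function_vpc_connector parameter added above might be used once this change is available. The project and connector names are hypothetical, and the connector is assumed to already exist in the same region as the cloud function (see the gcloud set-up notes in the test further down).

import bigframes.pandas as bpd

# Hypothetical connector, created ahead of time with something like:
#   gcloud compute networks vpc-access connectors create my-connector \
#       --project=my-project --region=us-central1 --range=10.8.0.0/28
GCF_VPC_CONNECTOR = "my-connector"

@bpd.remote_function(
    [int],
    int,
    reuse=False,
    cloud_function_vpc_connector=GCF_VPC_CONNECTOR,
)
def square_num(x):
    # Runs inside the deployed cloud function, which can now reach data and
    # services on the VPC network through the configured connector.
    if x is None:
        return x
    return x * x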
2 changes: 2 additions & 0 deletions bigframes/pandas/__init__.py
@@ -653,6 +653,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
return global_session.with_default_session(
bigframes.session.Session.remote_function,
@@ -669,6 +670,7 @@ def remote_function(
max_batching_rows=max_batching_rows,
cloud_function_timeout=cloud_function_timeout,
cloud_function_max_instances=cloud_function_max_instances,
cloud_function_vpc_connector=cloud_function_vpc_connector,
)


14 changes: 13 additions & 1 deletion bigframes/session/__init__.py
@@ -934,6 +934,11 @@ def read_pandas(
The pandas DataFrame will be persisted as a temporary BigQuery table, which can be
automatically recycled after the Session is closed.

.. note::
Data is inlined in the query SQL if it is small enough (roughly 5MB
or less in memory). Larger size data is loaded to a BigQuery table
instead.

**Examples:**

>>> import bigframes.pandas as bpd
@@ -1467,6 +1472,7 @@ def remote_function(
max_batching_rows: Optional[int] = 1000,
cloud_function_timeout: Optional[int] = 600,
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
):
"""Decorator to turn a user defined function into a BigQuery remote function. Check out
the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.
@@ -1588,7 +1594,12 @@ def remote_function(
control the spike in the billing. Higher setting can help
support processing larger scale data. When not specified, cloud
function's default setting applies. For more details see
https://cloud.google.com/functions/docs/configuring/max-instances
https://cloud.google.com/functions/docs/configuring/max-instances.
cloud_function_vpc_connector (str, Optional):
The VPC connector you would like to configure for your cloud
function. This is useful if your code needs access to data or
                services on a VPC network. For more details, see
https://cloud.google.com/functions/docs/networking/connecting-vpc.
Returns:
callable: A remote function object pointing to the cloud assets created
in the background to support the remote execution. The cloud assets can be
@@ -1613,6 +1624,7 @@ def remote_function(
max_batching_rows=max_batching_rows,
cloud_function_timeout=cloud_function_timeout,
cloud_function_max_instances=cloud_function_max_instances,
cloud_function_vpc_connector=cloud_function_vpc_connector,
)

def read_gbq_function(
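As a quick illustration of the read_pandas inlining note added above, a rough sketch of the two paths; the frame sizes are only meant to land on either side of the roughly 5MB threshold the note mentions.

import pandas as pd
import bigframes.pandas as bpd

# Small frame: well under ~5MB in memory, so it is inlined into the query SQL.
small = bpd.read_pandas(pd.DataFrame({"x": range(10)}))

# Much larger frame: loaded to a temporary BigQuery table behind the scenes
# instead of being inlined.
large = bpd.read_pandas(pd.DataFrame({"x": range(2_000_000)}))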
75 changes: 74 additions & 1 deletion tests/system/large/test_remote_function.py
@@ -20,7 +20,7 @@
import tempfile
import textwrap

from google.api_core.exceptions import BadRequest, NotFound
from google.api_core.exceptions import BadRequest, InvalidArgument, NotFound
from google.cloud import bigquery, storage
import pandas
import pytest
@@ -1331,6 +1331,79 @@ def square_num(x):
)


@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_via_session_vpc(scalars_dfs):
# TODO(shobs): Automate the following set-up during testing in the test project.
#
    # For upfront convenience, the following setup has been statically created
    # in the project bigframes-dev-perf via the cloud console:
#
# 1. Create a vpc connector as per
# https://cloud.google.com/vpc/docs/configure-serverless-vpc-access#gcloud
#
# $ gcloud compute networks vpc-access connectors create bigframes-vpc --project=bigframes-dev-perf --region=us-central1 --range 10.8.0.0/28
# Create request issued for: [bigframes-vpc]
# Waiting for operation [projects/bigframes-dev-perf/locations/us-central1/operations/f9f90df6-7cf4-4420-8c2f-b3952775dcfb] to complete...done.
# Created connector [bigframes-vpc].
#
# $ gcloud compute networks vpc-access connectors list --project=bigframes-dev-perf --region=us-central1
# CONNECTOR_ID REGION NETWORK IP_CIDR_RANGE SUBNET SUBNET_PROJECT MACHINE_TYPE MIN_INSTANCES MAX_INSTANCES MIN_THROUGHPUT MAX_THROUGHPUT STATE
# bigframes-vpc us-central1 default 10.8.0.0/28 e2-micro 2 10 200 1000 READY

project = "bigframes-dev-perf"
gcf_vpc_connector = "bigframes-vpc"

rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project))

try:

def square_num(x):
if x is None:
return x
return x * x

square_num_remote = rf_session.remote_function(
[int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector
)(square_num)

scalars_df, scalars_pandas_df = scalars_dfs

bf_int64_col = scalars_df["int64_col"]
bf_result_col = bf_int64_col.apply(square_num_remote)
bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas()

pd_int64_col = scalars_pandas_df["int64_col"]
pd_result_col = pd_int64_col.apply(square_num)
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)

assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)

# Assert that the GCF is created with the intended vpc connector
gcf = rf_session.cloudfunctionsclient.get_function(
name=square_num_remote.bigframes_cloud_function
)
assert gcf.service_config.vpc_connector == gcf_vpc_connector
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
rf_session.bqclient, rf_session.cloudfunctionsclient, square_num_remote
)


def test_remote_function_via_session_vpc_invalid(session):
with pytest.raises(
InvalidArgument, match="400.*Serverless VPC Access connector is not found"
):

@session.remote_function(
[int], int, reuse=False, cloud_function_vpc_connector="does-not-exist"
)
def square_num(x):
if x is None:
return x
return x * x


@pytest.mark.parametrize(
("max_batching_rows"),
[
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/pandas/io/parquet.py
@@ -19,6 +19,11 @@ def read_parquet(
Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-"bigquery" engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
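To illustrate the read_parquet note above, a hedged sketch of the two paths; the file paths are hypothetical, and the "bigquery" engine value is assumed from the wording of the docstring.

import bigframes.pandas as bpd

# Non-"bigquery" engine: the file is read client-side; small data is inlined
# into the query SQL, larger data is loaded to a BigQuery table instead.
df_small = bpd.read_parquet("data/small.parquet")

# "bigquery" engine: BigQuery loads the file directly, so client-side
# inlining never comes into play.
df_large = bpd.read_parquet("gs://my-bucket/large.parquet", engine="bigquery")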
10 changes: 10 additions & 0 deletions third_party/bigframes_vendored/pandas/io/parsers/readers.py
@@ -62,6 +62,11 @@ def read_csv(
file. Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-bigquery engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
@@ -167,6 +172,11 @@ def read_json(
file. Instead, set a serialized index column as the index and sort by
that in the resulting DataFrame.

.. note::
For non-bigquery engine, data is inlined in the query SQL if it is
small enough (roughly 5MB or less in memory). Larger size data is
loaded to a BigQuery table instead.

**Examples:**

>>> import bigframes.pandas as bpd
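The same behavior is documented above for read_csv and read_json; a minimal sketch under the same assumptions, with hypothetical paths.

import bigframes.pandas as bpd

# Default (non-"bigquery") engine: the CSV is parsed client-side; small data
# is inlined into the query SQL, larger data goes to a BigQuery table.
df_local = bpd.read_csv("data/small.csv")

# "bigquery" engine: the file is loaded by a BigQuery load job instead, so
# nothing is inlined regardless of size.
df_bq = bpd.read_csv("gs://my-bucket/large.csv", engine="bigquery")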
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/pandas/io/pickle.py
@@ -25,6 +25,11 @@ def read_pickle(
If the content of the pickle file is a Series and its name attribute is None,
the name will be set to '0' by default.

.. note::
Data is inlined in the query SQL if it is small enough (roughly 5MB
or less in memory). Larger size data is loaded to a BigQuery table
instead.

**Examples:**

>>> import bigframes.pandas as bpd