googleapis · sycai · Feb 4, 2026 · Dec 18, 2025 · Jan 16, 2026 · Jan 16, 2026
@@ -19,7 +19,7 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union
 
 import pandas as pd
 
@@ -28,6 +28,7 @@
 from bigframes import series, session
 from bigframes.core import convert
 from bigframes.core.logging import log_adapter
+import bigframes.core.sql.literals
 from bigframes.ml import core as ml_core
 from bigframes.operations import ai_ops, output_schemas
 
@@ -388,6 +389,113 @@ def generate_double(
     return series_list[0]._apply_nary_op(operator, series_list[1:])
 
 
+@log_adapter.method_logger(custom_base_name="bigquery_ai")
+def generate_embedding(
+    model_name: str,
+    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
+    *,
+    output_dimensionality: Optional[int] = None,
+    task_type: Optional[str] = None,
+    start_second: Optional[float] = None,
+    end_second: Optional[float] = None,
+    interval_seconds: Optional[float] = None,
+    trial_id: Optional[int] = None,
+) -> dataframe.DataFrame:
+    """
+    Creates embeddings that describe an entity—for example, a piece of text or an image.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
+        >>> bbq.ai.generate_embedding(
+        ...     "project.dataset.model_name",
+        ...     df
+        ... ) # doctest: +SKIP
+
+    Args:
+        model_name (str):
+            The name of a remote model from Vertex AI, such as the
+            multimodalembedding@001 model.
+        data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
+            The data to generate embeddings for. If a Series is provided, it is
+            treated as the 'content' column.  If a DataFrame is provided, it
+            must contain a 'content' column, or you must rename the column you
+            wish to embed to 'content'.
+        output_dimensionality (int, optional):
+            An INT64 value that specifies the number of dimensions to use when
+            generating embeddings. For example, if you specify 256 AS
+            output_dimensionality, then the embedding output column contains a
+            256-dimensional embedding for each input value. To find the
+            supported range of output dimensions, read about the available
+            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
+        task_type (str, optional):
+            A STRING literal that specifies the intended downstream application to
+            help the model produce better quality embeddings. For a list of
+            supported task types and how to choose which one to use, see `Choose an
+            embeddings task type <http://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
+        start_second (float, optional):
+            The second in the video at which to start the embedding. The default value is 0.
+        end_second (float, optional):
+            The second in the video at which to end the embedding. The default value is 120.
+        interval_seconds (float, optional):
+            The interval to use when creating embeddings. The default value is 16.
+        trial_id (int, optional):
+            An INT64 value that identifies the hyperparameter tuning trial that
+            you want the function to evaluate. The function uses the optimal
+            trial by default. Only specify this argument if you ran
+            hyperparameter tuning when creating the model.
+
+    Returns:
+        bigframes.pandas.DataFrame:
+            A new DataFrame with the generated embeddings. See the `SQL
+            reference for AI.GENERATE_EMBEDDING
+            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
+            for details.
+    """
+    if isinstance(data, (pd.DataFrame, pd.Series)):
+        data = bpd.read_pandas(data)
+
+    if isinstance(data, series.Series):
+        data = data.copy()
+        data.name = "content"
+        data_df = data.to_frame()
+    elif isinstance(data, dataframe.DataFrame):
+        data_df = data
+    else:
+        raise ValueError(f"Unsupported data type: {type(data)}")
+
+    # We need to get the SQL for the input data to pass as a subquery to the TVF
+    source_sql = data_df.sql
+
+    struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {}
+    if output_dimensionality is not None:
+        struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality
+    if task_type is not None:
+        struct_fields["TASK_TYPE"] = task_type
+    if start_second is not None:
+        struct_fields["START_SECOND"] = start_second
+    if end_second is not None:
+        struct_fields["END_SECOND"] = end_second
+    if interval_seconds is not None:
+        struct_fields["INTERVAL_SECONDS"] = interval_seconds
+    if trial_id is not None:
+        struct_fields["TRIAL_ID"] = trial_id
+
+    # Construct the TVF query
+    query = f"""
+        SELECT *
+        FROM AI.GENERATE_EMBEDDING(
+            MODEL `{model_name}`,
+            ({source_sql}),
+            {bigframes.core.sql.literals.struct_literal(struct_fields)})
+        )
+    """
+
+    return data_df._session.read_gbq(query)
+
+
 @log_adapter.method_logger(custom_base_name="bigquery_ai")
 def if_(
     prompt: PROMPT_TYPE,

@@ -22,6 +22,7 @@
     generate,
     generate_bool,
     generate_double,
+    generate_embedding,
     generate_int,
     if_,
     score,
@@ -33,6 +34,7 @@
     "generate",
     "generate_bool",
     "generate_double",
+    "generate_embedding",
     "generate_int",
     "if_",
     "score",

@@ -0,0 +1,58 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import collections.abc
+import json
+from typing import Any, List, Mapping, Union
+
+import bigframes.core.sql
+
+STRUCT_VALUES = Union[
+    str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]
+]
+STRUCT_TYPE = Mapping[str, STRUCT_VALUES]
+
+
+def struct_literal(struct_options: STRUCT_TYPE) -> str:
+    rendered_options = []
+    for option_name, option_value in struct_options.items():
+        if option_name == "model_params":
+            json_str = json.dumps(option_value)
+            # Escape single quotes for SQL string literal
+            sql_json_str = json_str.replace("'", "''")
+            rendered_val = f"JSON'{sql_json_str}'"
+        elif isinstance(option_value, collections.abc.Mapping):
+            struct_body = ", ".join(
+                [
+                    f"{bigframes.core.sql.simple_literal(v)} AS {k}"
+                    for k, v in option_value.items()
+                ]
+            )
+            rendered_val = f"STRUCT({struct_body})"
+        elif isinstance(option_value, list):
+            rendered_val = (
+                "["
+                + ", ".join(
+                    [bigframes.core.sql.simple_literal(v) for v in option_value]
+                )
+                + "]"
+            )
+        elif isinstance(option_value, bool):
+            rendered_val = str(option_value).lower()
+        else:
+            rendered_val = bigframes.core.sql.simple_literal(option_value)
+        rendered_options.append(f"{rendered_val} AS {option_name}")
+    return f"STRUCT({', '.join(rendered_options)})"
@@ -14,12 +14,11 @@
 
 from __future__ import annotations
 
-import collections.abc
-import json
 from typing import Any, Dict, List, Mapping, Optional, Union
 
 import bigframes.core.compile.googlesql as googlesql
 import bigframes.core.sql
+import bigframes.core.sql.literals
 
 
 def create_model_ddl(
@@ -109,36 +108,7 @@ def _build_struct_sql(
 ) -> str:
     if not struct_options:
         return ""
-
-    rendered_options = []
-    for option_name, option_value in struct_options.items():
-        if option_name == "model_params":
-            json_str = json.dumps(option_value)
-            # Escape single quotes for SQL string literal
-            sql_json_str = json_str.replace("'", "''")
-            rendered_val = f"JSON'{sql_json_str}'"
-        elif isinstance(option_value, collections.abc.Mapping):
-            struct_body = ", ".join(
-                [
-                    f"{bigframes.core.sql.simple_literal(v)} AS {k}"
-                    for k, v in option_value.items()
-                ]
-            )
-            rendered_val = f"STRUCT({struct_body})"
-        elif isinstance(option_value, list):
-            rendered_val = (
-                "["
-                + ", ".join(
-                    [bigframes.core.sql.simple_literal(v) for v in option_value]
-                )
-                + "]"
-            )
-        elif isinstance(option_value, bool):
-            rendered_val = str(option_value).lower()
-        else:
-            rendered_val = bigframes.core.sql.simple_literal(option_value)
-        rendered_options.append(f"{rendered_val} AS {option_name}")
-    return f", STRUCT({', '.join(rendered_options)})"
+    return f", {bigframes.core.sql.literals.struct_literal(struct_options)}"
 
 
 def evaluate(

@@ -0,0 +1,134 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest import mock
+
+import pandas as pd
+import pytest
+
+import bigframes.bigquery as bbq
+import bigframes.dataframe
+import bigframes.series
+import bigframes.session
+
+
+@pytest.fixture
+def mock_session():
+    return mock.create_autospec(spec=bigframes.session.Session)
+
+
+@pytest.fixture
+def mock_dataframe(mock_session):
+    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
+    df._session = mock_session
+    df.sql = "SELECT * FROM my_table"
+    return df
+
+
+@pytest.fixture
+def mock_series(mock_session):
+    series = mock.create_autospec(spec=bigframes.series.Series)
+    series._session = mock_session
+    # Mock to_frame to return a mock dataframe
+    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
+    df._session = mock_session
+    df.sql = "SELECT my_col AS content FROM my_table"
+    series.copy.return_value = series
+    series.to_frame.return_value = df
+    return series
+
+
+def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name,
+        mock_dataframe,
+        output_dimensionality=256,
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+
+    # Normalize whitespace for comparison
+    query = " ".join(query.split())
+
+    expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING("
+    expected_part_2 = f"MODEL `{model_name}`,"
+    expected_part_3 = "(SELECT * FROM my_table),"
+    expected_part_4 = "STRUCT(256 AS OUTPUT_DIMENSIONALITY)"
+
+    assert expected_part_1 in query
+    assert expected_part_2 in query
+    assert expected_part_3 in query
+    assert expected_part_4 in query
+
+
+def test_generate_embedding_with_series(mock_series, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+    query = " ".join(query.split())
+
+    assert f"MODEL `{model_name}`" in query
+    assert "(SELECT my_col AS content FROM my_table)" in query
+    assert (
+        "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)"
+        in query
+    )
+
+
+def test_generate_embedding_defaults(mock_dataframe, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name,
+        mock_dataframe,
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+    query = " ".join(query.split())
+
+    assert f"MODEL `{model_name}`" in query
+    assert "STRUCT()" in query
+
+
+@mock.patch("bigframes.pandas.read_pandas")
+def test_generate_embedding_with_pandas_dataframe(
+    read_pandas_mock, mock_dataframe, mock_session
+):
+    # This tests that pandas input path works and calls read_pandas
+    model_name = "project.dataset.model"
+
+    # Mock return value of read_pandas to be a BigFrames DataFrame
+    read_pandas_mock.return_value = mock_dataframe
+
+    pandas_df = pd.DataFrame({"content": ["test"]})
+
+    bbq.ai.generate_embedding(
+        model_name,
+        pandas_df,
+    )
+
+    read_pandas_mock.assert_called_once()
+    # Check that read_pandas was called with something (the pandas df)
+    assert read_pandas_mock.call_args[0][0] is pandas_df
+
+    mock_session.read_gbq.assert_called_once()