Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion bigframes/bigquery/_operations/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from __future__ import annotations

import json
from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
from typing import Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

import pandas as pd

Expand Down Expand Up @@ -387,6 +387,91 @@ def generate_double(
return series_list[0]._apply_nary_op(operator, series_list[1:])


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    The input data is turned into a SQL subquery and passed, together with the
    model reference and any options that were set, to the
    ``AI.GENERATE_EMBEDDING`` table-valued function. The resulting query is
    read back through the session that owns the input data.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... )  # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model over a Vertex AI multimodalembedding@001 model.
            It is placed inside a backtick-quoted identifier, so it must not
            itself contain a backtick.
        data (DataFrame or Series):
            The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column.
            If a DataFrame is provided, it must contain a 'content' column, or you must rename the column you wish to embed to 'content'.
            pandas objects are first loaded with ``bpd.read_pandas``.
        output_dimensionality (int, optional):
            The number of dimensions to use when generating embeddings. Valid values are 128, 256, 512, and 1408. The default value is 1408.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.

    Returns:
        bigframes.dataframe.DataFrame:
            A new DataFrame with the generated embeddings. It contains the input table columns and the following columns:
            * "embedding": an ARRAY<FLOAT64> value that contains the generated embedding vector.
            * "status": a STRING value that contains the API response status for the corresponding row.
            * "video_start_sec": for video content, an INT64 value that contains the starting second.
            * "video_end_sec": for video content, an INT64 value that contains the ending second.

    Raises:
        ValueError:
            If ``data`` is not a supported type, if ``model_name`` contains a
            backtick, or if an option is not a finite number of the expected
            kind (an integer for ``output_dimensionality``, a real number for
            the others; booleans are rejected).
    """
    # Function-scope imports: only this function needs them, and they keep the
    # block independent of the file's top-level import list.
    import math
    import numbers

    # Everything below is spliced into SQL text, so reject anything that could
    # change the structure of the statement before doing any work.
    if "`" in str(model_name):
        raise ValueError(
            f"Invalid model name {model_name!r}: it must not contain a backtick."
        )

    option_specs = (
        ("output_dimensionality", output_dimensionality, numbers.Integral),
        ("start_second", start_second, numbers.Real),
        ("end_second", end_second, numbers.Real),
        ("interval_seconds", interval_seconds, numbers.Real),
    )
    struct_fields = []
    for option_name, option_value, expected_kind in option_specs:
        if option_value is None:
            # Unset options are left out so the service applies its defaults.
            continue
        # bool is a subclass of int, but "True AS ..." is never what was meant.
        if isinstance(option_value, bool) or not isinstance(
            option_value, expected_kind
        ):
            raise ValueError(
                f"Invalid value for {option_name}: {option_value!r}"
            )
        # nan/inf would be rendered as bare words rather than numeric literals.
        if not math.isfinite(option_value):
            raise ValueError(
                f"Invalid value for {option_name}: {option_value!r}"
            )
        struct_fields.append(f"{option_value} AS {option_name}")

    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Rename series to 'content' and convert to DataFrame
        data_df = data.rename("content").to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    struct_args = ", ".join(struct_fields)

    # Construct the TVF query
    query = f"""
    SELECT *
    FROM AI.GENERATE_EMBEDDING(
        MODEL `{model_name}`,
        ({source_sql}),
        STRUCT({struct_args})
    )
    """

    return data_df._session.read_gbq(query)


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def if_(
prompt: PROMPT_TYPE,
Expand Down
135 changes: 135 additions & 0 deletions tests/unit/bigquery/test_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest import mock

import pandas as pd
import pytest

import bigframes.bigquery._operations.ai as ai_ops
import bigframes.dataframe
import bigframes.series
import bigframes.session


@pytest.fixture
def mock_session():
    """Provide an autospec'd stand-in for ``bigframes.session.Session``."""
    session_double = mock.create_autospec(spec=bigframes.session.Session)
    return session_double


@pytest.fixture
def mock_dataframe(mock_session):
    """Provide an autospec'd DataFrame bound to ``mock_session`` with fixed SQL."""
    frame_double = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    # The function under test reads these two attributes only.
    frame_double.sql = "SELECT * FROM my_table"
    frame_double._session = mock_session
    return frame_double


@pytest.fixture
def mock_series(mock_session):
    """Provide an autospec'd Series whose ``rename(...).to_frame()`` yields a mock DataFrame."""
    # DataFrame double that the rename/to_frame chain hands back.
    renamed_frame = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    renamed_frame._session = mock_session
    renamed_frame.sql = "SELECT my_col AS content FROM my_table"

    series_double = mock.create_autospec(spec=bigframes.series.Series)
    series_double._session = mock_session
    series_double.rename.return_value.to_frame.return_value = renamed_frame
    return series_double


def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
    """A DataFrame input yields a TVF query carrying the model, subquery and option."""
    model_name = "project.dataset.model"

    ai_ops.generate_embedding(
        model_name,
        mock_dataframe,
        output_dimensionality=256,
    )

    mock_session.read_gbq.assert_called_once()
    submitted_sql = mock_session.read_gbq.call_args.args[0]

    # Collapse all whitespace runs so layout differences do not matter.
    normalized_sql = " ".join(submitted_sql.split())

    expected_fragments = (
        "SELECT * FROM AI.GENERATE_EMBEDDING(",
        f"MODEL `{model_name}`,",
        "(SELECT * FROM my_table),",
        "STRUCT(256 AS output_dimensionality)",
    )
    for fragment in expected_fragments:
        assert fragment in normalized_sql


def test_generate_embedding_with_series(mock_series, mock_session):
    """A Series input is renamed to 'content', framed, and queried with the video options."""
    model_name = "project.dataset.model"
    video_options = {
        "start_second": 0.0,
        "end_second": 10.0,
        "interval_seconds": 5.0,
    }

    ai_ops.generate_embedding(model_name, mock_series, **video_options)

    # The series must go through rename("content").to_frame().
    rename_mock = mock_series.rename
    rename_mock.assert_called_with("content")
    rename_mock.return_value.to_frame.assert_called_once()

    mock_session.read_gbq.assert_called_once()
    submitted_sql = mock_session.read_gbq.call_args.args[0]
    normalized_sql = " ".join(submitted_sql.split())

    expected_fragments = (
        f"MODEL `{model_name}`",
        "(SELECT my_col AS content FROM my_table)",
        "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)",
    )
    for fragment in expected_fragments:
        assert fragment in normalized_sql


def test_generate_embedding_defaults(mock_dataframe, mock_session):
    """With no options set, the query carries an empty STRUCT()."""
    model_name = "project.dataset.model"

    ai_ops.generate_embedding(model_name, mock_dataframe)

    read_gbq_mock = mock_session.read_gbq
    read_gbq_mock.assert_called_once()
    normalized_sql = " ".join(read_gbq_mock.call_args.args[0].split())

    for fragment in (f"MODEL `{model_name}`", "STRUCT()"):
        assert fragment in normalized_sql


@mock.patch("bigframes.pandas.read_pandas")
def test_generate_embedding_with_pandas_dataframe(read_pandas_mock, mock_dataframe, mock_session):
    """A pandas DataFrame is routed through read_pandas before the query is issued."""
    model_name = "project.dataset.model"
    local_frame = pd.DataFrame({"content": ["test"]})

    # read_pandas hands back a BigFrames DataFrame double.
    read_pandas_mock.return_value = mock_dataframe

    ai_ops.generate_embedding(model_name, local_frame)

    # The exact pandas object must be what was uploaded.
    read_pandas_mock.assert_called_once()
    uploaded = read_pandas_mock.call_args.args[0]
    assert uploaded is local_frame

    mock_session.read_gbq.assert_called_once()
Loading