Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions sdk/python/feast/infra/compute_engines/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,14 @@ def get_column_info(
# we need to read ALL source columns, not just the output feature columns.
# This is specifically for transformations that create new columns or need raw data.
mode = getattr(getattr(view, "feature_transformation", None), "mode", None)
if mode in ("ray", "pandas") or getattr(mode, "value", None) in (
if mode in ("ray", "pandas", "python") or getattr(mode, "value", None) in (
"ray",
"pandas",
"python",
):
# Signal to read all columns by passing empty list for feature_cols
# The transformation will produce the output columns defined in the schema
# Signal to read all columns by passing empty list for feature_cols.
# "python" (BatchFeatureView) transformations need all raw source columns — the
# UDF computes output features from raw input, not from pre-existing feature cols.
feature_cols = []

return ColumnInfo(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import pytest

from feast.data_source import DataSource
from feast.infra.compute_engines.dag.context import ExecutionContext
Expand All @@ -7,6 +9,7 @@
from feast.infra.compute_engines.dag.plan import ExecutionPlan
from feast.infra.compute_engines.dag.value import DAGValue
from feast.infra.compute_engines.feature_builder import FeatureBuilder
from feast.transformation.mode import TransformationMode

# ---------------------------
# Minimal Mock DAGNode for testing
Expand Down Expand Up @@ -143,3 +146,144 @@ def test_recursive_featureview_build():
- Source(hourly_driver_stats)"""

assert execution_plan.to_dag() == expected_output


# ---------------------------------------------------------------------------
# Helpers for get_column_info tests
# ---------------------------------------------------------------------------

# Stable return value for _get_column_names: (join_keys, feature_cols, ts_col, created_ts_col)
_MOCK_COLUMN_NAMES = (
["user_id"],
["user_avg_rating", "user_review_count"],
"event_timestamp",
None,
)


def _make_transformation(mode):
"""Return a minimal transformation stub with the given mode."""
t = MagicMock()
t.mode = mode
return t


def _make_builder_for_column_info(transformation):
    """
    Assemble a MockFeatureBuilder wired for get_column_info tests.

    The builder is allocated with ``__new__`` so ``__init__`` is not run; its
    ``registry``, ``task`` and ``nodes`` attributes are assigned by hand.
    ``task.feature_view`` is a mock view carrying ``transformation`` as its
    ``feature_transformation``. ``registry.get_entity`` returns the same stub
    entity (join_key "user_id") for every call.

    Returns a ``(builder, view)`` pair.
    """
    # Feature view stub: one entity, a batch source with no field mapping,
    # and no stream source.
    batch_source = MagicMock()
    batch_source.field_mapping = {}

    feature_view = MagicMock()
    feature_view.entities = ["user"]
    feature_view.feature_transformation = transformation
    feature_view.batch_source = batch_source
    feature_view.stream_source = None

    # Task stub pointing at the view above.
    materialization_task = MagicMock()
    materialization_task.project = "test_project"
    materialization_task.feature_view = feature_view
    materialization_task.only_latest = False

    # Registry stub: any get_entity call yields an entity keyed on "user_id".
    stub_registry = MagicMock()
    stub_registry.get_entity.return_value = MagicMock(join_key="user_id")

    # Bypass __init__ and populate the attributes directly.
    feature_builder = MockFeatureBuilder.__new__(MockFeatureBuilder)
    feature_builder.registry = stub_registry
    feature_builder.task = materialization_task
    feature_builder.nodes = []
    return feature_builder, feature_view


# ---------------------------------------------------------------------------
# Bug fix: TransformationMode.PYTHON must set feature_cols=[]
#
# Previously only "ray" and "pandas" were handled. "python" (the default mode
# for @batch_feature_view) was missing, causing get_column_info to forward
# the BFV *output* feature names (e.g. user_avg_rating) to the offline store
# read step — columns that don't exist in raw source data — resulting in
# UNRESOLVED_COLUMN errors at Spark analysis time.
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "mode",
    [
        # Enum members
        TransformationMode.PYTHON,
        TransformationMode.PANDAS,
        TransformationMode.RAY,
        # Plain-string forms of the same modes
        "python",
        "pandas",
        "ray",
    ],
)
def test_get_column_info_clears_feature_cols_for_udf_modes(mode):
    """
    python / pandas / ray transformations derive their output features from
    raw input, so get_column_info is expected to report feature_cols=[]
    (signalling "read all source columns") rather than forwarding output
    feature names that are absent from the raw source schema.

    Join keys and the timestamp column must still come through unchanged.
    """
    patch_target = "feast.infra.compute_engines.feature_builder._get_column_names"
    feature_builder, feature_view = _make_builder_for_column_info(
        _make_transformation(mode)
    )

    # Pin the column-name lookup so the test only exercises the mode handling.
    with patch(patch_target, return_value=_MOCK_COLUMN_NAMES):
        column_info = feature_builder.get_column_info(feature_view)

    assert column_info.feature_cols == [], (
        f"Expected feature_cols=[] for TransformationMode {mode!r}, "
        f"got {column_info.feature_cols!r}. "
        "The offline store read step must not project output feature names "
        "that don't exist in the raw source schema."
    )
    assert column_info.join_keys == ["user_id"]
    assert column_info.ts_col == "event_timestamp"


@pytest.mark.parametrize(
    "mode",
    [
        # Enum members
        TransformationMode.SPARK_SQL,
        TransformationMode.SQL,
        TransformationMode.SPARK,
        # Plain-string forms
        "spark_sql",
        "sql",
    ],
)
def test_get_column_info_preserves_feature_cols_for_non_udf_modes(mode):
    """
    For SQL / Spark style modes, get_column_info must leave feature_cols
    untouched: the names returned by the column-name lookup are expected to
    be passed through so the source read still selects them explicitly.
    """
    patch_target = "feast.infra.compute_engines.feature_builder._get_column_names"
    feature_builder, feature_view = _make_builder_for_column_info(
        _make_transformation(mode)
    )

    # Pin the column-name lookup so the test only exercises the mode handling.
    with patch(patch_target, return_value=_MOCK_COLUMN_NAMES):
        column_info = feature_builder.get_column_info(feature_view)

    assert column_info.feature_cols == ["user_avg_rating", "user_review_count"], (
        f"Expected feature_cols to be preserved for mode {mode!r}, "
        f"got {column_info.feature_cols!r}."
    )


def test_get_column_info_preserves_feature_cols_with_no_transformation():
    """
    When the view's feature_transformation is None (a plain FeatureView),
    get_column_info must pass the looked-up feature column names through
    unchanged.
    """
    patch_target = "feast.infra.compute_engines.feature_builder._get_column_names"
    feature_builder, feature_view = _make_builder_for_column_info(None)

    # Pin the column-name lookup so only the no-transformation path is tested.
    with patch(patch_target, return_value=_MOCK_COLUMN_NAMES):
        column_info = feature_builder.get_column_info(feature_view)

    assert column_info.feature_cols == ["user_avg_rating", "user_review_count"]
Loading