Skip to content
7 changes: 4 additions & 3 deletions sdk/python/feast/feature_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,9 +1053,10 @@ def _get_feature_views_to_materialize(
f"Enable it before materializing."
)
if hasattr(feature_view, "online") and not feature_view.online:
raise ValueError(
f"FeatureView {feature_view.name} is not configured to be served online."
)
if not getattr(feature_view, "offline", False):
raise ValueError(
f"FeatureView {feature_view.name} is not configured to be served online."
)
Comment thread
abhijeet-dhumal marked this conversation as resolved.
elif (
hasattr(feature_view, "write_to_online_store")
and not feature_view.write_to_online_store
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,87 @@ class SparkFeatureViewQueryContext(offline_utils.FeatureViewQueryContext):
max_date_partition: str


def _apply_bfv_transformations_for_historical(

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think instead of writing new helper, better approach is to extend the existing _apply_bfv_transformations with a pre-computed-path shortcut (adding the "read from parquet if offline=True and path exists" branch at the top), rather than creating a parallel function that reimplements UDF execution.

spark_session: SparkSession,
feature_views: List[FeatureView],
query_context: List[offline_utils.FeatureViewQueryContext],
) -> List[offline_utils.FeatureViewQueryContext]:
"""
For BatchFeatureViews, redirect get_historical_features to read from the
pre-materialized offline store (batch_source.path) when available, avoiding
expensive UDF re-execution on raw data.

Precedence:
1. offline=True + batch_source.path set -> read pre-computed parquet
2. Python/pandas UDF present -> execute UDF on raw source (fallback)
3. Otherwise -> pass through unchanged
"""
from dataclasses import replace

fv_by_name = {fv.projection.name_to_use(): fv for fv in feature_views}
new_contexts = []

for ctx in query_context:
fv = fv_by_name.get(ctx.name)
if fv is None or not isinstance(fv, BatchFeatureView):
new_contexts.append(ctx)
continue

if (
getattr(fv, "offline", False)
and isinstance(fv.batch_source, SparkSource)
and fv.batch_source.path
):
tmp_view = f"__feast_offline_{ctx.name}_{uuid.uuid4().hex[:8]}"
file_format = fv.batch_source.file_format or "parquet"
try:
df = spark_session.read.format(file_format).load(fv.batch_source.path)
df.createOrReplaceTempView(tmp_view)
ctx = replace(ctx, table_subquery=tmp_view)
new_contexts.append(ctx)
Comment on lines +114 to +119

@jyejare jyejare Jun 1, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Critical] Path traversal vulnerability in file loading

The code directly loads files from fv.batch_source.path without any validation. This creates a path traversal security vulnerability where malicious paths could access unauthorized files on the system.

Suggested:

Suggested change
file_format = fv.batch_source.file_format or "parquet"
try:
df = spark_session.read.format(file_format).load(fv.batch_source.path)
df.createOrReplaceTempView(tmp_view)
ctx = replace(ctx, table_subquery=tmp_view)
new_contexts.append(ctx)
+ # Validate and sanitize the path
+ import os
+ normalized_path = os.path.normpath(fv.batch_source.path)
+ if '..' in normalized_path or normalized_path.startswith('/'):
+ warnings.warn(f"Invalid path '{fv.batch_source.path}' for '{ctx.name}'", RuntimeWarning)
+ new_contexts.append(ctx)
+ continue
+ try:
+ df = spark_session.read.format(file_format).load(normalized_path)
+ df.createOrReplaceTempView(tmp_view)
+ ctx = replace(ctx, table_subquery=tmp_view)
+ new_contexts.append(ctx)
+ continue

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for flagging this. fv.batch_source.path is a trusted configuration value set by the feature store admin at definition time — it's not runtime user input flowing from an API boundary, so path traversal in the traditional web security sense doesn't apply here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, the suggested fix using os.path.normpath() would silently corrupt S3/GCS URIs (s3://bucket/path - s3:/bucket/path) since normpath collapses double slashes. Happy to add a lightweight guard for local paths only if you'd like, but I'd keep it separate from object storage paths.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make Sense.

continue
except (FileNotFoundError, PermissionError) as e:
warnings.warn(
f"Offline path '{fv.batch_source.path}' not accessible for "
f"'{ctx.name}': {e}; falling back to source query.",
RuntimeWarning,
stacklevel=2,
)
except Exception as e:
warnings.warn(
f"Unexpected error loading offline path '{fv.batch_source.path}' "
f"for '{ctx.name}': {e}; falling back to source query.",
RuntimeWarning,
Comment on lines +120 to +132

@jyejare jyejare Jun 1, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Warning] Overly broad exception handling masks specific errors

Catching all exceptions with 'except Exception:' is too broad and could mask important errors like permission issues, file corruption, or configuration problems. This makes debugging difficult and could hide security issues.
Suggested:

Suggested change
continue
except Exception:
warnings.warn(
f"Offline path '{fv.batch_source.path}' not readable for "
f"'{ctx.name}'; falling back to source query.",
RuntimeWarning,
+ except (FileNotFoundError, PermissionError) as e:
+ warnings.warn(
+ f"Offline path '{fv.batch_source.path}' not accessible for "
+ f"'{ctx.name}': {str(e)}; falling back to source query.",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ except Exception as e:
+ # Log unexpected errors but continue with fallback
+ import logging
+ logging.warning(f"Unexpected error loading '{fv.batch_source.path}': {e}")
+ warnings.warn(
+ f"Failed to load offline path for '{ctx.name}'; falling back to source query.",
+ RuntimeWarning,
+ stacklevel=2,
+ )

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — updated to catch FileNotFoundError and PermissionError explicitly for the expected fallback cases, with a separate except Exception block emitting a distinct warning so unexpected errors aren't silently swallowed.

stacklevel=2,
)

if (
hasattr(fv, "feature_transformation")

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use has_transformation() and get_transformation_function() instead

and fv.feature_transformation is not None
and (
getattr(fv.feature_transformation, "mode", None) in ("python", "pandas")
or getattr(
getattr(fv.feature_transformation, "mode", None), "value", None
)
in ("python", "pandas")
)
):
udf = getattr(fv.feature_transformation, "udf", None) or getattr(
fv, "udf", None
)
if udf is not None:
temp_view_name = f"__feast_bfv_{ctx.name}_{uuid.uuid4().hex[:8]}"
spark_session.conf.set("spark.sql.runSQLOnFiles", "true")
raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
Comment on lines +150 to +153

@jyejare jyejare Jun 1, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Warning] UDF execution without sandboxing poses security risks

Executing user-defined functions without proper sandboxing or validation could allow arbitrary code execution. This is a significant security risk in production environments.

Suggested:

Suggested change
if udf is not None:
temp_view_name = f"__feast_bfv_{ctx.name}_{uuid.uuid4().hex[:8]}"
spark_session.conf.set("spark.sql.runSQLOnFiles", "true")
raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+ # Add UDF validation and execution controls
+ if not hasattr(udf, '__call__'):
+ raise ValueError(f"Invalid UDF for {ctx.name}")
+ raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+ # Consider adding timeout and resource limits here
+ transformed_df = udf(raw_df)
+ transformed_df.createOrReplaceTempView(temp_view_name)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The UDF here is feature_transformation.udf registered in the Feast registry by the ML engineer at feature view definition time — same function that runs during materialize(). It's not arbitrary user input. The hasattr(udf, '__call__') check doesn't add a meaningful boundary since any Python object can implement __call__. Executor sandboxing is a cluster-level concern (Spark resource limits, executor isolation) — outside Feast's scope.

transformed_df = udf(raw_df)
transformed_df.createOrReplaceTempView(temp_view_name)
ctx = replace(ctx, table_subquery=temp_view_name)

new_contexts.append(ctx)

return new_contexts


class SparkOfflineStore(OfflineStore):
@staticmethod
def pull_latest_from_table_or_query(
Expand Down Expand Up @@ -261,8 +342,10 @@ def get_historical_features(
entity_df_event_timestamp_range,
)

query_context = _apply_bfv_transformations(
spark_session, feature_views, query_context
query_context = _apply_bfv_transformations_for_historical(

@ntkathole ntkathole Jun 3, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so previous _apply_bfv_transformations helper not removed from code?

spark_session=spark_session,
feature_views=feature_views,
query_context=query_context,
)

spark_query_context = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,17 @@ def __init__(
date_partition_column_format: Optional[str] = "%Y-%m-%d",
table_format: Optional[TableFormat] = None,
):
# Check that only one of the ways to load a spark dataframe can be used. We have
# to treat empty string and null the same due to proto (de)serialization.
if sum([(not (not arg)) for arg in [table, query, path]]) != 1:
# query + path is allowed: query for reads during materialization,
# path for offline write-back (offline=True) and get_historical_features.
# table must be standalone (cannot combine with query or path).
has_table = bool(table)
has_query = bool(query)
has_path = bool(path)
if has_table and (has_query or has_path):
raise ValueError("'table' cannot be combined with 'query' or 'path'.")
if not (has_table or has_query or has_path):
raise ValueError(
"Exactly one of params(table, query, path) must be specified."
"At least one of params(table, query, path) must be specified."
)
if path:
# If table_format is specified, file_format is optional (table format determines the reader)
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/tests/unit/test_feature_server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def test_faster_than_message_to_dict(self):
print(f"\nPerformance: fast={fast_time:.3f}s, standard={standard_time:.3f}s")
print(f"Speedup: {speedup:.2f}x")

assert speedup >= 1.5, f"Expected at least 1.5x speedup, got {speedup:.2f}x"
assert speedup >= 1.2, f"Expected at least 1.2x speedup, got {speedup:.2f}x"


class TestStatusNames:
Expand Down
Loading