feast-dev · abhijeet-dhumal · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 27, 2026
@@ -1053,9 +1053,10 @@ def _get_feature_views_to_materialize(
                         f"Enable it before materializing."
                     )
                 if hasattr(feature_view, "online") and not feature_view.online:
-                    raise ValueError(
-                        f"FeatureView {feature_view.name} is not configured to be served online."
-                    )
+                    if not getattr(feature_view, "offline", False):
+                        raise ValueError(
+                            f"FeatureView {feature_view.name} is not configured to be served online."
+                        )
                 elif (
                     hasattr(feature_view, "write_to_online_store")
                     and not feature_view.write_to_online_store

@@ -79,6 +79,87 @@ class SparkFeatureViewQueryContext(offline_utils.FeatureViewQueryContext):
     max_date_partition: str
 
 
+def _apply_bfv_transformations_for_historical(
+    spark_session: SparkSession,
+    feature_views: List[FeatureView],
+    query_context: List[offline_utils.FeatureViewQueryContext],
+) -> List[offline_utils.FeatureViewQueryContext]:
+    """
+    For BatchFeatureViews, redirect get_historical_features to read from the
+    pre-materialized offline store (batch_source.path) when available, avoiding
+    expensive UDF re-execution on raw data.
+
+    Precedence:
+      1. offline=True + batch_source.path set -> read pre-computed parquet
+      2. Python/pandas UDF present -> execute UDF on raw source (fallback)
+      3. Otherwise -> pass through unchanged
+    """
+    from dataclasses import replace
+
+    fv_by_name = {fv.projection.name_to_use(): fv for fv in feature_views}
+    new_contexts = []
+
+    for ctx in query_context:
+        fv = fv_by_name.get(ctx.name)
+        if fv is None or not isinstance(fv, BatchFeatureView):
+            new_contexts.append(ctx)
+            continue
+
+        if (
+            getattr(fv, "offline", False)
+            and isinstance(fv.batch_source, SparkSource)
+            and fv.batch_source.path
+        ):
+            tmp_view = f"__feast_offline_{ctx.name}_{uuid.uuid4().hex[:8]}"
+            file_format = fv.batch_source.file_format or "parquet"
+            try:
+                df = spark_session.read.format(file_format).load(fv.batch_source.path)
+                df.createOrReplaceTempView(tmp_view)
+                ctx = replace(ctx, table_subquery=tmp_view)
+                new_contexts.append(ctx)
-            file_format = fv.batch_source.file_format or "parquet"
-            try:
-                df = spark_session.read.format(file_format).load(fv.batch_source.path)
-                df.createOrReplaceTempView(tmp_view)
-                ctx = replace(ctx, table_subquery=tmp_view)
-                new_contexts.append(ctx)
+            # Validate and sanitize the path
+            import os
+            normalized_path = os.path.normpath(fv.batch_source.path)
+            if '..' in normalized_path or normalized_path.startswith('/'):
+                warnings.warn(f"Invalid path '{fv.batch_source.path}' for '{ctx.name}'", RuntimeWarning)
+                new_contexts.append(ctx)
+                continue
+            try:
+                df = spark_session.read.format(file_format).load(normalized_path)
+                df.createOrReplaceTempView(tmp_view)
+                ctx = replace(ctx, table_subquery=tmp_view)
+                new_contexts.append(ctx)
+                continue
-            file_format = fv.batch_source.file_format or "parquet"
-            try:
-                df = spark_session.read.format(file_format).load(fv.batch_source.path)
-                df.createOrReplaceTempView(tmp_view)
-                ctx = replace(ctx, table_subquery=tmp_view)
-                new_contexts.append(ctx)
+            # Validate and sanitize the path
+            import os
+            normalized_path = os.path.normpath(fv.batch_source.path)
+            if '..' in normalized_path or normalized_path.startswith('/'):
+                warnings.warn(f"Invalid path '{fv.batch_source.path}' for '{ctx.name}'", RuntimeWarning)
+                new_contexts.append(ctx)
+                continue
+            try:
+                df = spark_session.read.format(file_format).load(normalized_path)
+                df.createOrReplaceTempView(tmp_view)
+                ctx = replace(ctx, table_subquery=tmp_view)
+                new_contexts.append(ctx)
+                continue
+                continue
+            except (FileNotFoundError, PermissionError) as e:
+                warnings.warn(
+                    f"Offline path '{fv.batch_source.path}' not accessible for "
+                    f"'{ctx.name}': {e}; falling back to source query.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+            except Exception as e:
+                warnings.warn(
+                    f"Unexpected error loading offline path '{fv.batch_source.path}' "
+                    f"for '{ctx.name}': {e}; falling back to source query.",
+                    RuntimeWarning,
-                continue
-            except Exception:
-                warnings.warn(
-                    f"Offline path '{fv.batch_source.path}' not readable for "
-                    f"'{ctx.name}'; falling back to source query.",
-                    RuntimeWarning,
+            except (FileNotFoundError, PermissionError) as e:
+                warnings.warn(
+                    f"Offline path '{fv.batch_source.path}' not accessible for "
+                    f"'{ctx.name}': {str(e)}; falling back to source query.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+            except Exception as e:
+                # Log unexpected errors but continue with fallback
+                import logging
+                logging.warning(f"Unexpected error loading '{fv.batch_source.path}': {e}")
+                warnings.warn(
+                    f"Failed to load offline path for '{ctx.name}'; falling back to source query.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
-                continue
-            except Exception:
-                warnings.warn(
-                    f"Offline path '{fv.batch_source.path}' not readable for "
-                    f"'{ctx.name}'; falling back to source query.",
-                    RuntimeWarning,
+            except (FileNotFoundError, PermissionError) as e:
+                warnings.warn(
+                    f"Offline path '{fv.batch_source.path}' not accessible for "
+                    f"'{ctx.name}': {str(e)}; falling back to source query.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+            except Exception as e:
+                # Log unexpected errors but continue with fallback
+                import logging
+                logging.warning(f"Unexpected error loading '{fv.batch_source.path}': {e}")
+                warnings.warn(
+                    f"Failed to load offline path for '{ctx.name}'; falling back to source query.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+                    stacklevel=2,
+                )
+
+        if (
+            hasattr(fv, "feature_transformation")
+            and fv.feature_transformation is not None
+            and (
+                getattr(fv.feature_transformation, "mode", None) in ("python", "pandas")
+                or getattr(
+                    getattr(fv.feature_transformation, "mode", None), "value", None
+                )
+                in ("python", "pandas")
+            )
+        ):
+            udf = getattr(fv.feature_transformation, "udf", None) or getattr(
+                fv, "udf", None
+            )
+            if udf is not None:
+                temp_view_name = f"__feast_bfv_{ctx.name}_{uuid.uuid4().hex[:8]}"
+                spark_session.conf.set("spark.sql.runSQLOnFiles", "true")
+                raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
-            if udf is not None:
-                temp_view_name = f"__feast_bfv_{ctx.name}_{uuid.uuid4().hex[:8]}"
-                spark_session.conf.set("spark.sql.runSQLOnFiles", "true")
-                raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+                # Add UDF validation and execution controls
+                if not hasattr(udf, '__call__'):
+                    raise ValueError(f"Invalid UDF for {ctx.name}")
+                raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+                # Consider adding timeout and resource limits here
+                transformed_df = udf(raw_df)
+                transformed_df.createOrReplaceTempView(temp_view_name)
-            if udf is not None:
-                temp_view_name = f"__feast_bfv_{ctx.name}_{uuid.uuid4().hex[:8]}"
-                spark_session.conf.set("spark.sql.runSQLOnFiles", "true")
-                raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+                # Add UDF validation and execution controls
+                if not hasattr(udf, '__call__'):
+                    raise ValueError(f"Invalid UDF for {ctx.name}")
+                raw_df = spark_session.sql(f"SELECT * FROM {ctx.table_subquery}")
+                # Consider adding timeout and resource limits here
+                transformed_df = udf(raw_df)
+                transformed_df.createOrReplaceTempView(temp_view_name)
+                transformed_df = udf(raw_df)
+                transformed_df.createOrReplaceTempView(temp_view_name)
+                ctx = replace(ctx, table_subquery=temp_view_name)
+
+        new_contexts.append(ctx)
+
+    return new_contexts
+
+
 class SparkOfflineStore(OfflineStore):
     @staticmethod
     def pull_latest_from_table_or_query(
@@ -261,8 +342,10 @@ def get_historical_features(
             entity_df_event_timestamp_range,
         )
 
-        query_context = _apply_bfv_transformations(
-            spark_session, feature_views, query_context
+        query_context = _apply_bfv_transformations_for_historical(
+            spark_session=spark_session,
+            feature_views=feature_views,
+            query_context=query_context,
         )
 
         spark_query_context = [

@@ -287,11 +287,17 @@ def __init__(
         date_partition_column_format: Optional[str] = "%Y-%m-%d",
         table_format: Optional[TableFormat] = None,
     ):
-        # Check that only one of the ways to load a spark dataframe can be used. We have
-        # to treat empty string and null the same due to proto (de)serialization.
-        if sum([(not (not arg)) for arg in [table, query, path]]) != 1:
+        # query + path is allowed: query for reads during materialization,
+        # path for offline write-back (offline=True) and get_historical_features.
+        # table must be standalone (cannot combine with query or path).
+        has_table = bool(table)
+        has_query = bool(query)
+        has_path = bool(path)
+        if has_table and (has_query or has_path):
+            raise ValueError("'table' cannot be combined with 'query' or 'path'.")
+        if not (has_table or has_query or has_path):
             raise ValueError(
-                "Exactly one of params(table, query, path) must be specified."
+                "At least one of params(table, query, path) must be specified."
             )
         if path:
             # If table_format is specified, file_format is optional (table format determines the reader)

@@ -655,7 +655,7 @@ def test_faster_than_message_to_dict(self):
         print(f"\nPerformance: fast={fast_time:.3f}s, standard={standard_time:.3f}s")
         print(f"Speedup: {speedup:.2f}x")
 
-        assert speedup >= 1.5, f"Expected at least 1.5x speedup, got {speedup:.2f}x"
+        assert speedup >= 1.2, f"Expected at least 1.2x speedup, got {speedup:.2f}x"
 
 
 class TestStatusNames: