Commit 3af968b

fix write node
Signed-off-by: HaoXuAI <sduxuhao@gmail.com>
1 parent 155d1ed commit 3af968b

4 files changed: +94 −43 lines changed

sdk/python/feast/infra/compute_engines/spark/nodes.py

Lines changed: 24 additions & 3 deletions

@@ -17,6 +17,9 @@
     SparkRetrievalJob,
     _get_entity_schema,
 )
+from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
+    SparkSource,
+)
 from feast.infra.offline_stores.offline_utils import (
     infer_event_timestamp_from_entity_df,
 )
@@ -282,12 +285,30 @@ def execute(self, context: ExecutionContext) -> DAGValue:
             feature_view=self.feature_view, repo_config=context.repo_config
         )

-        # ✅ 1. Write to online or offline store (if enabled)
-        if self.feature_view.online or self.feature_view.offline:
+        # ✅ 1. Write to online store if online enabled
+        if self.feature_view.online:
             spark_df.mapInArrow(
-                lambda x: map_in_arrow(x, serialized_artifacts), spark_df.schema
+                lambda x: map_in_arrow(x, serialized_artifacts, mode="online"),
+                spark_df.schema,
             ).count()

+        # ✅ 2. Write to offline store if offline enabled
+        if self.feature_view.offline:
+            if not isinstance(self.feature_view.batch_source, SparkSource):
+                spark_df.mapInArrow(
+                    lambda x: map_in_arrow(x, serialized_artifacts, mode="offline"),
+                    spark_df.schema,
+                ).count()
+            # Directly write spark df to spark offline store without using mapInArrow
+            else:
+                dest_path = self.feature_view.batch_source.path
+                file_format = self.feature_view.batch_source.file_format
+                if not dest_path or not file_format:
+                    raise ValueError(
+                        "Destination path and file format must be specified for SparkSource."
+                    )
+                spark_df.write.format(file_format).mode("append").save(dest_path)
+
         return DAGValue(
             data=spark_df,
             format=DAGFormat.SPARK,
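
Note on the direct-write branch above: it only triggers when the feature view's batch source is a SparkSource that has both a path and a file_format set. A hedged sketch of such a source declaration follows (names and paths are invented for illustration; the exact SparkSource constructor arguments should be checked against the Feast version in use):

from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import (
    SparkSource,
)

# Hypothetical batch source: with both `path` and `file_format` present, the write
# node above appends the Spark DataFrame directly via
# spark_df.write.format(file_format).mode("append").save(dest_path).
driver_stats_source = SparkSource(
    name="driver_hourly_stats",
    path="/tmp/feast/driver_hourly_stats",  # used as dest_path by the write node
    file_format="parquet",                  # used as file_format by the write node
    timestamp_field="event_timestamp",
)

Any other batch source type still goes through map_in_arrow with mode="offline", which routes to the offline store's offline_write_batch.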

sdk/python/feast/infra/compute_engines/spark/utils.py

Lines changed: 4 additions & 5 deletions

@@ -1,4 +1,4 @@
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, Literal, Optional

 import pyarrow as pa
 from pyspark import SparkConf
@@ -27,6 +27,7 @@ def get_or_create_new_spark_session(
 def map_in_arrow(
     iterator: Iterable[pa.RecordBatch],
     serialized_artifacts: "SerializedArtifacts",
+    mode: Literal["online", "offline"] = "online",
 ):
     for batch in iterator:
         table = pa.Table.from_batches([batch])
@@ -37,9 +38,8 @@ def map_in_arrow(
             offline_store,
             repo_config,
         ) = serialized_artifacts.unserialize()
-        print("write_feature_view", feature_view)

-        if feature_view.online:
+        if mode == "online":
             join_key_to_value_type = {
                 entity.name: entity.dtype.to_value_type()
                 for entity in feature_view.entity_columns
@@ -55,8 +55,7 @@ def map_in_arrow(
                 data=rows_to_write,
                 progress=lambda x: None,
             )
-        if feature_view.offline:
-            print("offline_to_write", table)
+        if mode == "offline":
            offline_store.offline_write_batch(
                config=repo_config,
                feature_view=feature_view,
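
For readers unfamiliar with the mechanism: mapInArrow hands the mapped function one iterator of pyarrow.RecordBatch objects per partition, and the compute engine relies on .count() purely to force that side-effecting pass. A minimal, self-contained sketch of the contract (not Feast code; the DataFrame and column names are invented for illustration):

from typing import Iterator

import pyarrow as pa
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(1001, 0.5), (1002, 0.7)], ["driver_id", "conv_rate"])


def write_partition(batches: Iterator[pa.RecordBatch]) -> Iterator[pa.RecordBatch]:
    # Same shape as map_in_arrow: called once per partition, batches arrive as Arrow.
    for batch in batches:
        table = pa.Table.from_batches([batch])
        print(f"would write {table.num_rows} rows here")  # side-effecting store write
        yield batch  # pass batches through so the action below has rows to count


# .count() exists only to trigger the distributed side effect on every partition.
df.mapInArrow(write_partition, df.schema).count()

Passing an explicit mode argument means each mapInArrow pass now writes to exactly one store, instead of inferring the destination from feature_view.online / feature_view.offline inside the worker.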

sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py

Lines changed: 65 additions & 30 deletions

@@ -3,7 +3,7 @@
 import uuid
 import warnings
 from datetime import datetime, timezone
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 import numpy as np
 import pandas
@@ -54,6 +54,8 @@ class SparkOfflineStoreConfig(FeastConfigBaseModel):
     region: Optional[StrictStr] = None
     """ AWS Region if applicable for s3-based staging locations"""

+    mode: Optional[Literal["driver", "worker"]] = "driver"
+

 class SparkOfflineStore(OfflineStore):
     @staticmethod
@@ -218,6 +220,22 @@ def offline_write_batch(
         table: pyarrow.Table,
         progress: Optional[Callable[[int], Any]],
     ):
+        """
+        Write pyarrow table to offline store.
+        This method supports two execution modes:
+        - "driver": Uses Spark to perform schema validation, type casting, and appending the data to the offline store.
+          This mode must run on the Spark driver and supports advanced functionality like schema enforcement.
+        - "worker": A simplified, worker-safe implementation that writes Arrow tables directly to storage.
+          This mode is designed for distributed execution within mapInArrow or other parallel contexts.
+
+        Args:
+            config: RepoConfig
+            feature_view: FeatureView
+            table: pyarrow.Table
+            progress: Callable[[int], Any]
+            mode: Literal["driver", "worker"], default is "driver"
+
+        """
         assert isinstance(config.offline_store, SparkOfflineStoreConfig)
         assert isinstance(feature_view.batch_source, SparkSource)

@@ -230,38 +248,55 @@ def offline_write_batch(
                 f"The schema is expected to be {pa_schema} with the columns (in this exact order) to be {column_names}."
             )

-        spark_session = get_spark_session_or_start_new_with_repoconfig(
-            store_config=config.offline_store
-        )
+        mode = config.offline_store.mode

-        if feature_view.batch_source.path:
-            # write data to disk so that it can be loaded into spark (for preserving column types)
-            with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp_file:
-                print(tmp_file.name)
-                pq.write_table(table, tmp_file.name)
-
-                # load data
-                df_batch = spark_session.read.parquet(tmp_file.name)
-
-                # load existing data to get spark table schema
-                df_existing = spark_session.read.format(
-                    feature_view.batch_source.file_format
-                ).load(feature_view.batch_source.path)
-
-                # cast columns if applicable
-                df_batch = _cast_data_frame(df_batch, df_existing)
-
-                df_batch.write.format(feature_view.batch_source.file_format).mode(
-                    "append"
-                ).save(feature_view.batch_source.path)
-        elif feature_view.batch_source.query:
-            raise NotImplementedError(
-                "offline_write_batch not implemented for batch sources specified by query"
+        if mode == "driver":
+            spark_session = get_spark_session_or_start_new_with_repoconfig(
+                store_config=config.offline_store
             )
+
+            if feature_view.batch_source.path:
+                # write data to disk so that it can be loaded into spark (for preserving column types)
+                with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp_file:
+                    print(tmp_file.name)
+                    pq.write_table(table, tmp_file.name)
+
+                    # load data
+                    df_batch = spark_session.read.parquet(tmp_file.name)
+
+                    # load existing data to get spark table schema
+                    df_existing = spark_session.read.format(
+                        feature_view.batch_source.file_format
+                    ).load(feature_view.batch_source.path)
+
+                    # cast columns if applicable
+                    df_batch = _cast_data_frame(df_batch, df_existing)
+
+                    df_batch.write.format(feature_view.batch_source.file_format).mode(
+                        "append"
+                    ).save(feature_view.batch_source.path)
+            elif feature_view.batch_source.query:
+                raise NotImplementedError(
+                    "offline_write_batch not implemented for batch sources specified by query"
+                )
+            else:
+                raise NotImplementedError(
+                    "offline_write_batch not implemented for batch sources specified by a table"
+                )
+        elif mode == "worker":
+            # Safe worker-side Arrow write
+            if not feature_view.batch_source.path:
+                raise ValueError("Path is required for worker mode.")
+
+            unique_name = f"batch_{uuid.uuid4().hex}.parquet"
+            output_path = os.path.join(feature_view.batch_source.path, unique_name)
+
+            pq.write_table(table, output_path)
+
+            if progress:
+                progress(table.num_rows)
         else:
-            raise NotImplementedError(
-                "offline_write_batch not implemented for batch sources specified by a table"
-            )
+            raise ValueError(f"Unsupported mode: {mode}")

     @staticmethod
     def pull_all_from_table_or_query(
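
The new mode setting lives on SparkOfflineStoreConfig rather than being passed per call, so offline_write_batch keeps its signature and reads the mode from config.offline_store. A hedged sketch of selecting the worker path programmatically (the field name and default come from the diff above; whether a real repo config needs additional fields depends on the deployment):

from feast.infra.offline_stores.contrib.spark_offline_store.spark import (
    SparkOfflineStoreConfig,
)

# "driver" (default): load existing data through a Spark session, cast the incoming
# batch to match its schema, and append through Spark.
# "worker": write each Arrow table as a uniquely named parquet file directly under
# batch_source.path; safe to call from executors inside mapInArrow.
config = SparkOfflineStoreConfig(mode="worker")
print(config.mode)  # -> worker

In a feature_store.yaml this would presumably surface as a mode: worker entry under offline_store, with mode: driver remaining the default behavior.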

sdk/python/tests/integration/compute_engines/spark/test_compute.py

Lines changed: 1 addition & 5 deletions

@@ -271,7 +271,6 @@ def tqdm_builder(length):
             fs=fs,
             feature="driver_hourly_stats:conv_rate",
             entity_df=entity_df,
-            expected_value=1.6,
         )
     finally:
         spark_environment.teardown()
@@ -303,15 +302,12 @@ def _check_offline_features(
     fs,
     feature,
     entity_df,
-    expected_value,
 ):
     offline_df = fs.get_historical_features(
         entity_df=entity_df,
         features=[feature],
    ).to_df()
-
-    assert len(offline_df) == 2
-    assert offline_df["driver_id"].to_list() == [1001, 1002]
+    assert len(offline_df) == 4


 if __name__ == "__main__":
