1 change: 1 addition & 0 deletions sdk/python/feast/cli/cli.py
@@ -379,6 +379,7 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List
             "ikv",
             "couchbase",
             "milvus",
+            "ray",
         ],
         case_sensitive=False,
     ),
12 changes: 7 additions & 5 deletions sdk/python/feast/infra/compute_engines/ray/compute.py
@@ -24,6 +24,7 @@
     RayDAGRetrievalJob,
     RayMaterializationJob,
 )
+from feast.infra.compute_engines.ray.utils import write_to_online_store
 from feast.infra.offline_stores.offline_store import RetrievalJob
 from feast.infra.registry.base_registry import BaseRegistry

@@ -203,11 +204,12 @@ def _materialize_from_offline_store(
        arrow_table = retrieval_job.to_arrow()

        # Write to online store if enabled
-        if getattr(feature_view, "online", False):
-            # TODO: Implement proper online store writing with correct data format conversion
-            logger.debug(
-                "Online store writing not implemented yet for Ray compute engine"
-            )
+        write_to_online_store(
+            arrow_table=arrow_table,
+            feature_view=feature_view,
+            online_store=self.online_store,
+            repo_config=self.repo_config,
+        )

        # Write to offline store if enabled (this handles sink_source automatically for derived views)
        if getattr(feature_view, "offline", False):
102 changes: 43 additions & 59 deletions sdk/python/feast/infra/compute_engines/ray/nodes.py
@@ -18,6 +18,10 @@
 from feast.infra.compute_engines.dag.node import DAGNode
 from feast.infra.compute_engines.dag.value import DAGValue
 from feast.infra.compute_engines.ray.config import RayComputeEngineConfig
+from feast.infra.compute_engines.ray.utils import (
+    safe_batch_processor,
+    write_to_online_store,
+)
 from feast.infra.compute_engines.utils import create_offline_store_retrieval_job
 from feast.infra.ray_shared_utils import (
     apply_field_mapping,
@@ -149,9 +153,8 @@ def execute(self, context: ExecutionContext) -> DAGValue:
        feature_df = feature_dataset.to_pandas()
        feature_ref = ray.put(feature_df)

+        @safe_batch_processor
        def join_with_aggregated_features(batch: pd.DataFrame) -> pd.DataFrame:
-            if batch.empty:
-                return batch
            features = ray.get(feature_ref)
            if join_keys:
                result = pd.merge(
@@ -226,10 +229,9 @@ def execute(self, context: ExecutionContext) -> DAGValue:
        input_value.assert_format(DAGFormat.RAY)
        dataset: Dataset = input_value.data

+        @safe_batch_processor
        def apply_filters(batch: pd.DataFrame) -> pd.DataFrame:
            """Apply TTL and custom filters to the batch."""
-            if batch.empty:
-                return batch

            filtered_batch = batch.copy()

@@ -447,11 +449,9 @@ def execute(self, context: ExecutionContext) -> DAGValue:
        input_value.assert_format(DAGFormat.RAY)
        dataset: Dataset = input_value.data

+        @safe_batch_processor
        def deduplicate_batch(batch: pd.DataFrame) -> pd.DataFrame:
            """Remove duplicates from the batch."""
-            if batch.empty:
-                return batch
-
            # Get deduplication keys
            join_keys = self.column_info.join_keys
            timestamp_col = self.column_info.timestamp_column
@@ -518,27 +518,21 @@ def execute(self, context: ExecutionContext) -> DAGValue:
        elif callable(self.transformation):
            transformation_serialized = dill.dumps(self.transformation)

+        @safe_batch_processor
        def apply_transformation_with_serialized_udf(
            batch: pd.DataFrame,
        ) -> pd.DataFrame:
            """Apply the transformation using pre-serialized UDF."""
-            if batch.empty:
-                return batch
-
-            try:
-                if transformation_serialized:
-                    transformation_func = dill.loads(transformation_serialized)
-                    transformed_batch = transformation_func(batch)
-                else:
-                    logger.warning(
-                        "No serialized transformation available, returning original batch"
-                    )
-                    transformed_batch = batch
-
-                return transformed_batch
-            except Exception as e:
-                logger.error(f"Transformation failed: {e}")
-                return batch
+            if transformation_serialized:
+                transformation_func = dill.loads(transformation_serialized)
+                transformed_batch = transformation_func(batch)
+            else:
+                logger.warning(
+                    "No serialized transformation available, returning original batch"
+                )
+                transformed_batch = batch
+
+            return transformed_batch

        transformed_dataset = dataset.map_batches(
            apply_transformation_with_serialized_udf, batch_format="pandas"
@@ -645,46 +639,36 @@ def execute(self, context: ExecutionContext) -> DAGValue:
            feature_view=self.feature_view, repo_config=context.repo_config
        )

+        @safe_batch_processor
        def write_batch_with_serialized_artifacts(batch: pd.DataFrame) -> pd.DataFrame:
            """Write each batch using pre-serialized artifacts."""
-            if batch.empty:
-                return batch
-
-            try:
-                (
-                    feature_view,
-                    online_store,
-                    offline_store,
-                    repo_config,
-                ) = serialized_artifacts.unserialize()
-
-                arrow_table = pa.Table.from_pandas(batch)
-
-                # Write to online store if enabled
-                if getattr(feature_view, "online", False):
-                    # TODO: Implement proper online store writing with correct data format conversion
-                    logger.debug(
-                        "Online store writing not implemented yet for Ray compute engine"
-                    )
-
-                # Write to offline store if enabled
-                if getattr(feature_view, "offline", False):
-                    try:
-                        offline_store.offline_write_batch(
-                            config=repo_config,
-                            feature_view=feature_view,
-                            table=arrow_table,
-                            progress=lambda x: None,
-                        )
-                    except Exception as e:
-                        logger.error(f"Failed to write to offline store: {e}")
-                        raise
-
-                return batch
-            except Exception as e:
-                logger.error(f"Write operation failed: {e}")
-                raise
+            (
+                feature_view,
+                online_store,
+                offline_store,
+                repo_config,
+            ) = serialized_artifacts.unserialize()
+
+            arrow_table = pa.Table.from_pandas(batch)
+
+            # Write to online store if enabled
+            write_to_online_store(
+                arrow_table=arrow_table,
+                feature_view=feature_view,
+                online_store=online_store,
+                repo_config=repo_config,
+            )
+
+            # Write to offline store if enabled
+            if getattr(feature_view, "offline", False):
+                offline_store.offline_write_batch(
+                    config=repo_config,
+                    feature_view=feature_view,
+                    table=arrow_table,
+                    progress=lambda x: None,
+                )
+
+            return batch

        written_dataset = dataset.map_batches(
            write_batch_with_serialized_artifacts, batch_format="pandas"
93 changes: 93 additions & 0 deletions sdk/python/feast/infra/compute_engines/ray/utils.py
@@ -0,0 +1,93 @@
"""
Utility functions for Ray compute engine.
"""

import logging
from typing import Callable, Dict, Union

import pandas as pd
import pyarrow as pa

from feast.batch_feature_view import BatchFeatureView
from feast.feature_view import FeatureView
from feast.infra.online_stores.online_store import OnlineStore
from feast.repo_config import RepoConfig
from feast.stream_feature_view import StreamFeatureView
from feast.utils import _convert_arrow_to_proto
from feast.value_type import ValueType

logger = logging.getLogger(__name__)


def write_to_online_store(
arrow_table: pa.Table,
feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView],
online_store: OnlineStore,
repo_config: RepoConfig,
) -> None:
"""
Writes Arrow table data to the online store.

Args:
arrow_table: Arrow table containing the data to write
feature_view: Feature view being materialized
online_store: Online store instance
repo_config: Repository configuration
"""
if not getattr(feature_view, "online", False):
return

try:
join_key_to_value_type: Dict[str, ValueType] = {}
if hasattr(feature_view, "entity_columns") and feature_view.entity_columns:
join_key_to_value_type = {
entity.name: entity.dtype.to_value_type()
for entity in feature_view.entity_columns
}

rows_to_write = _convert_arrow_to_proto(
arrow_table, feature_view, join_key_to_value_type
)

if rows_to_write:
online_store.online_write_batch(
config=repo_config,
table=feature_view,
data=rows_to_write,
progress=lambda x: None,
)
logger.debug(
f"Successfully wrote {len(rows_to_write)} rows to online store for {feature_view.name}"
)
else:
logger.warning(f"No rows to write for {feature_view.name}")

except Exception as e:
logger.error(f"Failed to write to online store for {feature_view.name}: {e}")


def safe_batch_processor(
func: Callable[[pd.DataFrame], pd.DataFrame],
) -> Callable[[pd.DataFrame], pd.DataFrame]:
"""
Decorator for batch processing functions that handles empty batches and errors gracefully.

Args:
func: Function that processes a pandas DataFrame batch

Returns:
Wrapped function that handles empty batches and exceptions
"""

def wrapper(batch: pd.DataFrame) -> pd.DataFrame:
# Handle empty batches
if batch.empty:
return batch

try:
return func(batch)
except Exception as e:
logger.error(f"Batch processing failed in {func.__name__}: {e}")
return batch

return wrapper
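
Taken together, these helpers are meant to slot into Ray `Dataset.map_batches` pipelines, exactly as the `nodes.py` changes above use them. A minimal standalone sketch of `safe_batch_processor` in such a pipeline — the toy dataset and column name are illustrative only, not part of this change:

```python
import pandas as pd
import ray

from feast.infra.compute_engines.ray.utils import safe_batch_processor


@safe_batch_processor
def double_value(batch: pd.DataFrame) -> pd.DataFrame:
    # Empty batches are passed through untouched; if this body raises,
    # the decorator logs the error and returns the original batch.
    batch["value"] = batch["value"] * 2
    return batch


ds = ray.data.from_pandas(pd.DataFrame({"value": [1, 2, 3]}))
print(ds.map_batches(double_value, batch_format="pandas").to_pandas())
```

Because the wrapper swallows exceptions and returns the input batch, failed batches flow through the pipeline unmodified rather than aborting the job — the same trade-off the TransformationNode and WriteNode now rely on.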
41 changes: 41 additions & 0 deletions sdk/python/feast/templates/ray/README.md
@@ -0,0 +1,41 @@
# Feast Ray Template

This template demonstrates Feast's Ray integration, showcasing both the **Ray Offline Store** and **Ray Compute Engine** capabilities for distributed feature processing.

## What's Included

```
ray_template/
β”œβ”€β”€ feature_repo/
β”‚   β”œβ”€β”€ feature_store.yaml        # Ray offline store + compute engine config
β”‚   β”œβ”€β”€ example_repo.py           # Feature definitions with Ray optimizations
β”‚   β”œβ”€β”€ test_workflow.py          # Demo script showing Ray capabilities
β”‚   └── data/                     # Sample datasets (generated by bootstrap)
β”‚       β”œβ”€β”€ driver_stats.parquet
β”‚       └── customer_daily_profile.parquet
└── README.md                     # This file
```
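
The `feature_store.yaml` is what wires the two Ray pieces together. A minimal sketch of such a configuration — illustrative only; the exact `type` strings and the `batch_engine` block follow Feast's usual config layout and are assumptions, not copied from this PR:

```yaml
project: my_ray_project
registry: data/registry.db
provider: local
offline_store:
  type: ray        # Ray offline store (assumed type string)
batch_engine:
  type: ray        # Ray compute engine (assumed type string)
online_store:
  type: sqlite
  path: data/online_store.db
```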


## Getting Started

1. **Initialize the template**:
   ```bash
   feast init -t ray my_ray_project
   cd my_ray_project/feature_repo
   ```

2. **Install Ray dependencies**:
   ```bash
   pip install feast[ray]
   ```

3. **Apply feature definitions**:
   ```bash
   feast apply
   ```

4. **Run the demo**:
   ```bash
   python test_workflow.py
   ```