
Commit d6c0b2d

Authored by Jonas Bauer (jbauer12) and co-author
feat: Adjust ray offline store to support abfs(s) ADLS Azure Storage (#5911)
* feat: adjust ray offline store to support abfs(s) ADLS Azure Storage
* Delete test_registry file.
* Incorporate feedback from PR review.
* Add an additional comment about the env var.
* Fix a potential issue from PR review.
* Fix an issue found by Devin in PR review.
* Apply formatting.
* Fix a second issue found by Devin in PR review.

Signed-off-by: Jonas Bauer <jbauer@easy2parts.com>
Co-authored-by: Jonas Bauer <jbauer@easy2parts.com>
1 parent: 759d8c6 · commit: d6c0b2d

File tree (3 files changed: +30 −9 lines)

  • docs/reference/offline-stores
  • sdk/python/feast/infra
    • compute_engines/ray
    • offline_stores/contrib/ray_offline_store

docs/reference/offline-stores/ray.md

Lines changed: 6 additions & 1 deletion
@@ -27,7 +27,7 @@ The template provides a complete working example with sample datasets and demons
 The Ray offline store provides:
 - Ray-based data reading from file sources (Parquet, CSV, etc.)
 - Support for local, remote, and KubeRay (Kubernetes-managed) clusters
-- Integration with various storage backends (local files, S3, GCS, HDFS)
+- Integration with various storage backends (local files, S3, GCS, HDFS, Azure Blob)
 - Efficient data filtering and column selection
 - Timestamp-based data processing with timezone awareness
 - Enterprise-ready KubeRay cluster support via CodeFlare SDK

@@ -463,6 +463,11 @@ job.persist(gcs_storage, allow_overwrite=True)
 # HDFS
 hdfs_storage = SavedDatasetFileStorage(path="hdfs://namenode:8020/datasets/driver_features.parquet")
 job.persist(hdfs_storage, allow_overwrite=True)
+
+# Azure Blob Storage / Azure Data Lake Storage Gen2
+# By setting AZURE_STORAGE_ANON=False it uses DefaultAzureCredential
+az_storage = SavedDatasetFileStorage(path="abfss://container@stc_account.dfs.core.windows.net/datasets/driver_features.parquet")
+job.persist(az_storage, allow_overwrite=True)
 ```

 ### Using Ray Cluster
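For readers trying the new backend, here is a minimal end-to-end sketch of the pattern the docs snippet assumes. The retrieval job `job` and the container/account names are illustrative placeholders, not part of this commit; only `SavedDatasetFileStorage` and the `AZURE_STORAGE_ANON` behavior come from the diff above.

```python
import os

from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# Per the docs change above: with AZURE_STORAGE_ANON set to "False", the
# underlying Azure filesystem falls back to DefaultAzureCredential
# (environment variables, managed identity, Azure CLI login, ...) instead
# of anonymous access. Setting it in-process is one option:
os.environ["AZURE_STORAGE_ANON"] = "False"

# `job` is a hypothetical RetrievalJob obtained earlier, e.g. from
# store.get_historical_features(...), as in the surrounding docs examples.
az_storage = SavedDatasetFileStorage(
    path="abfss://container@stc_account.dfs.core.windows.net/datasets/driver_features.parquet"
)
job.persist(az_storage, allow_overwrite=True)
```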

sdk/python/feast/infra/compute_engines/ray/job.py

Lines changed: 4 additions & 1 deletion
@@ -18,6 +18,9 @@
 from feast.infra.compute_engines.dag.model import DAGFormat
 from feast.infra.compute_engines.dag.plan import ExecutionPlan
 from feast.infra.compute_engines.dag.value import DAGValue
+from feast.infra.offline_stores.contrib.ray_offline_store.ray import (
+    REMOTE_STORAGE_SCHEMES,
+)
 from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
 from feast.infra.offline_stores.offline_store import RetrievalJob, RetrievalMetadata
 from feast.infra.ray_initializer import get_ray_wrapper

@@ -205,7 +208,7 @@ def persist(
         destination_path = storage.file_options.uri

         # Check if destination already exists
-        if not destination_path.startswith(("s3://", "gs://", "hdfs://")):
+        if not destination_path.startswith(REMOTE_STORAGE_SCHEMES):
             import os

             if not allow_overwrite and os.path.exists(destination_path):
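The substance of this change is that `str.startswith` accepts a tuple of prefixes, so swapping the inline tuple for the shared `REMOTE_STORAGE_SCHEMES` constant makes Azure paths bypass the local-filesystem overwrite check. A quick self-contained sketch (paths illustrative):

```python
# Mirrors the constant introduced in ray.py (see diff below).
REMOTE_STORAGE_SCHEMES = ("s3://", "gs://", "hdfs://", "abfs://", "abfss://")

def is_remote(destination_path: str) -> bool:
    # str.startswith accepts a tuple, so one call covers every scheme.
    return destination_path.startswith(REMOTE_STORAGE_SCHEMES)

assert is_remote("abfss://container@account.dfs.core.windows.net/x.parquet")
assert is_remote("s3://bucket/x.parquet")
assert not is_remote("/tmp/datasets/x.parquet")
# Before this commit the abfss:// path was treated as local, so persist()
# would incorrectly run os.path.exists() / os.makedirs() against it.
```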

sdk/python/feast/infra/offline_stores/contrib/ray_offline_store/ray.py

Lines changed: 20 additions & 7 deletions
@@ -62,6 +62,12 @@
 from feast.utils import _get_column_names, make_df_tzaware, make_tzaware

 logger = logging.getLogger(__name__)
+# Remote storage URI schemes supported by the Ray offline store
+# S3: Amazon S3
+# GCS: Google Cloud Storage
+# HDFS: Hadoop Distributed File System
+# Azure: Azure Storage Gen2
+REMOTE_STORAGE_SCHEMES = ("s3://", "gs://", "hdfs://", "abfs://", "abfss://")


 def _get_data_schema_info(

@@ -1160,13 +1166,13 @@ def persist(
                 f"Ray offline store only supports SavedDatasetFileStorage, got {type(storage)}"
             )
         destination_path = storage.file_options.uri
-        if not destination_path.startswith(("s3://", "gs://", "hdfs://")):
+        if not destination_path.startswith(REMOTE_STORAGE_SCHEMES):
             if not allow_overwrite and os.path.exists(destination_path):
                 raise SavedDatasetLocationAlreadyExists(location=destination_path)
         try:
             ray_ds = self._get_ray_dataset()

-            if not destination_path.startswith(("s3://", "gs://", "hdfs://")):
+            if not destination_path.startswith(REMOTE_STORAGE_SCHEMES):
                 os.makedirs(os.path.dirname(destination_path), exist_ok=True)

             ray_ds.write_parquet(destination_path)

@@ -1956,12 +1962,19 @@ def normalize_timestamps(batch: pd.DataFrame) -> pd.DataFrame:
             filesystem, resolved_path = FileSource.create_filesystem_and_path(
                 absolute_path, destination.s3_endpoint_override
             )
-            path_obj = Path(resolved_path)
-            if path_obj.suffix == ".parquet":
-                path_obj = path_obj.with_suffix("")
-            if not absolute_path.startswith(("s3://", "gs://")):
+            if absolute_path.startswith(REMOTE_STORAGE_SCHEMES):
+                write_path = (
+                    absolute_path[:-8]
+                    if absolute_path.endswith(".parquet")
+                    else absolute_path
+                )
+            else:
+                path_obj = Path(resolved_path)
+                if path_obj.suffix == ".parquet":
+                    path_obj = path_obj.with_suffix("")
                 path_obj.mkdir(parents=True, exist_ok=True)
-            ds.write_parquet(str(path_obj))
+                write_path = str(path_obj)
+            ds.write_parquet(write_path)
         except Exception as e:
             raise RuntimeError(f"Failed to write logged features: {e}")
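To make the third hunk easier to follow in isolation, here is a standalone restatement; the function name and sample paths are mine, while the branching is lifted from the diff. A trailing `.parquet` suffix is stripped to form a directory-style destination, and the slice length of 8 is exactly `len(".parquet")`:

```python
from pathlib import Path

REMOTE_STORAGE_SCHEMES = ("s3://", "gs://", "hdfs://", "abfs://", "abfss://")

def resolve_write_path(absolute_path: str, resolved_path: str) -> str:
    """Standalone restatement of the logged-feature write-path logic above."""
    if absolute_path.startswith(REMOTE_STORAGE_SCHEMES):
        # Remote URIs are manipulated as plain strings: running them through
        # pathlib would collapse the "//" in the scheme (e.g. "abfss://x"
        # becomes "abfss:/x"). The slice drops the 8-char ".parquet" suffix.
        if absolute_path.endswith(".parquet"):
            return absolute_path[:-8]
        return absolute_path
    # Local paths can safely use pathlib and be created eagerly.
    path_obj = Path(resolved_path)
    if path_obj.suffix == ".parquet":
        path_obj = path_obj.with_suffix("")
    path_obj.mkdir(parents=True, exist_ok=True)
    return str(path_obj)

print(resolve_write_path(
    "abfss://container@account.dfs.core.windows.net/logs/features.parquet",
    "container/logs/features.parquet",
))
# -> abfss://container@account.dfs.core.windows.net/logs/features
```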