4 changes: 4 additions & 0 deletions protos/feast/core/DataSource.proto
@@ -111,6 +111,10 @@ message DataSource {

// override AWS S3 storage endpoint with custom S3 endpoint
string s3_endpoint_override = 3;

// S3 URL addressing style; S3-compatible stores such as MinIO or LocalStack typically require "path".
// Valid values: "path" or "vhost" (virtual-hosted style, the default).
string s3_url_style = 4;
}

// Defines options for DataSource that sources features from a BigQuery Query
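Note: a minimal sketch of how the new field is populated from Python, mirroring the FileOptions.to_proto change further down in this PR; the bucket and endpoint values are illustrative assumptions, not part of the change:

from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto

# Hypothetical values; only s3_url_style (field 4) is new in this PR.
file_options = DataSourceProto.FileOptions(
    uri="s3://demo-bucket/driver_stats",
    s3_endpoint_override="http://localhost:9000",  # e.g. a local MinIO endpoint
    s3_url_style="path",  # path-style addressing for S3-compatible stores
)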
4 changes: 4 additions & 0 deletions protos/feast/core/FeatureService.proto
@@ -70,6 +70,10 @@ message LoggingConfig {

// column names to use for partitioning
repeated string partition_by = 3;

// S3 URL addressing style; S3-compatible stores such as MinIO or LocalStack typically require "path".
// Valid values: "path" or "vhost" (virtual-hosted style, the default).
string s3_url_style = 4;
}

message BigQueryDestination {
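A sketch of how the same field surfaces through feature logging, assuming the FileLoggingDestination changes later in this PR; the path, endpoint, and partition column are hypothetical:

from feast.infra.offline_stores.file_source import FileLoggingDestination

destination = FileLoggingDestination(
    path="s3://feature-logs/serving",  # assumed bucket
    s3_endpoint_override="http://localhost:9000",
    s3_url_style="path",  # serialized into field 4 above
    partition_by=["request_date"],  # assumed partition column
)
assert destination.to_proto().file_destination.s3_url_style == "path"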
34 changes: 28 additions & 6 deletions sdk/python/feast/infra/offline_stores/duckdb.py
@@ -27,15 +27,29 @@
from feast.repo_config import FeastConfigBaseModel, RepoConfig


def _build_s3_storage_options(
endpoint_override: Optional[str] = None,
s3_url_style: Optional[str] = None,
) -> dict:
"""Build storage_options dict for S3 access, only including set values."""
storage_options = {}
if endpoint_override:
storage_options["AWS_ENDPOINT_URL"] = endpoint_override
if s3_url_style:
storage_options["AWS_S3_URL_STYLE"] = s3_url_style
return storage_options


def _read_data_source(data_source: DataSource, repo_path: str) -> Table:
assert isinstance(data_source, FileSource)

if isinstance(data_source.file_format, ParquetFormat):
return ibis.read_parquet(data_source.path)
elif isinstance(data_source.file_format, DeltaFormat):
storage_options = {
"AWS_ENDPOINT_URL": data_source.s3_endpoint_override,
}
storage_options = _build_s3_storage_options(
endpoint_override=data_source.s3_endpoint_override,
s3_url_style=data_source.s3_url_style,
)

return ibis.read_delta(data_source.path, storage_options=storage_options)

@@ -86,9 +100,10 @@ def _write_data_source(
new_table = pyarrow.concat_tables([table, prev_table])
ibis.memtable(new_table).to_parquet(file_options.uri)
elif isinstance(data_source.file_format, DeltaFormat):
storage_options = {
"AWS_ENDPOINT_URL": str(data_source.s3_endpoint_override),
}
storage_options = _build_s3_storage_options(
endpoint_override=data_source.s3_endpoint_override,
s3_url_style=data_source.s3_url_style,
)

if mode == "append":
from deltalake import DeltaTable
@@ -120,6 +135,10 @@ class DuckDBOfflineStoreConfig(FeastConfigBaseModel):

staging_location_endpoint_override: Optional[str] = None

# S3 URL addressing style; S3-compatible stores such as MinIO or LocalStack typically require "path".
# Valid values: "path" or "vhost" (virtual-hosted style, the default).
staging_location_s3_url_style: Optional[str] = None


class DuckDBOfflineStore(OfflineStore):
@staticmethod
@@ -146,6 +165,7 @@ def pull_latest_from_table_or_query(
data_source_writer=_write_data_source,
staging_location=config.offline_store.staging_location,
staging_location_endpoint_override=config.offline_store.staging_location_endpoint_override,
staging_location_s3_url_style=config.offline_store.staging_location_s3_url_style,
)

@staticmethod
@@ -170,6 +190,7 @@ def get_historical_features(
data_source_writer=_write_data_source,
staging_location=config.offline_store.staging_location,
staging_location_endpoint_override=config.offline_store.staging_location_endpoint_override,
staging_location_s3_url_style=config.offline_store.staging_location_s3_url_style,
)

@staticmethod
@@ -196,6 +217,7 @@ def pull_all_from_table_or_query(
data_source_writer=_write_data_source,
staging_location=config.offline_store.staging_location,
staging_location_endpoint_override=config.offline_store.staging_location_endpoint_override,
staging_location_s3_url_style=config.offline_store.staging_location_s3_url_style,
)

@staticmethod
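For clarity, what the new helper returns: unset options are omitted entirely, so deltalake falls back to the ambient AWS configuration when neither option is configured. A small demo (the endpoint value is illustrative):

opts = _build_s3_storage_options(
    endpoint_override="http://localhost:9000",  # e.g. a local MinIO endpoint
    s3_url_style="path",
)
assert opts == {
    "AWS_ENDPOINT_URL": "http://localhost:9000",
    "AWS_S3_URL_STYLE": "path",
}
assert _build_s3_storage_options() == {}  # nothing configured -> empty dict

In feature_store.yaml this should surface as a staging_location_s3_url_style key next to staging_location_endpoint_override, since both are plain Pydantic fields on the config.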
31 changes: 31 additions & 0 deletions sdk/python/feast/infra/offline_stores/file_source.py
@@ -42,6 +42,7 @@ def __init__(
created_timestamp_column: Optional[str] = "",
field_mapping: Optional[Dict[str, str]] = None,
s3_endpoint_override: Optional[str] = None,
s3_url_style: Optional[str] = None,
description: Optional[str] = "",
tags: Optional[Dict[str, str]] = None,
owner: Optional[str] = "",
@@ -61,6 +62,8 @@
field_mapping: A dictionary mapping of column names in this data source to feature names in a feature table
or view. Only used for feature columns, not entities or timestamp columns.
s3_endpoint_override (optional): Overrides the AWS S3 endpoint with a custom S3 storage endpoint
s3_url_style (optional): S3 URL addressing style, either "path" or "vhost" (virtual-hosted, the default).
S3-compatible storage such as MinIO or LocalStack typically requires "path".
description (optional): A human-readable description.
tags (optional): A dictionary of key-value pairs to store arbitrary metadata.
owner (optional): The owner of the file source, typically the email of the primary
@@ -76,6 +79,7 @@ def __init__(
file_format=file_format,
uri=path,
s3_endpoint_override=s3_endpoint_override,
s3_url_style=s3_url_style,
)

super().__init__(
@@ -102,6 +106,7 @@ def __eq__(self, other):
and self.file_options.file_format == other.file_options.file_format
and self.file_options.s3_endpoint_override
== other.file_options.s3_endpoint_override
and self.file_options.s3_url_style == other.file_options.s3_url_style
)

@property
@@ -119,6 +124,11 @@ def s3_endpoint_override(self) -> Optional[str]:
"""Returns the s3 endpoint override of this file data source."""
return self.file_options.s3_endpoint_override

@property
def s3_url_style(self) -> Optional[str]:
"""Returns the s3 URL style of this file data source."""
return self.file_options.s3_url_style

@staticmethod
def from_proto(data_source: DataSourceProto):
return FileSource(
@@ -129,6 +139,7 @@ def from_proto(data_source: DataSourceProto):
timestamp_field=data_source.timestamp_field,
created_timestamp_column=data_source.created_timestamp_column,
s3_endpoint_override=data_source.file_options.s3_endpoint_override,
s3_url_style=data_source.file_options.s3_url_style or None,
description=data_source.description,
tags=dict(data_source.tags),
owner=data_source.owner,
@@ -201,6 +212,8 @@ def get_table_column_names_and_types(
storage_options = {
"AWS_ENDPOINT_URL": str(self.s3_endpoint_override),
}
if self.s3_url_style:
storage_options["AWS_S3_URL_STYLE"] = self.s3_url_style

schema = (
DeltaTable(self.path, storage_options=storage_options)
@@ -235,18 +248,21 @@ class FileOptions:
Attributes:
uri: File source url, e.g. s3:// or local file.
s3_endpoint_override: Custom s3 endpoint (used only with s3 uri).
s3_url_style: S3 URL addressing style ("path" or "vhost").
file_format: File source format, e.g. parquet.
"""

uri: str
file_format: Optional[FileFormat]
s3_endpoint_override: str
s3_url_style: str

def __init__(
self,
uri: str,
file_format: Optional[FileFormat],
s3_endpoint_override: Optional[str],
s3_url_style: Optional[str] = None,
):
"""
Initializes a FileOptions object.
@@ -255,10 +271,12 @@ def __init__(
uri: File source url, e.g. s3:// or local file.
file_format (optional): File source format, e.g. parquet.
s3_endpoint_override (optional): Custom s3 endpoint (used only with s3 uri).
s3_url_style (optional): S3 URL addressing style ("path" or "vhost").
"""
self.uri = uri
self.file_format = file_format
self.s3_endpoint_override = s3_endpoint_override or ""
self.s3_url_style = s3_url_style or ""

@classmethod
def from_proto(cls, file_options_proto: DataSourceProto.FileOptions):
@@ -275,6 +293,7 @@ def from_proto(cls, file_options_proto: DataSourceProto.FileOptions):
file_format=FileFormat.from_proto(file_options_proto.file_format),
uri=file_options_proto.uri,
s3_endpoint_override=file_options_proto.s3_endpoint_override,
s3_url_style=file_options_proto.s3_url_style or None,
)
return file_options

@@ -291,6 +310,7 @@ def to_proto(self) -> DataSourceProto.FileOptions:
),
uri=self.uri,
s3_endpoint_override=self.s3_endpoint_override,
s3_url_style=self.s3_url_style,
)

return file_options_proto
@@ -306,10 +326,12 @@ def __init__(
path: str,
file_format: FileFormat = ParquetFormat(),
s3_endpoint_override: Optional[str] = None,
s3_url_style: Optional[str] = None,
):
self.file_options = FileOptions(
file_format=file_format,
s3_endpoint_override=s3_endpoint_override,
s3_url_style=s3_url_style,
uri=path,
)

@@ -320,6 +342,7 @@ def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage:
path=file_options.uri,
file_format=file_options.file_format,
s3_endpoint_override=file_options.s3_endpoint_override,
s3_url_style=file_options.s3_url_style or None,
)

def to_proto(self) -> SavedDatasetStorageProto:
@@ -330,6 +353,7 @@ def to_data_source(self) -> DataSource:
path=self.file_options.uri,
file_format=self.file_options.file_format,
s3_endpoint_override=self.file_options.s3_endpoint_override,
s3_url_style=self.file_options.s3_url_style or None,
)

@staticmethod
@@ -341,6 +365,7 @@ def from_data_source(data_source: DataSource) -> "SavedDatasetStorage":
if data_source.file_format
else ParquetFormat(),
s3_endpoint_override=data_source.s3_endpoint_override,
s3_url_style=data_source.s3_url_style,
)


@@ -349,24 +374,28 @@ class FileLoggingDestination(LoggingDestination):

path: str
s3_endpoint_override: str
s3_url_style: str
partition_by: Optional[List[str]]

def __init__(
self,
*,
path: str,
s3_endpoint_override="",
s3_url_style: str = "",
partition_by: Optional[List[str]] = None,
):
self.path = path
self.s3_endpoint_override = s3_endpoint_override
self.s3_url_style = s3_url_style
self.partition_by = partition_by

@classmethod
def from_proto(cls, config_proto: LoggingConfigProto) -> "LoggingDestination":
return FileLoggingDestination(
path=config_proto.file_destination.path,
s3_endpoint_override=config_proto.file_destination.s3_endpoint_override,
s3_url_style=config_proto.file_destination.s3_url_style or "",
partition_by=list(config_proto.file_destination.partition_by)
if config_proto.file_destination.partition_by
else None,
@@ -377,6 +406,7 @@ def to_proto(self) -> LoggingConfigProto:
file_destination=LoggingConfigProto.FileDestination(
path=self.path,
s3_endpoint_override=self.s3_endpoint_override,
s3_url_style=self.s3_url_style,
partition_by=self.partition_by,
)
)
@@ -386,4 +416,5 @@ def to_data_source(self) -> DataSource:
path=self.path,
file_format=ParquetFormat(),
s3_endpoint_override=self.s3_endpoint_override,
s3_url_style=self.s3_url_style or None,
)
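Putting the new FileSource parameter together, an end-to-end sketch; the source name, bucket, endpoint, and column names are assumptions for illustration:

from feast import FileSource
from feast.data_format import DeltaFormat

driver_stats = FileSource(
    name="driver_hourly_stats",
    path="s3://feast-demo/driver_stats",  # hypothetical Delta table on MinIO
    file_format=DeltaFormat(),
    timestamp_field="event_timestamp",
    s3_endpoint_override="http://localhost:9000",
    s3_url_style="path",
)

# The new field survives the proto round trip (from_proto maps "" back to None).
assert FileSource.from_proto(driver_stats.to_proto()).s3_url_style == "path"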
14 changes: 13 additions & 1 deletion sdk/python/feast/infra/offline_stores/ibis.py
@@ -50,6 +50,7 @@ def pull_latest_from_table_or_query_ibis(
data_source_writer: Callable[[pyarrow.Table, DataSource, str], None],
staging_location: Optional[str] = None,
staging_location_endpoint_override: Optional[str] = None,
staging_location_s3_url_style: Optional[str] = None,
) -> RetrievalJob:
fields = join_key_columns + feature_name_columns + [timestamp_field]
if created_timestamp_column:
@@ -87,6 +88,7 @@ def pull_latest_from_table_or_query_ibis(
data_source_writer=data_source_writer,
staging_location=staging_location,
staging_location_endpoint_override=staging_location_endpoint_override,
staging_location_s3_url_style=staging_location_s3_url_style,
repo_path=str(config.repo_path),
)

@@ -153,6 +155,7 @@ def get_historical_features_ibis(
full_feature_names: bool = False,
staging_location: Optional[str] = None,
staging_location_endpoint_override: Optional[str] = None,
staging_location_s3_url_style: Optional[str] = None,
event_expire_timestamp_fn=None,
) -> RetrievalJob:
entity_schema = _get_entity_schema(
@@ -250,6 +253,7 @@ def read_fv(
data_source_writer=data_source_writer,
staging_location=staging_location,
staging_location_endpoint_override=staging_location_endpoint_override,
staging_location_s3_url_style=staging_location_s3_url_style,
repo_path=str(config.repo_path),
)

@@ -267,6 +271,7 @@ def pull_all_from_table_or_query_ibis(
end_date: Optional[datetime] = None,
staging_location: Optional[str] = None,
staging_location_endpoint_override: Optional[str] = None,
staging_location_s3_url_style: Optional[str] = None,
) -> RetrievalJob:
timestamp_fields = [timestamp_field]
if created_timestamp_column:
@@ -304,6 +309,7 @@ def pull_all_from_table_or_query_ibis(
data_source_writer=data_source_writer,
staging_location=staging_location,
staging_location_endpoint_override=staging_location_endpoint_override,
staging_location_s3_url_style=staging_location_s3_url_style,
repo_path=str(config.repo_path),
)

@@ -487,6 +493,7 @@ def __init__(
staging_location,
staging_location_endpoint_override,
repo_path,
staging_location_s3_url_style=None,
) -> None:
super().__init__()
self.table = table
@@ -498,6 +505,7 @@ def __init__(
self.data_source_writer = data_source_writer
self.staging_location = staging_location
self.staging_location_endpoint_override = staging_location_endpoint_override
self.staging_location_s3_url_style = staging_location_s3_url_style
self.repo_path = repo_path

def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame:
@@ -538,7 +546,11 @@ def supports_remote_storage_export(self) -> bool:
def to_remote_storage(self) -> List[str]:
path = self.staging_location + f"/{str(uuid.uuid4())}"

storage_options = {"AWS_ENDPOINT_URL": self.staging_location_endpoint_override}
storage_options = {}
if self.staging_location_endpoint_override:
storage_options["AWS_ENDPOINT_URL"] = self.staging_location_endpoint_override
if self.staging_location_s3_url_style:
storage_options["AWS_S3_URL_STYLE"] = self.staging_location_s3_url_style

self.table.to_delta(path, storage_options=storage_options)

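A minimal sketch of why the new guard in to_remote_storage matters: the old unconditional dict handed deltalake AWS_ENDPOINT_URL=None whenever no override was configured, while the new build leaves unset keys out entirely (values are illustrative):

endpoint_override = None  # nothing configured
url_style = None

old_options = {"AWS_ENDPOINT_URL": endpoint_override}  # old behavior: bogus None entry
new_options = {}
if endpoint_override:
    new_options["AWS_ENDPOINT_URL"] = endpoint_override
if url_style:
    new_options["AWS_S3_URL_STYLE"] = url_style
assert old_options == {"AWS_ENDPOINT_URL": None} and new_options == {}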