Skip to content

Commit 8ae5ed5

Browse files
committed
add support for table format such as Iceberg, Delta, Hudi etc.
Signed-off-by: HaoXuAI <sduxuhao@gmail.com>
1 parent 3364bad commit 8ae5ed5

6 files changed

Lines changed: 358 additions & 29 deletions

File tree

protos/feast/core/DataSource.proto

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ message DataSource {
3636
reserved 6 to 10;
3737

3838
// Type of Data Source.
39-
// Next available id: 12
39+
// Next available id: 13
4040
enum SourceType {
4141
INVALID = 0;
4242
BATCH_FILE = 1;
@@ -231,6 +231,9 @@ message DataSource {
231231

232232
// Date Format of date partition column (e.g. %Y-%m-%d)
233233
string date_partition_column_format = 5;
234+
235+
// Table Format (e.g. iceberg, delta, etc)
236+
string table_format = 6;
234237
}
235238

236239
// Defines configuration for custom third-party data sources.

sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py

Lines changed: 61 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import traceback
34
import warnings
@@ -14,17 +15,17 @@
1415
)
1516
from feast.repo_config import RepoConfig
1617
from feast.saved_dataset import SavedDatasetStorage
18+
from feast.table_format import TableFormat, TableFormatType, table_format_from_dict
1719
from feast.type_map import spark_to_feast_value_type
1820
from feast.value_type import ValueType
1921

2022
logger = logging.getLogger(__name__)
2123

2224

23-
class SparkSourceFormat(Enum):
25+
class SparkFileSourceFormat(Enum):
2426
csv = "csv"
2527
json = "json"
2628
parquet = "parquet"
27-
delta = "delta"
2829
avro = "avro"
2930

3031

@@ -42,6 +43,7 @@ def __init__(
4243
query: Optional[str] = None,
4344
path: Optional[str] = None,
4445
file_format: Optional[str] = None,
46+
table_format: Optional[TableFormat] = None,
4547
created_timestamp_column: Optional[str] = None,
4648
field_mapping: Optional[Dict[str, str]] = None,
4749
description: Optional[str] = "",
@@ -58,7 +60,9 @@ def __init__(
5860
table: The name of a Spark table.
5961
query: The query to be executed in Spark.
6062
path: The path to file data.
61-
file_format: The format of the file data.
63+
file_format: The underlying file format (parquet, avro, csv, json).
64+
table_format: The table metadata format (iceberg, delta, hudi, etc.).
65+
Optional and separate from file_format.
6266
created_timestamp_column: Timestamp column indicating when the row
6367
was created, used for deduplicating rows.
6468
field_mapping: A dictionary mapping of column names in this data
@@ -70,7 +74,7 @@ def __init__(
7074
timestamp_field: Event timestamp field used for point-in-time joins of
7175
feature values.
7276
date_partition_column: The column to partition the data on for faster
73-
retrieval. This is useful for large tables and will limit the number ofi
77+
retrieval. This is useful for large tables and will limit the number of
7478
"""
7579
# If no name, use the table as the default name.
7680
if name is None and table is None:
@@ -102,6 +106,7 @@ def __init__(
102106
path=path,
103107
file_format=file_format,
104108
date_partition_column_format=date_partition_column_format,
109+
table_format=table_format,
105110
)
106111

107112
@property
@@ -132,6 +137,13 @@ def file_format(self):
132137
"""
133138
return self.spark_options.file_format
134139

140+
@property
def table_format(self):
    """
    The table format configured for this data source.

    Read straight from ``self.spark_options.table_format``; no conversion
    or validation is applied here.
    """
    options = self.spark_options
    return options.table_format
146+
135147
@property
136148
def date_partition_column_format(self):
137149
"""
@@ -219,7 +231,7 @@ def get_table_query_string(self) -> str:
219231
if spark_session is None:
220232
raise AssertionError("Could not find an active spark session.")
221233
try:
222-
df = spark_session.read.format(self.file_format).load(self.path)
234+
df = self._load_dataframe_from_path(spark_session)
223235
except Exception:
224236
logger.exception(
225237
"Spark read of file source failed.\n" + traceback.format_exc()
@@ -230,6 +242,25 @@ def get_table_query_string(self) -> str:
230242

231243
return f"`{tmp_table_name}`"
232244

245+
def _load_dataframe_from_path(self, spark_session):
    """Load a DataFrame from ``self.path`` using the table or file format.

    When ``self.table_format`` is ``None`` the reader is configured with
    ``self.file_format``. Otherwise the reader is configured with
    ``self.table_format.format_type.value`` and every entry of
    ``self.table_format.properties`` is applied as a reader option.

    Args:
        spark_session: Object exposing a ``read`` attribute whose
            ``format`` / ``option`` / ``load`` methods are used below.

    Returns:
        Whatever ``load(self.path)`` returns on the configured reader.
    """
    table_format = self.table_format

    if table_format is None:
        # No table format specified: plain file read driven by file_format.
        return spark_session.read.format(self.file_format).load(self.path)

    # The table format, not the file format, selects the reader.
    reader = spark_session.read.format(table_format.format_type.value)

    # Apply table-format-specific options one by one.
    for key, value in table_format.properties.items():
        reader = reader.option(key, value)

    # For catalog-based table formats like Iceberg, the path is actually a
    # table name; for file-based formats, it's still a file path.
    return reader.load(self.path)
263+
233264
def __eq__(self, other):
234265
base_eq = super().__eq__(other)
235266
if not base_eq:
@@ -245,7 +276,7 @@ def __hash__(self):
245276

246277

247278
class SparkOptions:
248-
allowed_formats = [format.value for format in SparkSourceFormat]
279+
allowed_formats = [format.value for format in SparkFileSourceFormat]
249280

250281
def __init__(
251282
self,
@@ -254,6 +285,7 @@ def __init__(
254285
path: Optional[str],
255286
file_format: Optional[str],
256287
date_partition_column_format: Optional[str] = "%Y-%m-%d",
288+
table_format: Optional[TableFormat] = None,
257289
):
258290
# Check that only one of the ways to load a spark dataframe can be used. We have
259291
# to treat empty string and null the same due to proto (de)serialization.
@@ -262,11 +294,14 @@ def __init__(
262294
"Exactly one of params(table, query, path) must be specified."
263295
)
264296
if path:
265-
if not file_format:
297+
# If table_format is specified, file_format is optional (table format determines the reader)
298+
# If no table_format, file_format is required for basic file reading
299+
if not table_format and not file_format:
266300
raise ValueError(
267-
"If 'path' is specified, then 'file_format' is required."
301+
"If 'path' is specified without 'table_format', then 'file_format' is required."
268302
)
269-
if file_format not in self.allowed_formats:
303+
# Only validate file_format if it's provided (it's optional with table_format)
304+
if file_format and file_format not in self.allowed_formats:
270305
raise ValueError(
271306
f"'file_format' should be one of {self.allowed_formats}"
272307
)
@@ -276,6 +311,7 @@ def __init__(
276311
self._path = path
277312
self._file_format = file_format
278313
self._date_partition_column_format = date_partition_column_format
314+
self._table_format = table_format
279315

280316
@property
281317
def table(self):
@@ -317,6 +353,14 @@ def date_partition_column_format(self):
317353
def date_partition_column_format(self, date_partition_column_format):
318354
self._date_partition_column_format = date_partition_column_format
319355

356+
@property
def table_format(self):
    """The table format stored on these options (``None`` when unset)."""
    current = self._table_format
    return current

@table_format.setter
def table_format(self, table_format):
    """Replace the stored table format with the given value, unvalidated."""
    new_value = table_format
    self._table_format = new_value
363+
320364
@classmethod
321365
def from_proto(cls, spark_options_proto: DataSourceProto.SparkOptions):
322366
"""
@@ -332,6 +376,9 @@ def from_proto(cls, spark_options_proto: DataSourceProto.SparkOptions):
332376
path=spark_options_proto.path,
333377
file_format=spark_options_proto.file_format,
334378
date_partition_column_format=spark_options_proto.date_partition_column_format,
379+
table_format=table_format_from_dict(
380+
json.loads(spark_options_proto.table_format)
381+
)
335382
)
336383

337384
return spark_options
@@ -348,6 +395,7 @@ def to_proto(self) -> DataSourceProto.SparkOptions:
348395
path=self.path,
349396
file_format=self.file_format,
350397
date_partition_column_format=self.date_partition_column_format,
398+
table_format=json.dumps(self.table_format.to_dict()) if self.table_format else "",
351399
)
352400

353401
return spark_options_proto
@@ -364,12 +412,14 @@ def __init__(
364412
query: Optional[str] = None,
365413
path: Optional[str] = None,
366414
file_format: Optional[str] = None,
415+
table_format: Optional[TableFormat] = None,
367416
):
368417
self.spark_options = SparkOptions(
369418
table=table,
370419
query=query,
371420
path=path,
372421
file_format=file_format,
422+
table_format=table_format,
373423
)
374424

375425
@staticmethod
@@ -380,6 +430,7 @@ def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage:
380430
query=spark_options.query,
381431
path=spark_options.path,
382432
file_format=spark_options.file_format,
433+
table_format=spark_options.table_format,
383434
)
384435

385436
def to_proto(self) -> SavedDatasetStorageProto:
@@ -391,4 +442,5 @@ def to_data_source(self) -> DataSource:
391442
query=self.spark_options.query,
392443
path=self.spark_options.path,
393444
file_format=self.spark_options.file_format,
445+
table_format=self.spark_options.table_format,
394446
)

0 commit comments

Comments
 (0)