1+ import json
12import logging
23import traceback
34import warnings
1415)
1516from feast .repo_config import RepoConfig
1617from feast .saved_dataset import SavedDatasetStorage
18+ from feast .table_format import TableFormat , TableFormatType , table_format_from_dict
1719from feast .type_map import spark_to_feast_value_type
1820from feast .value_type import ValueType
1921
2022logger = logging .getLogger (__name__ )
2123
2224
class SparkFileSourceFormat(Enum):
    """Underlying file formats accepted for a path-based Spark source.

    Each member's value is the string compared against ``file_format``.
    ``delta`` is deliberately not a member: it is a table metadata format
    and is expressed through ``table_format`` instead.
    """

    csv = "csv"
    json = "json"
    parquet = "parquet"
    avro = "avro"


# Backward-compatible alias: this enum used to be named ``SparkSourceFormat``.
# Keeping the old name importable avoids breaking existing callers; note the
# alias does not bring back the former ``delta`` member.
SparkSourceFormat = SparkFileSourceFormat
2930
3031
@@ -42,6 +43,7 @@ def __init__(
4243 query : Optional [str ] = None ,
4344 path : Optional [str ] = None ,
4445 file_format : Optional [str ] = None ,
46+ table_format : Optional [TableFormat ] = None ,
4547 created_timestamp_column : Optional [str ] = None ,
4648 field_mapping : Optional [Dict [str , str ]] = None ,
4749 description : Optional [str ] = "" ,
@@ -58,7 +60,9 @@ def __init__(
5860 table: The name of a Spark table.
5961 query: The query to be executed in Spark.
6062 path: The path to file data.
61- file_format: The format of the file data.
63+ file_format: The underlying file format (parquet, avro, csv, json).
64+ table_format: The table metadata format (iceberg, delta, hudi, etc.).
65+ Optional and separate from file_format.
6266 created_timestamp_column: Timestamp column indicating when the row
6367 was created, used for deduplicating rows.
6468 field_mapping: A dictionary mapping of column names in this data
@@ -70,7 +74,7 @@ def __init__(
7074 timestamp_field: Event timestamp field used for point-in-time joins of
7175 feature values.
7276 date_partition_column: The column to partition the data on for faster
73- retrieval. This is useful for large tables and will limit the number ofi
77+ retrieval. This is useful for large tables and will limit the number of
7478 """
7579 # If no name, use the table as the default name.
7680 if name is None and table is None :
@@ -102,6 +106,7 @@ def __init__(
102106 path = path ,
103107 file_format = file_format ,
104108 date_partition_column_format = date_partition_column_format ,
109+ table_format = table_format ,
105110 )
106111
107112 @property
@@ -132,6 +137,13 @@ def file_format(self):
132137 """
133138 return self .spark_options .file_format
134139
@property
def table_format(self) -> "Optional[TableFormat]":
    """Return the table format of this feature data source.

    The value is read straight from ``self.spark_options.table_format``;
    it is ``None`` when no table format was configured.
    """
    return self.spark_options.table_format
146+
135147 @property
136148 def date_partition_column_format (self ):
137149 """
@@ -219,7 +231,7 @@ def get_table_query_string(self) -> str:
219231 if spark_session is None :
220232 raise AssertionError ("Could not find an active spark session." )
221233 try :
222- df = spark_session . read . format ( self .file_format ). load ( self . path )
234+ df = self ._load_dataframe_from_path ( spark_session )
223235 except Exception :
224236 logger .exception (
225237 "Spark read of file source failed.\n " + traceback .format_exc ()
@@ -230,6 +242,25 @@ def get_table_query_string(self) -> str:
230242
231243 return f"`{ tmp_table_name } `"
232244
def _load_dataframe_from_path(self, spark_session):
    """Load the DataFrame located at ``self.path``.

    The reader format comes from ``self.table_format`` when one is set and
    from ``self.file_format`` otherwise.

    Args:
        spark_session: Session object exposing ``read.format(...)``; the
            caller passes the active Spark session.

    Returns:
        Whatever ``load(self.path)`` on the configured reader returns.
    """
    table_format = self.table_format
    if table_format is None:
        # No table format specified: standard file read using file_format.
        return spark_session.read.format(self.file_format).load(self.path)

    # The table format's enum value is used as the Spark reader format name.
    reader = spark_session.read.format(table_format.format_type.value)

    # Forward the table-format-specific properties as reader options.
    for key, value in table_format.properties.items():
        reader = reader.option(key, value)

    # For catalog-based table formats like Iceberg, the path is actually a
    # table name; for file-based formats, it is still a file path.
    return reader.load(self.path)
263+
233264 def __eq__ (self , other ):
234265 base_eq = super ().__eq__ (other )
235266 if not base_eq :
@@ -245,7 +276,7 @@ def __hash__(self):
245276
246277
247278class SparkOptions :
248- allowed_formats = [format .value for format in SparkSourceFormat ]
279+ allowed_formats = [format .value for format in SparkFileSourceFormat ]
249280
250281 def __init__ (
251282 self ,
@@ -254,6 +285,7 @@ def __init__(
254285 path : Optional [str ],
255286 file_format : Optional [str ],
256287 date_partition_column_format : Optional [str ] = "%Y-%m-%d" ,
288+ table_format : Optional [TableFormat ] = None ,
257289 ):
258290 # Check that only one of the ways to load a spark dataframe can be used. We have
259291 # to treat empty string and null the same due to proto (de)serialization.
@@ -262,11 +294,14 @@ def __init__(
262294 "Exactly one of params(table, query, path) must be specified."
263295 )
264296 if path :
265- if not file_format :
297+ # If table_format is specified, file_format is optional (table format determines the reader)
298+ # If no table_format, file_format is required for basic file reading
299+ if not table_format and not file_format :
266300 raise ValueError (
267- "If 'path' is specified, then 'file_format' is required."
301+ "If 'path' is specified without 'table_format' , then 'file_format' is required."
268302 )
269- if file_format not in self .allowed_formats :
303+ # Only validate file_format if it's provided (it's optional with table_format)
304+ if file_format and file_format not in self .allowed_formats :
270305 raise ValueError (
271306 f"'file_format' should be one of { self .allowed_formats } "
272307 )
@@ -276,6 +311,7 @@ def __init__(
276311 self ._path = path
277312 self ._file_format = file_format
278313 self ._date_partition_column_format = date_partition_column_format
314+ self ._table_format = table_format
279315
280316 @property
281317 def table (self ):
@@ -317,6 +353,14 @@ def date_partition_column_format(self):
317353 def date_partition_column_format (self , date_partition_column_format ):
318354 self ._date_partition_column_format = date_partition_column_format
319355
@property
def table_format(self) -> "Optional[TableFormat]":
    """Return the stored table format, or ``None`` if none was set."""
    return self._table_format

@table_format.setter
def table_format(self, table_format: "Optional[TableFormat]") -> None:
    """Replace the stored table format; the value is stored as given."""
    self._table_format = table_format
363+
320364 @classmethod
321365 def from_proto (cls , spark_options_proto : DataSourceProto .SparkOptions ):
322366 """
@@ -332,6 +376,9 @@ def from_proto(cls, spark_options_proto: DataSourceProto.SparkOptions):
332376 path = spark_options_proto .path ,
333377 file_format = spark_options_proto .file_format ,
334378 date_partition_column_format = spark_options_proto .date_partition_column_format ,
379+ table_format = table_format_from_dict (
380+ json .loads (spark_options_proto .table_format )
381+ )
335382 )
336383
337384 return spark_options
@@ -348,6 +395,7 @@ def to_proto(self) -> DataSourceProto.SparkOptions:
348395 path = self .path ,
349396 file_format = self .file_format ,
350397 date_partition_column_format = self .date_partition_column_format ,
398+ table_format = json .dumps (self .table_format .to_dict ()) if self .table_format else "" ,
351399 )
352400
353401 return spark_options_proto
@@ -364,12 +412,14 @@ def __init__(
364412 query : Optional [str ] = None ,
365413 path : Optional [str ] = None ,
366414 file_format : Optional [str ] = None ,
415+ table_format : Optional [TableFormat ] = None ,
367416 ):
368417 self .spark_options = SparkOptions (
369418 table = table ,
370419 query = query ,
371420 path = path ,
372421 file_format = file_format ,
422+ table_format = table_format ,
373423 )
374424
375425 @staticmethod
@@ -380,6 +430,7 @@ def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage:
380430 query = spark_options .query ,
381431 path = spark_options .path ,
382432 file_format = spark_options .file_format ,
433+ table_format = spark_options .table_format ,
383434 )
384435
385436 def to_proto (self ) -> SavedDatasetStorageProto :
@@ -391,4 +442,5 @@ def to_data_source(self) -> DataSource:
391442 query = self .spark_options .query ,
392443 path = self .spark_options .path ,
393444 file_format = self .spark_options .file_format ,
445+ table_format = self .spark_options .table_format ,
394446 )
0 commit comments