-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: Feast dataframe phase1 #5611
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
37923de
Add FeastDataFrame
HaoXuAI e965911
linting
HaoXuAI bc7f7f4
linting
HaoXuAI e786ef1
linting
HaoXuAI 778d19e
Merge branch 'master' into feat/feast-dataframe-phase1
HaoXuAI bc825a1
update init
HaoXuAI 4f972fc
linting
HaoXuAI 64ac8a9
fix testing
HaoXuAI File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| """FeastDataFrame: A lightweight container for DataFrame-like objects in Feast.""" | ||
|
|
||
| from enum import Enum | ||
| from typing import Any, Dict, Optional | ||
|
|
||
| import pandas as pd | ||
| import pyarrow as pa | ||
|
|
||
|
|
||
class DataFrameEngine(str, Enum):
    """Supported DataFrame engines.

    Members mix in ``str``, so each member compares equal to its plain
    string value (for example ``DataFrameEngine.PANDAS == "pandas"``).
    """

    PANDAS = "pandas"
    SPARK = "spark"
    DASK = "dask"
    RAY = "ray"
    ARROW = "arrow"
    POLARS = "polars"
    UNKNOWN = "unknown"  # Fallback when the wrapped object is not recognised.


# Optional engines are recognised by the top-level package that defines the
# wrapped object's type, so those packages never have to be imported here.
_OPTIONAL_ENGINE_PACKAGES = (
    ("pyspark", DataFrameEngine.SPARK),
    ("dask", DataFrameEngine.DASK),
    ("ray", DataFrameEngine.RAY),
    ("polars", DataFrameEngine.POLARS),
)


class FeastDataFrame:
    """
    A lightweight container for DataFrame-like objects in Feast.

    This class wraps any DataFrame implementation and provides metadata
    about the engine type for proper routing in Feast's processing pipeline.

    Attributes:
        data: The wrapped object, stored as given.
        metadata: Free-form dictionary supplied by the caller (empty dict
            when none was provided).
    """

    def __init__(
        self,
        data: Any,
        engine: Optional[DataFrameEngine] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize a FeastDataFrame.

        Args:
            data: The wrapped DataFrame object (pandas, Spark, Dask, etc.)
            engine: Explicitly specify the engine type (auto-detected if None).
                A plain string holding a member value (e.g. ``"pandas"``) is
                accepted and converted to the matching ``DataFrameEngine``.
            metadata: Additional metadata (schema hints, etc.)

        Raises:
            ValueError: If ``engine`` is not a valid ``DataFrameEngine`` value
                or does not match the engine detected from ``data``.
        """
        self.data = data
        # Compare against None so that a caller-supplied empty dict is kept
        # (and shared) exactly like a non-empty one.
        self.metadata = metadata if metadata is not None else {}

        # Detect the actual engine from the data
        detected_engine = self._detect_engine()

        if engine is None:
            self._engine = detected_engine
            return

        # Members compare equal to their string values, so a raw string could
        # slip through the comparison below and later break ``.value`` access.
        # Coerce up front; an unknown value raises ValueError here.
        engine = DataFrameEngine(engine)

        # Validate that the provided engine matches the detected engine
        if engine != detected_engine:
            raise ValueError(
                f"Provided engine '{engine.value}' does not match detected engine '{detected_engine.value}' "
                f"for data type {type(data).__name__}"
            )
        self._engine = engine

    def _detect_engine(self) -> DataFrameEngine:
        """Auto-detect the DataFrame engine based on the type of ``self.data``.

        Returns:
            The matching ``DataFrameEngine`` member, or
            ``DataFrameEngine.UNKNOWN`` when the type is not recognised.
        """
        if isinstance(self.data, pd.DataFrame):
            return DataFrameEngine.PANDAS
        if isinstance(self.data, pa.Table):
            return DataFrameEngine.ARROW

        # For optional dependencies, check module name to avoid import errors.
        # Only the top-level package is compared: a substring test would
        # misclassify unrelated modules (e.g. "array" contains "ray").
        module = type(self.data).__module__ or ""
        root = module.split(".", 1)[0]
        for package, engine in _OPTIONAL_ENGINE_PACKAGES:
            # Also accept sibling distributions such as "dask_expr".
            if root == package or root.startswith(package + "_"):
                return engine
        return DataFrameEngine.UNKNOWN

    @property
    def engine(self) -> DataFrameEngine:
        """Get the detected or specified engine type."""
        return self._engine

    def __repr__(self) -> str:
        """Return a short description with the engine value and wrapped type name."""
        return f"FeastDataFrame(engine={self.engine.value}, type={type(self.data).__name__})"

    @property
    def is_lazy(self) -> bool:
        """Check if the underlying DataFrame is lazy (Spark, Dask, Ray).

        The answer is derived from the engine alone; the wrapped object is
        not inspected.
        """
        return self.engine in (
            DataFrameEngine.SPARK,
            DataFrameEngine.DASK,
            DataFrameEngine.RAY,
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,139 @@ | ||
| """Unit tests for FeastDataFrame.""" | ||
|
|
||
| import pandas as pd | ||
| import pyarrow as pa | ||
| import pytest | ||
|
|
||
| from feast.dataframe import DataFrameEngine, FeastDataFrame | ||
|
|
||
|
|
||
class TestFeastDataFrame:
    """Unit tests for engine detection, validation and metadata on FeastDataFrame."""

    def test_pandas_detection(self):
        """A pandas DataFrame is auto-detected as the eager PANDAS engine."""
        wrapped = FeastDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}))

        assert wrapped.engine == DataFrameEngine.PANDAS
        assert not wrapped.is_lazy
        assert isinstance(wrapped.data, pd.DataFrame)

    def test_arrow_detection(self):
        """A pyarrow Table is auto-detected as the eager ARROW engine."""
        wrapped = FeastDataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

        assert wrapped.engine == DataFrameEngine.ARROW
        assert not wrapped.is_lazy
        assert isinstance(wrapped.data, pa.Table)

    def test_explicit_engine(self):
        """Passing UNKNOWN explicitly for unrecognised data is accepted."""
        payload = {"mock": "data"}
        wrapped = FeastDataFrame(payload, engine=DataFrameEngine.UNKNOWN)

        assert wrapped.engine == DataFrameEngine.UNKNOWN
        assert not wrapped.is_lazy

    def test_unknown_engine(self):
        """Unrecognised data types fall back to the UNKNOWN engine."""
        wrapped = FeastDataFrame({"some": "dict"})

        assert wrapped.engine == DataFrameEngine.UNKNOWN

    def test_metadata(self):
        """Metadata passed to the constructor is exposed unchanged."""
        expected = {"features": ["a"], "source": "test"}
        wrapped = FeastDataFrame(pd.DataFrame({"a": [1, 2, 3]}), metadata=expected)

        assert wrapped.metadata == expected
        assert wrapped.metadata["features"] == ["a"]

    def test_repr(self):
        """repr() mentions the class name, the engine value and the wrapped type."""
        text = repr(FeastDataFrame(pd.DataFrame({"a": [1, 2, 3]})))

        for fragment in ("FeastDataFrame", "engine=pandas", "DataFrame"):
            assert fragment in text

    def test_is_lazy_property(self):
        """is_lazy is False for eager/unknown data and True for Spark, Dask and Ray."""
        unknown_data = {"mock": "data"}

        # pandas, Arrow and unrecognised objects are all reported as not lazy.
        eager_inputs = (
            pd.DataFrame({"a": [1, 2, 3]}),
            pa.table({"a": [1, 2, 3]}),
            unknown_data,
        )
        for eager in eager_inputs:
            assert not FeastDataFrame(eager).is_lazy

        # The lazy engines cannot be built from real objects here, so the
        # private attribute is overridden after constructing with UNKNOWN
        # (which passes the constructor's type validation).
        lazy_engines = (
            DataFrameEngine.SPARK,
            DataFrameEngine.DASK,
            DataFrameEngine.RAY,
        )
        for lazy_engine in lazy_engines:
            wrapped = FeastDataFrame(unknown_data, engine=DataFrameEngine.UNKNOWN)
            wrapped._engine = lazy_engine
            assert wrapped.is_lazy

    def test_polars_detection(self):
        """An object whose type claims to live in polars is detected as POLARS."""

        # Stand-in for a polars DataFrame: only the module name matters.
        class MockPolarsDF:
            __module__ = "polars.dataframe.frame"

        wrapped = FeastDataFrame(MockPolarsDF())

        assert wrapped.engine == DataFrameEngine.POLARS
        assert not wrapped.is_lazy

    def test_engine_validation_valid(self):
        """An explicit engine that matches the data passes validation."""
        wrapped = FeastDataFrame(
            pd.DataFrame({"a": [1, 2, 3]}), engine=DataFrameEngine.PANDAS
        )

        assert wrapped.engine == DataFrameEngine.PANDAS
        assert isinstance(wrapped.data, pd.DataFrame)

    def test_engine_validation_invalid(self):
        """An explicit engine that contradicts the data raises ValueError."""
        frame = pd.DataFrame({"a": [1, 2, 3]})
        expected_message = (
            "Provided engine 'spark' does not match detected engine 'pandas'"
        )

        with pytest.raises(ValueError, match=expected_message):
            FeastDataFrame(frame, engine=DataFrameEngine.SPARK)

    def test_engine_validation_arrow(self):
        """Engine validation accepts ARROW and rejects PANDAS for an Arrow table."""
        table = pa.table({"a": [1, 2, 3]})

        # Matching engine is accepted.
        accepted = FeastDataFrame(table, engine=DataFrameEngine.ARROW)
        assert accepted.engine == DataFrameEngine.ARROW

        # Mismatching engine is rejected.
        expected_message = (
            "Provided engine 'pandas' does not match detected engine 'arrow'"
        )
        with pytest.raises(ValueError, match=expected_message):
            FeastDataFrame(table, engine=DataFrameEngine.PANDAS)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Somehow mypy linting failed; this fixed it.