-
Notifications
You must be signed in to change notification settings - Fork 1.3k
feat: Add non-entity retrieval support for ClickHouse offline store #6066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| from feast.infra.utils.clickhouse.clickhouse_config import ClickhouseConfig | ||
| from feast.infra.utils.clickhouse.connection_utils import get_client | ||
| from feast.saved_dataset import SavedDatasetStorage | ||
| from feast.utils import _utc_now, make_tzaware | ||
|
|
||
|
|
||
| class ClickhouseOfflineStoreConfig(ClickhouseConfig): | ||
|
|
@@ -43,15 +44,26 @@ def get_historical_features( | |
| config: RepoConfig, | ||
| feature_views: List[FeatureView], | ||
| feature_refs: List[str], | ||
| entity_df: Union[pd.DataFrame, str], | ||
| entity_df: Optional[Union[pd.DataFrame, str]], | ||
| registry: BaseRegistry, | ||
| project: str, | ||
| full_feature_names: bool = False, | ||
| **kwargs, | ||
| ) -> RetrievalJob: | ||
| assert isinstance(config.offline_store, ClickhouseOfflineStoreConfig) | ||
| for fv in feature_views: | ||
| assert isinstance(fv.batch_source, ClickhouseSource) | ||
|
|
||
| # Handle non-entity retrieval mode | ||
| if entity_df is None: | ||
| end_date = kwargs.get("end_date", None) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Only `end_date` is used from `**kwargs`. Consider extracting and forwarding `end_date` as an explicit keyword argument. |
||
| if end_date is None: | ||
| end_date = _utc_now() | ||
| else: | ||
| end_date = make_tzaware(end_date) | ||
|
|
||
| entity_df = pd.DataFrame({"event_timestamp": [end_date]}) | ||
|
Comment on lines
+57
to
+65
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🔴 Non-entity retrieval silently ignores When Prompt for agentsWas this helpful? React with 👍 or 👎 to provide feedback.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @YassinNouh21 This seems like a critical issue.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a bug — this is intentional. The PIT join uses MAX(entity_timestamp) as the upper bound, so the timestamp in the synthetic entity_df IS the query upper bound. Using [end_date] gives the window [end_date - TTL, end_date], which is correct. The Postgres implementation using pd.date_range(start=start_date, ...)[:1] actually has the bug — it takes start_date as the sole timestamp, making end_date unreachable. Our implementation matches Dask and is the correct behavior.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ntkathole sounds like there's a bug in postgres???
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes I noticed that
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good call. Using `pd.date_range(start=start_date, end=end_date, freq="1s", tz=timezone.utc)[:1]` takes `start_date` as the sole timestamp; this implementation correctly uses `end_date` instead. |
||
|
|
||
| entity_schema = _get_entity_schema(entity_df, config) | ||
devin-ai-integration[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| entity_df_event_timestamp_col = ( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import logging | ||
| import threading | ||
| from datetime import datetime, timedelta, timezone | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| import pytest | ||
|
|
@@ -133,3 +134,109 @@ def test_clickhouse_config_handles_none_additional_client_args(): | |
| config = ClickhouseConfig(**raw_config) | ||
|
|
||
| assert config.additional_client_args is None | ||
|
|
||
|
|
||
| class TestNonEntityRetrieval: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tests seem over-mocked.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, removed the heavy mocking. For proper coverage, an integration test against a real ClickHouse instance would be more valuable than over-mocked unit tests — the unit tests here may not be necessary at all. |
||
| """Test the non-entity retrieval logic (entity_df=None) for ClickHouse.""" | ||
|
|
||
| _MODULE = "feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse" | ||
|
|
||
| def _call_get_historical_features(self, feature_views, **kwargs): | ||
| """Call get_historical_features with entity_df=None, mocking the pipeline.""" | ||
| from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse import ( | ||
| ClickhouseOfflineStore, | ||
| ClickhouseOfflineStoreConfig, | ||
| ) | ||
| from feast.repo_config import RepoConfig | ||
|
|
||
| config = RepoConfig( | ||
| project="test_project", | ||
| registry="test_registry", | ||
| provider="local", | ||
| offline_store=ClickhouseOfflineStoreConfig( | ||
| type="clickhouse", | ||
| host="localhost", | ||
| port=9000, | ||
| database="test_db", | ||
| user="default", | ||
| password="password", | ||
| ), | ||
| ) | ||
|
|
||
| end = kwargs.get("end_date", datetime(2023, 1, 7, tzinfo=timezone.utc)) | ||
|
|
||
| with ( | ||
| patch.multiple( | ||
| self._MODULE, | ||
| _upload_entity_df=MagicMock(), | ||
| _get_entity_schema=MagicMock( | ||
| return_value={"event_timestamp": "timestamp"} | ||
| ), | ||
| _get_entity_df_event_timestamp_range=MagicMock( | ||
| return_value=(end - timedelta(days=1), end) | ||
| ), | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.get_expected_join_keys", | ||
| return_value=[], | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.assert_expected_columns_in_entity_df", | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.get_feature_view_query_context", | ||
| return_value=[], | ||
| ), | ||
| ): | ||
| refs = [f"{fv.name}:feature1" for fv in feature_views] | ||
| return ClickhouseOfflineStore.get_historical_features( | ||
| config=config, | ||
| feature_views=feature_views, | ||
| feature_refs=refs, | ||
| entity_df=None, | ||
| registry=MagicMock(), | ||
| project="test_project", | ||
| **kwargs, | ||
| ) | ||
|
|
||
| @staticmethod | ||
| def _make_feature_view(name, ttl=None): | ||
| from feast.entity import Entity | ||
| from feast.feature_view import FeatureView, Field | ||
| from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source import ( | ||
| ClickhouseSource, | ||
| ) | ||
| from feast.types import Float32 | ||
|
|
||
| return FeatureView( | ||
| name=name, | ||
| entities=[Entity(name="driver_id", join_keys=["driver_id"])], | ||
| ttl=ttl, | ||
| source=ClickhouseSource( | ||
| name=f"{name}_source", | ||
| table=f"{name}_table", | ||
| timestamp_field="event_timestamp", | ||
| ), | ||
| schema=[ | ||
| Field(name="feature1", dtype=Float32), | ||
| ], | ||
| ) | ||
|
|
||
| def test_non_entity_mode_with_end_date(self): | ||
| """entity_df=None with explicit end_date produces a valid RetrievalJob.""" | ||
| from feast.infra.offline_stores.offline_store import RetrievalJob | ||
|
|
||
| fv = self._make_feature_view("test_fv") | ||
| job = self._call_get_historical_features( | ||
| [fv], | ||
| end_date=datetime(2023, 1, 7, tzinfo=timezone.utc), | ||
| ) | ||
| assert isinstance(job, RetrievalJob) | ||
|
|
||
| def test_non_entity_mode_defaults_end_date(self): | ||
| """entity_df=None without end_date defaults to now.""" | ||
| from feast.infra.offline_stores.offline_store import RetrievalJob | ||
|
|
||
| fv = self._make_feature_view("test_fv") | ||
| job = self._call_get_historical_features([fv]) | ||
| assert isinstance(job, RetrievalJob) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The unit tests verify that a Also, a test for |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Since only `end_date` is used from `**kwargs`, consider accepting `end_date` as an explicit keyword argument instead of relying on `**kwargs`. This makes the API self-documenting and easier to type-check. Though if `start_date` support is planned, keeping `**kwargs` for now is fine.