70 changes: 59 additions & 11 deletions sdk/python/feast/infra/offline_stores/dask.py
@@ -1,6 +1,6 @@
import os
import uuid
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

@@ -37,7 +37,7 @@
from feast.on_demand_feature_view import OnDemandFeatureView
from feast.repo_config import FeastConfigBaseModel, RepoConfig
from feast.saved_dataset import SavedDatasetStorage
from feast.utils import _get_requested_feature_views_to_features_dict
from feast.utils import _get_requested_feature_views_to_features_dict, make_tzaware

# DaskRetrievalJob will cast string objects to string[pyarrow] from dask version 2023.7.1
# This is not the desired behavior for our use case, so we set the convert-string option to False
@@ -133,21 +133,56 @@ def get_historical_features(
config: RepoConfig,
feature_views: List[FeatureView],
feature_refs: List[str],
entity_df: Union[pd.DataFrame, str],
entity_df: Optional[Union[pd.DataFrame, dd.DataFrame, str]],
registry: BaseRegistry,
project: str,
full_feature_names: bool = False,
**kwargs,
) -> RetrievalJob:
assert isinstance(config.offline_store, DaskOfflineStoreConfig)
for fv in feature_views:
assert isinstance(fv.batch_source, FileSource)

if not isinstance(entity_df, pd.DataFrame) and not isinstance(
entity_df, dd.DataFrame
):
raise ValueError(
f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
# Allow non-entity mode using start/end timestamps to enable bounded retrievals without an input entity_df.
# This synthesizes a minimal entity_df solely to drive the existing join and metadata plumbing without
# incurring source scans here; actual pushdowns can be layered in follow-ups if needed.
start_date: Optional[datetime] = kwargs.get("start_date", None)
end_date: Optional[datetime] = kwargs.get("end_date", None)
non_entity_mode = entity_df is None

if non_entity_mode:
# Default end_date to current time (UTC) to keep behavior predictable without extra parameters.
end_date = (
make_tzaware(end_date) if end_date else datetime.now(timezone.utc)
)

# When start_date is not provided, derive a conservative lower bound from the maximum feature view TTL, otherwise fall back to a fixed default window.
if start_date is None:
Contributor comment: If start_date is given, do you have to make it tz-aware?
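A minimal sketch of the normalization this comment asks about, assuming `feast.utils.make_tzaware` (already imported in this diff) localizes naive datetimes to UTC and returns tz-aware ones unchanged; the value below is illustrative:

```python
from datetime import datetime

from feast.utils import make_tzaware

# Hypothetical caller-supplied value; callers commonly pass naive datetimes.
start_date = datetime(2024, 1, 1)

# Applying make_tzaware unconditionally keeps a user-provided start_date
# consistent with the tz-aware end_date computed above.
start_date = make_tzaware(start_date)
```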

max_ttl_seconds = 0
for fv in feature_views:
if fv.ttl and isinstance(fv.ttl, timedelta):
max_ttl_seconds = max(
max_ttl_seconds, int(fv.ttl.total_seconds())
)
if max_ttl_seconds > 0:
start_date = end_date - timedelta(seconds=max_ttl_seconds)
else:
# Keep the default window bounded to avoid unbounded scans.
start_date = end_date - timedelta(days=30)
start_date = make_tzaware(start_date)

# Minimal synthetic entity_df: one timestamp row; join keys are not materialized here on purpose to avoid
# accidental dependence on specific feature view schemas at this layer.
entity_df = pd.DataFrame(
{DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL: [end_date]}
Contributor comment: You haven't given a start date or a tz here; both matter very much. It should be:

start=start_date, end=end_date, freq="1s", tz=timezone.utc
)
else:
if not isinstance(entity_df, pd.DataFrame) and not isinstance(
entity_df, dd.DataFrame
):
raise ValueError(
f"Please provide an entity_df of type pd.DataFrame or dask.dataframe.DataFrame instead of type {type(entity_df)}"
)
entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL # local modifiable copy of global variable
if entity_df_event_timestamp_col not in entity_df.columns:
datetime_columns = entity_df.select_dtypes(
@@ -171,8 +206,12 @@ def get_historical_features(
registry.list_on_demand_feature_views(config.project),
)

entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
entity_df, entity_df_event_timestamp_col
entity_df_event_timestamp_range = (
(start_date, end_date)
if non_entity_mode
else _get_entity_df_event_timestamp_range(
entity_df, entity_df_event_timestamp_col
)
)

# Create lazy function that is only called from the RetrievalJob object
@@ -260,7 +299,16 @@ def evaluate_historical_retrieval():
full_feature_names,
)

df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)
# In non-entity mode, if the synthetic entity_df lacks join keys, cross join to build a snapshot
# of all entities as-of the requested timestamp, then rely on TTL and deduplication to select
# the appropriate latest rows per entity.
current_join_keys = join_keys
if non_entity_mode:
current_join_keys = []

df_to_join = _merge(
entity_df_with_features, df_to_join, current_join_keys
)

df_to_join = _normalize_timestamp(
df_to_join, timestamp_field, created_timestamp_column
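For context, a self-contained pandas sketch of the cross-join-then-deduplicate idea described in the last hunk above (illustrative only; the offline store's actual `_merge`, TTL filtering, and deduplication helpers may behave differently):

```python
from datetime import datetime, timedelta, timezone

import pandas as pd

# A single "as of" timestamp standing in for the synthetic one-row entity_df
# used in non-entity mode, plus hypothetical feature rows for two entities.
as_of = datetime(2024, 1, 2, tzinfo=timezone.utc)
entity_df = pd.DataFrame({"event_timestamp": [as_of]})

feature_df = pd.DataFrame(
    {
        "driver_id": [1, 1, 2],
        "event_timestamp": [
            as_of - timedelta(hours=2),
            as_of - timedelta(hours=1),
            as_of - timedelta(days=5),
        ],
        "conv_rate": [0.1, 0.2, 0.9],
    }
)

# Cross join with no join keys: every feature row is paired with the as-of timestamp.
joined = entity_df.merge(feature_df, how="cross", suffixes=("", "__"))

# TTL filter (1 day here) keeps only rows at or before the as-of time and within the TTL.
ttl = timedelta(days=1)
joined = joined[
    (joined["event_timestamp__"] <= joined["event_timestamp"])
    & (joined["event_timestamp__"] >= joined["event_timestamp"] - ttl)
]

# Deduplicate to the latest surviving row per entity.
snapshot = (
    joined.sort_values("event_timestamp__").groupby("driver_id", as_index=False).last()
)
print(snapshot)  # driver 1 keeps its most recent in-TTL row; driver 2 is dropped by the TTL filter
```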