5 changes: 3 additions & 2 deletions docs/reference/compute-engine/README.md
@@ -31,10 +31,11 @@ This system builds and executes DAGs (Directed Acyclic Graphs) of typed operations
- Supports point-in-time joins and large-scale materialization
- Integrates with `SparkOfflineStore` and `SparkMaterializationJob`

### 🧪 LocalComputeEngine (WIP)
### 🧪 LocalComputeEngine

- Runs on Arrow + Pandas (or optionally DuckDB)
- Runs on Arrow + a pluggable backend (e.g., Pandas, Polars)
- Designed for local dev, testing, or lightweight feature generation
- Supports `LocalMaterializationJob` and `LocalHistoricalRetrievalJob`

---

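To make the pluggable-backend idea concrete, here is a minimal sketch of the Arrow-in/Arrow-out contract using the `PandasBackend` added in this PR (the data and column names are illustrative):

```python
# Minimal sketch: DAG nodes exchange pyarrow.Table; the backend only
# defines how the computation runs. Data and column names are illustrative.
import pandas as pd
import pyarrow as pa

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

backend = PandasBackend()
table = pa.Table.from_pandas(pd.DataFrame({"driver_id": [1, 2], "amount": [10.0, 20.0]}))

df = backend.from_arrow(table)          # backend-native frame (here: pandas)
df = backend.filter(df, "amount > 15")  # string expression, per the backend contract
result = backend.to_arrow(df)           # back to Arrow for the next DAG node
assert result.num_rows == 1
```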
Empty file.
59 changes: 59 additions & 0 deletions sdk/python/feast/infra/common/materialization_job.py
@@ -0,0 +1,59 @@
import enum
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Callable, Optional, Union

from tqdm import tqdm

from feast import BatchFeatureView, FeatureView, StreamFeatureView


@dataclass
class MaterializationTask:
"""
A MaterializationTask represents a unit of data that needs to be materialized from an
offline store to an online store.
"""

project: str
feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView]
start_time: datetime
end_time: datetime
tqdm_builder: Callable[[int], tqdm]


class MaterializationJobStatus(enum.Enum):
WAITING = 1
RUNNING = 2
AVAILABLE = 3
ERROR = 4
CANCELLING = 5
CANCELLED = 6
SUCCEEDED = 7
PAUSED = 8
RETRYING = 9


class MaterializationJob(ABC):
"""
A MaterializationJob represents an ongoing or executed process that materializes data as per the
definition of a materialization task.
"""

task: MaterializationTask

@abstractmethod
def status(self) -> MaterializationJobStatus: ...

@abstractmethod
def error(self) -> Optional[BaseException]: ...

@abstractmethod
def should_be_retried(self) -> bool: ...

@abstractmethod
def job_id(self) -> str: ...

@abstractmethod
def url(self) -> Optional[str]: ...
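For orientation, a minimal concrete implementation of this interface could look like the sketch below; the class name and the no-retry policy are assumptions for illustration, not part of this PR:

```python
# Illustrative only: a trivial in-memory job satisfying the ABC above.
class InMemoryMaterializationJob(MaterializationJob):
    def __init__(self, task: MaterializationTask, job_id: str):
        self.task = task
        self._job_id = job_id
        self._status = MaterializationJobStatus.WAITING
        self._error: Optional[BaseException] = None

    def status(self) -> MaterializationJobStatus:
        return self._status

    def error(self) -> Optional[BaseException]:
        return self._error

    def should_be_retried(self) -> bool:
        # Assumed policy: never retry; a real engine would inspect self._error.
        return False

    def job_id(self) -> str:
        return self._job_id

    def url(self) -> Optional[str]:
        return None  # Local jobs have no tracking UI.
```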
6 changes: 3 additions & 3 deletions sdk/python/feast/infra/compute_engines/base.py
@@ -4,12 +4,12 @@
import pyarrow as pa

from feast import RepoConfig
from feast.infra.compute_engines.dag.context import ColumnInfo, ExecutionContext
from feast.infra.compute_engines.tasks import HistoricalRetrievalTask
from feast.infra.materialization.batch_materialization_engine import (
from feast.infra.common.materialization_job import (
MaterializationJob,
MaterializationTask,
)
from feast.infra.common.retrieval_task import HistoricalRetrievalTask
from feast.infra.compute_engines.dag.context import ColumnInfo, ExecutionContext
from feast.infra.offline_stores.offline_store import OfflineStore
from feast.infra.online_stores.online_store import OnlineStore
from feast.infra.registry.registry import Registry
8 changes: 3 additions & 5 deletions sdk/python/feast/infra/compute_engines/feature_builder.py
@@ -1,11 +1,10 @@
from abc import ABC, abstractmethod
from typing import Union

from feast import BatchFeatureView, FeatureView, StreamFeatureView
from feast.infra.common.materialization_job import MaterializationTask
from feast.infra.common.retrieval_task import HistoricalRetrievalTask
from feast.infra.compute_engines.dag.node import DAGNode
from feast.infra.compute_engines.dag.plan import ExecutionPlan
from feast.infra.compute_engines.tasks import HistoricalRetrievalTask
from feast.infra.materialization.batch_materialization_engine import MaterializationTask


class FeatureBuilder(ABC):
@@ -16,10 +15,9 @@ class FeatureBuilder(ABC):

def __init__(
self,
feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView],
task: Union[MaterializationTask, HistoricalRetrievalTask],
):
self.feature_view = feature_view
self.feature_view = task.feature_view
self.task = task
self.nodes: list[DAGNode] = []

Empty file.
@@ -0,0 +1,12 @@
import pyarrow as pa

from feast.infra.compute_engines.dag.model import DAGFormat
from feast.infra.compute_engines.dag.value import DAGValue


class ArrowTableValue(DAGValue):
def __init__(self, data: pa.Table):
super().__init__(data, DAGFormat.ARROW)

def __repr__(self):
return f"ArrowTableValue(schema={self.data.schema}, rows={self.data.num_rows})"
Empty file.
79 changes: 79 additions & 0 deletions sdk/python/feast/infra/compute_engines/local/backends/base.py
@@ -0,0 +1,79 @@
from abc import ABC, abstractmethod
from datetime import timedelta


class DataFrameBackend(ABC):
"""
Abstract interface for DataFrame operations used by the LocalComputeEngine.

This interface defines the contract for implementing pluggable DataFrame backends
such as Pandas, Polars, or DuckDB. Each backend must support core table operations
such as joins, filtering, aggregation, conversion to/from Arrow, and deduplication.

The purpose of this abstraction is to allow seamless swapping of execution backends
without changing DAGNode or ComputeEngine logic. All nodes operate on pyarrow.Table
as the standard input/output format, while the backend defines how the computation
is actually performed.

Expected implementations include:
- PandasBackend
- PolarsBackend
- DuckDBBackend (future)

Methods
-------
    columns(df: Any) -> List[str]
        Return the column names of the DataFrame.

    from_arrow(table: pa.Table) -> Any
        Convert a pyarrow.Table to the backend-native DataFrame format.

to_arrow(df: Any) -> pa.Table
Convert a backend-native DataFrame to pyarrow.Table.

join(left: Any, right: Any, on: List[str], how: str) -> Any
Join two dataframes on specified keys with given join type.

groupby_agg(df: Any, group_keys: List[str], agg_ops: Dict[str, Tuple[str, str]]) -> Any
Group and aggregate the dataframe. `agg_ops` maps output column names
to (aggregation function, source column name) pairs.

filter(df: Any, expr: str) -> Any
Apply a filter expression (string-based) to the DataFrame.

to_timedelta_value(delta: timedelta) -> Any
Convert a Python timedelta object to a backend-compatible value
that can be subtracted from a timestamp column.

    drop_duplicates(df: Any, keys: List[str], sort_by: List[str], ascending: bool = False) -> Any
        Deduplicate the DataFrame by key columns, keeping the first row after
        sorting by `sort_by` (descending by default, so the latest row wins).

rename_columns(df: Any, columns: Dict[str, str]) -> Any
Rename columns in the DataFrame according to the provided mapping.
"""

@abstractmethod
def columns(self, df): ...

@abstractmethod
def from_arrow(self, table): ...

@abstractmethod
def join(self, left, right, on, how): ...

@abstractmethod
def groupby_agg(self, df, group_keys, agg_ops): ...

@abstractmethod
def filter(self, df, expr): ...

@abstractmethod
def to_arrow(self, df): ...

@abstractmethod
def to_timedelta_value(self, delta: timedelta): ...

@abstractmethod
    def drop_duplicates(self, df, keys, sort_by, ascending: bool = False): ...

@abstractmethod
def rename_columns(self, df, columns: dict[str, str]): ...
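To make the `agg_ops` convention concrete, a small usage sketch (column names are illustrative) using the `PandasBackend` defined later in this PR:

```python
# agg_ops maps output alias -> (aggregation function, source column).
import pandas as pd

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

backend = PandasBackend()
df = pd.DataFrame({"driver_id": [1, 1, 2], "amount": [10.0, 5.0, 7.0]})

out = backend.groupby_agg(
    df,
    group_keys=["driver_id"],
    agg_ops={"total_amount": ("sum", "amount"), "trip_count": ("count", "amount")},
)
# out has columns: driver_id, total_amount, trip_count
```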
@@ -0,0 +1,49 @@
from typing import Optional

import pandas as pd
import pyarrow

from feast.infra.compute_engines.local.backends.base import DataFrameBackend
from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend


class BackendFactory:
"""
Factory class for constructing DataFrameBackend implementations based on backend name
or runtime entity_df type.
"""

@staticmethod
def from_name(name: str) -> DataFrameBackend:
if name == "pandas":
return PandasBackend()
if name == "polars":
return BackendFactory._get_polars_backend()
raise ValueError(f"Unsupported backend name: {name}")

@staticmethod
def infer_from_entity_df(entity_df) -> Optional[DataFrameBackend]:
        if isinstance(entity_df, (pyarrow.Table, pd.DataFrame)):
return PandasBackend()

if BackendFactory._is_polars(entity_df):
return BackendFactory._get_polars_backend()
return None

@staticmethod
    def _is_polars(entity_df) -> bool:
        try:
            import polars as pl
        except ImportError:
            # Polars is not installed, so entity_df cannot be a Polars DataFrame.
            return False
        return isinstance(entity_df, pl.DataFrame)

@staticmethod
def _get_polars_backend():
from feast.infra.compute_engines.local.backends.polars_backend import (
PolarsBackend,
)

return PolarsBackend()
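A brief usage sketch of the factory (shown inline since it lives in this module; the example data is illustrative):

```python
# Explicit selection by name, or inference from the entity_df's runtime type.
import pandas as pd

backend = BackendFactory.from_name("pandas")
inferred = BackendFactory.infer_from_entity_df(pd.DataFrame({"driver_id": [1]}))
assert type(backend) is type(inferred)  # both resolve to PandasBackend
```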
@@ -0,0 +1,46 @@
from datetime import timedelta

import pandas as pd
import pyarrow as pa

from feast.infra.compute_engines.local.backends.base import DataFrameBackend


class PandasBackend(DataFrameBackend):
def columns(self, df):
return df.columns.tolist()

def from_arrow(self, table):
return table.to_pandas()

def join(self, left, right, on, how):
return left.merge(right, on=on, how=how)

def groupby_agg(self, df, group_keys, agg_ops):
return (
df.groupby(group_keys)
.agg(
**{
alias: pd.NamedAgg(column=col, aggfunc=func)
for alias, (func, col) in agg_ops.items()
}
)
.reset_index()
)

def filter(self, df, expr):
return df.query(expr)

def to_arrow(self, df):
return pa.Table.from_pandas(df)

def to_timedelta_value(self, delta: timedelta):
return pd.to_timedelta(delta)

def drop_duplicates(self, df, keys, sort_by, ascending: bool = False):
return df.sort_values(by=sort_by, ascending=ascending).drop_duplicates(
subset=keys
)

def rename_columns(self, df, columns: dict[str, str]):
return df.rename(columns=columns)
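The dedup semantics matter for point-in-time correctness: with the default `ascending=False`, the most recent row per key survives. A quick sketch (column names illustrative):

```python
# Keep the latest row per driver_id by event_timestamp.
import pandas as pd

backend = PandasBackend()
df = pd.DataFrame(
    {
        "driver_id": [1, 1],
        "event_timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
        "value": [10, 20],
    }
)
latest = backend.drop_duplicates(df, keys=["driver_id"], sort_by=["event_timestamp"])
assert latest["value"].tolist() == [20]  # newest-first sort, first row kept
```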
@@ -0,0 +1,47 @@
from datetime import timedelta

import polars as pl
import pyarrow as pa

from feast.infra.compute_engines.local.backends.base import DataFrameBackend


class PolarsBackend(DataFrameBackend):
    def columns(self, df):
        return df.columns

def from_arrow(self, table: pa.Table) -> pl.DataFrame:
return pl.from_arrow(table)

def to_arrow(self, df: pl.DataFrame) -> pa.Table:
return df.to_arrow()

def join(self, left: pl.DataFrame, right: pl.DataFrame, on, how) -> pl.DataFrame:
return left.join(right, on=on, how=how)

def groupby_agg(self, df: pl.DataFrame, group_keys, agg_ops) -> pl.DataFrame:
agg_exprs = [
getattr(pl.col(col), func)().alias(alias)
for alias, (func, col) in agg_ops.items()
]
        return df.group_by(group_keys).agg(agg_exprs)

def filter(self, df: pl.DataFrame, expr: str) -> pl.DataFrame:
return df.filter(pl.sql_expr(expr))

    def to_timedelta_value(self, delta: timedelta):
        # pl.duration expects integer components; convert to whole milliseconds.
        return pl.duration(milliseconds=int(delta.total_seconds() * 1000))

def drop_duplicates(
self,
df: pl.DataFrame,
keys: list[str],
sort_by: list[str],
ascending: bool = False,
) -> pl.DataFrame:
return df.sort(by=sort_by, descending=not ascending).unique(
subset=keys, keep="first"
)

def rename_columns(self, df: pl.DataFrame, columns: dict[str, str]) -> pl.DataFrame:
return df.rename(columns)
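Because every backend round-trips through Arrow, the two implementations can be checked for parity; a small sketch (assumes both optional dependencies are installed):

```python
# Same aggregation through both backends, bridged by pyarrow.Table.
import pyarrow as pa

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

table = pa.table({"driver_id": [1, 1, 2], "amount": [1.0, 2.0, 3.0]})
for backend in (PandasBackend(), PolarsBackend()):
    df = backend.from_arrow(table)
    out = backend.groupby_agg(df, ["driver_id"], {"total": ("sum", "amount")})
    assert backend.to_arrow(out).num_rows == 2
```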