5 changes: 3 additions & 2 deletions docs/reference/compute-engine/README.md
@@ -31,10 +31,11 @@ This system builds and executes DAGs (Directed Acyclic Graphs) of typed operations
- Supports point-in-time joins and large-scale materialization
- Integrates with `SparkOfflineStore` and `SparkMaterializationJob`

### 🧪 LocalComputeEngine (WIP)
### 🧪 LocalComputeEngine

- Runs on Arrow + Pandas (or optionally DuckDB)
- Runs on Arrow + a pluggable backend (e.g., Pandas, Polars)
- Designed for local dev, testing, or lightweight feature generation
- Supports `LocalMaterializationJob` and `LocalHistoricalRetrievalJob`

---

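To make the pluggable-backend idea concrete, here is a minimal sketch of the Arrow-in/Arrow-out contract using the `PandasBackend` added in this PR (the data and column names are illustrative):

```python
# Minimal sketch: DAG nodes exchange pyarrow.Table; the backend only
# defines how the computation runs. Data and column names are illustrative.
import pandas as pd
import pyarrow as pa

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

backend = PandasBackend()
table = pa.Table.from_pandas(pd.DataFrame({"driver_id": [1, 2], "amount": [10.0, 20.0]}))

df = backend.from_arrow(table)          # backend-native frame (here: pandas)
df = backend.filter(df, "amount > 15")  # string expression, per the backend contract
result = backend.to_arrow(df)           # back to Arrow for the next DAG node
assert result.num_rows == 1
```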
Empty file.
59 changes: 59 additions & 0 deletions sdk/python/feast/infra/common/materialization_job.py
@@ -0,0 +1,59 @@
import enum
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Callable, Optional, Union

from tqdm import tqdm

from feast import BatchFeatureView, FeatureView, StreamFeatureView


@dataclass
class MaterializationTask:
"""
A MaterializationTask represents a unit of data that needs to be materialized from an
offline store to an online store.
"""

project: str
feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView]
start_time: datetime
end_time: datetime
tqdm_builder: Callable[[int], tqdm]


class MaterializationJobStatus(enum.Enum):
WAITING = 1
RUNNING = 2
AVAILABLE = 3
ERROR = 4
CANCELLING = 5
CANCELLED = 6
SUCCEEDED = 7
PAUSED = 8
RETRYING = 9


class MaterializationJob(ABC):
"""
A MaterializationJob represents an ongoing or executed process that materializes data as per the
definition of a materialization task.
"""

task: MaterializationTask

@abstractmethod
def status(self) -> MaterializationJobStatus: ...

@abstractmethod
def error(self) -> Optional[BaseException]: ...

@abstractmethod
def should_be_retried(self) -> bool: ...

@abstractmethod
def job_id(self) -> str: ...

@abstractmethod
def url(self) -> Optional[str]: ...
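For orientation, a minimal concrete implementation of this interface could look like the sketch below; the class name and the no-retry policy are assumptions for illustration, not part of this PR:

```python
# Illustrative only: a trivial in-memory job satisfying the ABC above.
class InMemoryMaterializationJob(MaterializationJob):
    def __init__(self, task: MaterializationTask, job_id: str):
        self.task = task
        self._job_id = job_id
        self._status = MaterializationJobStatus.WAITING
        self._error: Optional[BaseException] = None

    def status(self) -> MaterializationJobStatus:
        return self._status

    def error(self) -> Optional[BaseException]:
        return self._error

    def should_be_retried(self) -> bool:
        # Assumed policy: never retry; a real engine would inspect self._error.
        return False

    def job_id(self) -> str:
        return self._job_id

    def url(self) -> Optional[str]:
        return None  # Local jobs have no tracking UI.
```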
6 changes: 3 additions & 3 deletions sdk/python/feast/infra/compute_engines/base.py
@@ -4,12 +4,12 @@
import pyarrow as pa

from feast import RepoConfig
from feast.infra.compute_engines.dag.context import ColumnInfo, ExecutionContext
from feast.infra.compute_engines.tasks import HistoricalRetrievalTask
from feast.infra.materialization.batch_materialization_engine import (
from feast.infra.common.materialization_job import (
MaterializationJob,
MaterializationTask,
)
from feast.infra.common.retrieval_task import HistoricalRetrievalTask
from feast.infra.compute_engines.dag.context import ColumnInfo, ExecutionContext
from feast.infra.offline_stores.offline_store import OfflineStore
from feast.infra.online_stores.online_store import OnlineStore
from feast.infra.registry.registry import Registry
8 changes: 3 additions & 5 deletions sdk/python/feast/infra/compute_engines/feature_builder.py
@@ -1,11 +1,10 @@
from abc import ABC, abstractmethod
from typing import Union

from feast import BatchFeatureView, FeatureView, StreamFeatureView
from feast.infra.common.materialization_job import MaterializationTask
from feast.infra.common.retrieval_task import HistoricalRetrievalTask
from feast.infra.compute_engines.dag.node import DAGNode
from feast.infra.compute_engines.dag.plan import ExecutionPlan
from feast.infra.compute_engines.tasks import HistoricalRetrievalTask
from feast.infra.materialization.batch_materialization_engine import MaterializationTask


class FeatureBuilder(ABC):
@@ -16,10 +15,9 @@ class FeatureBuilder(ABC):

def __init__(
self,
feature_view: Union[BatchFeatureView, StreamFeatureView, FeatureView],
task: Union[MaterializationTask, HistoricalRetrievalTask],
):
self.feature_view = feature_view
self.feature_view = task.feature_view
self.task = task
self.nodes: list[DAGNode] = []

Empty file.
@@ -0,0 +1,12 @@
import pyarrow as pa

from feast.infra.compute_engines.dag.model import DAGFormat
from feast.infra.compute_engines.dag.value import DAGValue


class ArrowTableValue(DAGValue):
def __init__(self, data: pa.Table):
super().__init__(data, DAGFormat.ARROW)

def __repr__(self):
return f"ArrowTableValue(schema={self.data.schema}, rows={self.data.num_rows})"
Empty file.
79 changes: 79 additions & 0 deletions sdk/python/feast/infra/compute_engines/local/backends/base.py
@@ -0,0 +1,79 @@
from abc import ABC, abstractmethod
from datetime import timedelta


class DataFrameBackend(ABC):
"""
Abstract interface for DataFrame operations used by the LocalComputeEngine.

This interface defines the contract for implementing pluggable DataFrame backends
such as Pandas, Polars, or DuckDB. Each backend must support core table operations
such as joins, filtering, aggregation, conversion to/from Arrow, and deduplication.

The purpose of this abstraction is to allow seamless swapping of execution backends
without changing DAGNode or ComputeEngine logic. All nodes operate on pyarrow.Table
as the standard input/output format, while the backend defines how the computation
is actually performed.

Expected implementations include:
- PandasBackend
- PolarsBackend
- DuckDBBackend (future)

Methods
-------
    columns(df: Any) -> List[str]
        Return the column names of the DataFrame.

    from_arrow(table: pa.Table) -> Any
        Convert a pyarrow.Table to the backend-native DataFrame format.

to_arrow(df: Any) -> pa.Table
Convert a backend-native DataFrame to pyarrow.Table.

join(left: Any, right: Any, on: List[str], how: str) -> Any
Join two dataframes on specified keys with given join type.

groupby_agg(df: Any, group_keys: List[str], agg_ops: Dict[str, Tuple[str, str]]) -> Any
Group and aggregate the dataframe. `agg_ops` maps output column names
to (aggregation function, source column name) pairs.

filter(df: Any, expr: str) -> Any
Apply a filter expression (string-based) to the DataFrame.

to_timedelta_value(delta: timedelta) -> Any
Convert a Python timedelta object to a backend-compatible value
that can be subtracted from a timestamp column.

    drop_duplicates(df: Any, keys: List[str], sort_by: List[str], ascending: bool = False) -> Any
        Deduplicate the DataFrame by key columns, keeping the first row after
        sorting by `sort_by` (descending by default, so the latest row wins).

rename_columns(df: Any, columns: Dict[str, str]) -> Any
Rename columns in the DataFrame according to the provided mapping.
"""

@abstractmethod
def columns(self, df): ...

@abstractmethod
def from_arrow(self, table): ...

@abstractmethod
def join(self, left, right, on, how): ...

@abstractmethod
def groupby_agg(self, df, group_keys, agg_ops): ...

@abstractmethod
def filter(self, df, expr): ...

@abstractmethod
def to_arrow(self, df): ...

@abstractmethod
def to_timedelta_value(self, delta: timedelta): ...

@abstractmethod
    def drop_duplicates(self, df, keys, sort_by, ascending: bool = False): ...

@abstractmethod
def rename_columns(self, df, columns: dict[str, str]): ...
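To make the `agg_ops` convention concrete, a small usage sketch (column names are illustrative) using the `PandasBackend` defined later in this PR:

```python
# agg_ops maps output alias -> (aggregation function, source column).
import pandas as pd

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

backend = PandasBackend()
df = pd.DataFrame({"driver_id": [1, 1, 2], "amount": [10.0, 5.0, 7.0]})

out = backend.groupby_agg(
    df,
    group_keys=["driver_id"],
    agg_ops={"total_amount": ("sum", "amount"), "trip_count": ("count", "amount")},
)
# out has columns: driver_id, total_amount, trip_count
```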
@@ -0,0 +1,49 @@
from typing import Optional

import pandas as pd
import pyarrow

from feast.infra.compute_engines.local.backends.base import DataFrameBackend
from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend


class BackendFactory:
"""
Factory class for constructing DataFrameBackend implementations based on backend name
or runtime entity_df type.
"""

@staticmethod
def from_name(name: str) -> DataFrameBackend:
if name == "pandas":
return PandasBackend()
if name == "polars":
return BackendFactory._get_polars_backend()
raise ValueError(f"Unsupported backend name: {name}")

@staticmethod
def infer_from_entity_df(entity_df) -> Optional[DataFrameBackend]:
        if isinstance(entity_df, (pyarrow.Table, pd.DataFrame)):
return PandasBackend()

if BackendFactory._is_polars(entity_df):
return BackendFactory._get_polars_backend()
return None

@staticmethod
    def _is_polars(entity_df) -> bool:
        try:
            import polars as pl
        except ImportError:
            # Polars is not installed, so entity_df cannot be a Polars DataFrame.
            return False
        return isinstance(entity_df, pl.DataFrame)

@staticmethod
def _get_polars_backend():
from feast.infra.compute_engines.local.backends.polars_backend import (
PolarsBackend,
)

return PolarsBackend()
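A brief usage sketch of the factory (shown inline since it lives in this module; the example data is illustrative):

```python
# Explicit selection by name, or inference from the entity_df's runtime type.
import pandas as pd

backend = BackendFactory.from_name("pandas")
inferred = BackendFactory.infer_from_entity_df(pd.DataFrame({"driver_id": [1]}))
assert type(backend) is type(inferred)  # both resolve to PandasBackend
```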
@@ -0,0 +1,46 @@
from datetime import timedelta

import pandas as pd
import pyarrow as pa

from feast.infra.compute_engines.local.backends.base import DataFrameBackend


class PandasBackend(DataFrameBackend):
def columns(self, df):
return df.columns.tolist()

def from_arrow(self, table):
return table.to_pandas()

def join(self, left, right, on, how):
return left.merge(right, on=on, how=how)

def groupby_agg(self, df, group_keys, agg_ops):
return (
df.groupby(group_keys)
.agg(
**{
alias: pd.NamedAgg(column=col, aggfunc=func)
for alias, (func, col) in agg_ops.items()
}
)
.reset_index()
)

def filter(self, df, expr):
return df.query(expr)

def to_arrow(self, df):
return pa.Table.from_pandas(df)

def to_timedelta_value(self, delta: timedelta):
return pd.to_timedelta(delta)

def drop_duplicates(self, df, keys, sort_by, ascending: bool = False):
return df.sort_values(by=sort_by, ascending=ascending).drop_duplicates(
subset=keys
)

def rename_columns(self, df, columns: dict[str, str]):
return df.rename(columns=columns)
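The dedup semantics matter for point-in-time correctness: with the default `ascending=False`, the most recent row per key survives. A quick sketch (column names illustrative):

```python
# Keep the latest row per driver_id by event_timestamp.
import pandas as pd

backend = PandasBackend()
df = pd.DataFrame(
    {
        "driver_id": [1, 1],
        "event_timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
        "value": [10, 20],
    }
)
latest = backend.drop_duplicates(df, keys=["driver_id"], sort_by=["event_timestamp"])
assert latest["value"].tolist() == [20]  # newest-first sort, first row kept
```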
@@ -0,0 +1,47 @@
from datetime import timedelta

import polars as pl
import pyarrow as pa

from feast.infra.compute_engines.local.backends.base import DataFrameBackend


class PolarsBackend(DataFrameBackend):
    def columns(self, df):
        return df.columns

def from_arrow(self, table: pa.Table) -> pl.DataFrame:
return pl.from_arrow(table)

def to_arrow(self, df: pl.DataFrame) -> pa.Table:
return df.to_arrow()

def join(self, left: pl.DataFrame, right: pl.DataFrame, on, how) -> pl.DataFrame:
return left.join(right, on=on, how=how)

def groupby_agg(self, df: pl.DataFrame, group_keys, agg_ops) -> pl.DataFrame:
agg_exprs = [
getattr(pl.col(col), func)().alias(alias)
for alias, (func, col) in agg_ops.items()
]
        return df.group_by(group_keys).agg(agg_exprs)

def filter(self, df: pl.DataFrame, expr: str) -> pl.DataFrame:
return df.filter(pl.sql_expr(expr))

    def to_timedelta_value(self, delta: timedelta):
        # pl.duration expects integer components; convert to whole milliseconds.
        return pl.duration(milliseconds=int(delta.total_seconds() * 1000))

def drop_duplicates(
self,
df: pl.DataFrame,
keys: list[str],
sort_by: list[str],
ascending: bool = False,
) -> pl.DataFrame:
return df.sort(by=sort_by, descending=not ascending).unique(
subset=keys, keep="first"
)

def rename_columns(self, df: pl.DataFrame, columns: dict[str, str]) -> pl.DataFrame:
return df.rename(columns)
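Because every backend round-trips through Arrow, the two implementations can be checked for parity; a small sketch (assumes both optional dependencies are installed):

```python
# Same aggregation through both backends, bridged by pyarrow.Table.
import pyarrow as pa

from feast.infra.compute_engines.local.backends.pandas_backend import PandasBackend

table = pa.table({"driver_id": [1, 1, 2], "amount": [1.0, 2.0, 3.0]})
for backend in (PandasBackend(), PolarsBackend()):
    df = backend.from_arrow(table)
    out = backend.groupby_agg(df, ["driver_id"], {"total": ("sum", "amount")})
    assert backend.to_arrow(out).num_rows == 2
```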