Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions protos/feast/core/DataSource.proto
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ option java_outer_classname = "DataSourceProto";
option java_package = "feast.proto.core";

import "feast/core/DataFormat.proto";
import "feast/types/Value.proto";

// Defines a Data Source that can be used to source Feature data
message DataSource {
Expand All @@ -39,6 +40,7 @@ message DataSource {
STREAM_KINESIS = 4;
BATCH_REDSHIFT = 5;
CUSTOM_SOURCE = 6;
REQUEST_SOURCE = 7;
}
SourceType type = 1;

Expand Down Expand Up @@ -133,13 +135,23 @@ message DataSource {
bytes configuration = 1;
}

// Defines options for a DataSource that sources features from request data.
// The Python SDK populates this message together with the REQUEST_SOURCE
// source type.
message RequestDataOptions {
// Name of the request data source
string name = 1;

// Mapping of feature name to the value type of that feature, expressed as a
// feast.types.ValueType.Enum value. As with any protobuf map, iteration order
// is undefined.
map<string, feast.types.ValueType.Enum> schema = 2;
}

// DataSource options. This is a oneof, so at most one of the fields below is
// set on any given DataSource.
oneof options {
FileOptions file_options = 11;
BigQueryOptions bigquery_options = 12;
KafkaOptions kafka_options = 13;
KinesisOptions kinesis_options = 14;
RedshiftOptions redshift_options = 15;
// Options for a request data source. Note this field is declared out of
// numeric order (18 appears before 16); fields are identified on the wire by
// number, not by declaration order.
RequestDataOptions request_data_options = 18;
CustomSourceOptions custom_options = 16;
}
}
11 changes: 9 additions & 2 deletions protos/feast/core/OnDemandFeatureView.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ option java_package = "feast.proto.core";

import "feast/core/FeatureView.proto";
import "feast/core/Feature.proto";
import "feast/core/DataSource.proto";

message OnDemandFeatureView {
// User-specified specifications of this feature view.
Expand All @@ -41,12 +42,18 @@ message OnDemandFeatureViewSpec {
repeated FeatureSpecV2 features = 3;

// Inputs to this on demand feature view, keyed by a string (presumably the input's name).
// TODO(adchia): add support for request data
map<string, FeatureView> inputs = 4;
map<string, OnDemandInput> inputs = 4;

UserDefinedFunction user_defined_function = 5;
}

// A single input to an on demand feature view. Exactly which kind of input it
// is depends on which member of the oneof is set.
message OnDemandInput {
oneof input {
// The input is a feature view.
FeatureView feature_view = 1;
// The input is a data source providing request data.
// NOTE(review): presumably always a DataSource of type REQUEST_SOURCE;
// confirm against the SDK.
DataSource request_data_source = 2;
}
}

// Serialized representation of python function.
message UserDefinedFunction {
// The function name
Expand Down
68 changes: 68 additions & 0 deletions sdk/python/feast/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,74 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]:
return type_map.redshift_to_feast_value_type


class RequestDataSource(DataSource):
"""
RequestDataSource that can be used to provide input features for on demand transforms

Args:
name: Name of the request data source
schema: Schema mapping from the input feature name to a ValueType
"""

@staticmethod
def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]:
    """
    Not implemented for request data sources.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
Comment on lines +524 to +526

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I wonder if this method makes sense for this type of data source. We should probably refactor the DataSource interface to make it simpler and introduce a BatchDataSource somewhere in the type hierarchy.


_name: str
_schema: Dict[str, ValueType]

def __init__(self, name: str, schema: Dict[str, ValueType]):
    """
    Creates a RequestDataSource object.

    Args:
        name: Name of the request data source
        schema: Schema mapping from the input feature name to a ValueType
    """
    super().__init__()
    # The schema dict is stored by reference, not copied.
    self._schema = schema
    self._name = name

@property
def name(self) -> str:
    """Name of this request data source, as stored in ``_name``."""
    source_name = self._name
    return source_name

@property
def schema(self) -> Dict[str, ValueType]:
    """Schema of this request data source: feature name mapped to its ValueType."""
    source_schema = self._schema
    return source_schema

def validate(self, config: RepoConfig):
    """
    No-op: no validation is performed for a request data source.

    Args:
        config: Repo configuration (unused).
    """
    return None

def get_table_column_names_and_types(
    self,
    config: RepoConfig,
) -> Iterable[Tuple[str, str]]:
    """
    Placeholder implementation; does nothing with ``config``.

    NOTE(review): this returns None even though the annotation promises an
    iterable of (column name, type) pairs, so a caller that iterates over the
    result would fail. Confirm no caller relies on this for request data sources.
    """
    return None

@staticmethod
def from_proto(data_source: DataSourceProto):
    """
    Builds a RequestDataSource from its protobuf representation.

    Args:
        data_source: DataSource proto whose ``request_data_options`` carries the
            name and schema of the request data source.

    Returns:
        A RequestDataSource with the proto's name and a schema in which every
        proto enum value has been converted to a ValueType.
    """
    options = data_source.request_data_options
    proto_schema = options.schema
    # The proto map stores raw enum values; wrap each one in ValueType.
    converted_schema = {
        feature_name: ValueType(proto_schema.get(feature_name))
        for feature_name in proto_schema.keys()
    }
    return RequestDataSource(name=options.name, schema=converted_schema)

def to_proto(self) -> DataSourceProto:
    """
    Converts this request data source into its protobuf representation.

    Returns:
        A DataSourceProto of type REQUEST_SOURCE whose ``request_data_options``
        holds this source's name and its schema.
    """
    # The proto map holds raw enum values, so unwrap each ValueType member.
    proto_schema = {
        feature_name: value_type.value
        for feature_name, value_type in self._schema.items()
    }
    request_options = DataSourceProto.RequestDataOptions(
        name=self._name, schema=proto_schema
    )
    return DataSourceProto(
        type=DataSourceProto.REQUEST_SOURCE, request_data_options=request_options
    )


class KinesisSource(DataSource):
def validate(self, config: RepoConfig):
pass
Expand Down
14 changes: 14 additions & 0 deletions sdk/python/feast/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ def __init__(self, name, project=None):
super().__init__(f"On demand feature view {name} does not exist")


class RequestDataNotFoundInEntityDfException(FeastObjectNotFoundException):
    """
    Error for a request data feature that an on demand feature view requires
    but that is not present in the entity dataframe.
    """

    def __init__(self, feature_name, feature_view_name):
        message = f"Feature {feature_name} not found in the entity dataframe, but required by on demand feature view {feature_view_name}"
        super().__init__(message)


class RequestDataNotFoundInEntityRowsException(FeastObjectNotFoundException):
    """
    Error for request data features that on demand feature views require but
    that are not present in the entity rows.
    """

    def __init__(self, feature_names):
        message = f"Required request data source features {feature_names} not found in the entity rows, but required by on demand feature views"
        super().__init__(message)


class FeatureTableNotFoundException(FeastObjectNotFoundException):
def __init__(self, name, project=None):
if project:
Expand Down
77 changes: 71 additions & 6 deletions sdk/python/feast/feature_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,21 @@
from collections import Counter, OrderedDict, defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast

import pandas as pd
from colorama import Fore, Style
from tqdm import tqdm

from feast import feature_server, utils
from feast.data_source import RequestDataSource
from feast.entity import Entity
from feast.errors import (
EntityNotFoundException,
FeatureNameCollisionError,
FeatureViewNotFoundException,
RequestDataNotFoundInEntityDfException,
RequestDataNotFoundInEntityRowsException,
)
from feast.feature_service import FeatureService
from feast.feature_table import FeatureTable
Expand Down Expand Up @@ -402,7 +405,7 @@ def apply(
view.infer_features_from_batch_source(self.config)

for odfv in odfvs_to_update:
odfv.infer_features_from_batch_source(self.config)
odfv.infer_features()

if len(views_to_update) + len(entities_to_update) + len(
services_to_update
Expand Down Expand Up @@ -545,10 +548,26 @@ def get_historical_features(
# TODO(achal): _group_feature_refs returns the on demand feature views, but it's not passed into the provider.
# This is a weird interface quirk - we should revisit the `get_historical_features` to
# pass in the on demand feature views as well.
fvs, _ = _group_feature_refs(
fvs, odfvs = _group_feature_refs(
_feature_refs, all_feature_views, all_on_demand_feature_views
)
feature_views = list(view for view, _ in fvs)
on_demand_feature_views = list(view for view, _ in odfvs)

# Check that the right request data is present in the entity_df
if type(entity_df) == pd.DataFrame:
entity_pd_df = cast(pd.DataFrame, entity_df)
for odfv in on_demand_feature_views:
odfv_inputs = odfv.inputs.values()
for odfv_input in odfv_inputs:
if type(odfv_input) == RequestDataSource:
request_data_source = cast(RequestDataSource, odfv_input)
for feature_name in request_data_source.schema.keys():
if feature_name not in entity_pd_df.columns:
raise RequestDataNotFoundInEntityDfException(
feature_name=feature_name,
feature_view_name=odfv.name,
)

_validate_feature_refs(_feature_refs, full_feature_names)

Expand Down Expand Up @@ -789,7 +808,7 @@ def get_online_features(
)

_validate_feature_refs(_feature_refs, full_feature_names)
grouped_refs, _ = _group_feature_refs(
grouped_refs, grouped_odfv_refs = _group_feature_refs(
_feature_refs, all_feature_views, all_on_demand_feature_views
)
feature_views = list(view for view, _ in grouped_refs)
Expand All @@ -805,28 +824,61 @@ def get_online_features(
for entity in entities:
entity_name_to_join_key_map[entity.name] = entity.join_key

needed_request_data_features = self._get_needed_request_data_features(
grouped_odfv_refs
)

join_key_rows = []
request_data_features: Dict[str, List[Any]] = {}
# Entity rows may be either entities or request data.
for row in entity_rows:
join_key_row = {}
for entity_name, entity_value in row.items():
# Found request data
if entity_name in needed_request_data_features:
if entity_name not in request_data_features:
request_data_features[entity_name] = []
request_data_features[entity_name].append(entity_value)
continue
try:
join_key = entity_name_to_join_key_map[entity_name]
except KeyError:
raise EntityNotFoundException(entity_name, self.project)
join_key_row[join_key] = entity_value
if entityless_case:
join_key_row[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
join_key_rows.append(join_key_row)
if len(join_key_row) > 0:
# May be empty if this entity row was request data
join_key_rows.append(join_key_row)

if len(needed_request_data_features) != len(request_data_features.keys()):
raise RequestDataNotFoundInEntityRowsException(
feature_names=needed_request_data_features
)
Comment thread
adchia marked this conversation as resolved.
Outdated

entity_row_proto_list = _infer_online_entity_rows(join_key_rows)

union_of_entity_keys = []
union_of_entity_keys: List[EntityKeyProto] = []
result_rows: List[GetOnlineFeaturesResponse.FieldValues] = []

for entity_row_proto in entity_row_proto_list:
# Create a list of entity keys to filter down for each feature view at lookup time.
union_of_entity_keys.append(_entity_row_to_key(entity_row_proto))
# Also create entity values to append to the result
result_rows.append(_entity_row_to_field_values(entity_row_proto))

# Add more feature values to the existing result rows for the request data features
for feature_name, feature_values in request_data_features.items():
for row_idx, feature_value in enumerate(feature_values):
result_row = result_rows[row_idx]
result_row.fields[feature_name].CopyFrom(
python_value_to_proto_value(feature_value)
)
result_row.statuses[
feature_name
] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

Comment thread
achals marked this conversation as resolved.
Outdated
# Note: each "table" is a feature view
for table, requested_features in grouped_refs:
entity_keys = _get_table_entity_keys(
table, union_of_entity_keys, entity_name_to_join_key_map
Expand All @@ -837,6 +889,7 @@ def get_online_features(
entity_keys=entity_keys,
requested_features=requested_features,
)
# Each row is a set of features for a given entity key
for row_idx, read_row in enumerate(read_rows):
row_ts, feature_data = read_row
result_row = result_rows[row_idx]
Expand Down Expand Up @@ -873,6 +926,18 @@ def get_online_features(
_feature_refs, full_feature_names, initial_response, result_rows
)

def _get_needed_request_data_features(self, grouped_odfv_refs) -> Set[str]:
needed_request_data_features = set()
for odfv_to_feature_names in grouped_odfv_refs:
odfv, requested_feature_names = odfv_to_feature_names
odfv_inputs = odfv.inputs.values()
for odfv_input in odfv_inputs:
if type(odfv_input) == RequestDataSource:
request_data_source = cast(RequestDataSource, odfv_input)
for feature_name in request_data_source.schema.keys():
needed_request_data_features.add(feature_name)
return needed_request_data_features

def _augment_response_with_on_demand_transforms(
self,
feature_refs: List[str],
Expand Down
4 changes: 2 additions & 2 deletions sdk/python/feast/infra/offline_stores/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ def get_historical_features(
client=client,
config=config,
full_feature_names=full_feature_names,
on_demand_feature_views=registry.list_on_demand_feature_views(
project, allow_cache=True
on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
feature_refs, project, registry
),
)

Expand Down
4 changes: 2 additions & 2 deletions sdk/python/feast/infra/offline_stores/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,8 @@ def evaluate_historical_retrieval():
job = FileRetrievalJob(
evaluation_function=evaluate_historical_retrieval,
full_feature_names=full_feature_names,
on_demand_feature_views=registry.list_on_demand_feature_views(
project, allow_cache=True
on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
feature_refs, project, registry
),
)
return job
Expand Down
4 changes: 2 additions & 2 deletions sdk/python/feast/infra/offline_stores/redshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ def query_generator() -> Iterator[str]:
s3_resource=s3_resource,
config=config,
full_feature_names=full_feature_names,
on_demand_feature_views=registry.list_on_demand_feature_views(
project=project, allow_cache=True
on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
feature_refs, project, registry
),
drop_columns=["entity_timestamp"]
+ [
Expand Down
Loading