Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/reference/offline-stores/postgres.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ offline_store:
sslkey_path: /path/to/client-key.pem
sslcert_path: /path/to/client-cert.pem
sslrootcert_path: /path/to/server-ca.pem
entity_select_mode: temp_table
online_store:
path: data/online_store.db
```
Expand All @@ -40,6 +41,8 @@ online_store:
Note that `sslmode`, `sslkey_path`, `sslcert_path`, and `sslrootcert_path` are optional parameters.
The full set of configuration options is available in [PostgreSQLOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStoreConfig).

Additionally, a new optional parameter `entity_select_mode` was added to tell how Postgres should load the entity data. By default(`temp_table`), a temporary table is created and the entity data frame or sql is loaded into that table. A new value of `embed_query` was added to allow directly loading the SQL query into a CTE, providing improved performance and skipping the need to CREATE and DROP the temporary table.

## Functionality Matrix

The set of functionality supported by offline stores is described in detail [here](overview.md#functionality).
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import contextlib
from dataclasses import asdict
from datetime import datetime, timezone
from enum import Enum
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -48,8 +49,16 @@
from .postgres_source import PostgreSQLSource


class EntitySelectMode(Enum):
temp_table = "temp_table"
""" Use a temporary table to store the entity DataFrame or SQL query when querying feature data """
embed_query = "embed_query"
""" Use the entity SQL query directly when querying feature data """


class PostgreSQLOfflineStoreConfig(PostgreSQLConfig):
type: Literal["postgres"] = "postgres"
entity_select_mode: EntitySelectMode = EntitySelectMode.temp_table


class PostgreSQLOfflineStore(OfflineStore):
Expand Down Expand Up @@ -134,7 +143,17 @@ def get_historical_features(
def query_generator() -> Iterator[str]:
table_name = offline_utils.get_temp_entity_table_name()

_upload_entity_df(config, entity_df, table_name)
# If using CTE and entity_df is a SQL query, we don't need a table
if config.offline_store.entity_select_mode == EntitySelectMode.embed_query:
if isinstance(entity_df, str):
left_table_query_string = entity_df
else:
raise ValueError(
f"Invalid entity select mode: {config.offline_store.entity_select_mode} cannot be used with entity_df as a DataFrame"
)
else:
left_table_query_string = table_name
_upload_entity_df(config, entity_df, table_name)

expected_join_keys = offline_utils.get_expected_join_keys(
project, feature_views, registry
Expand Down Expand Up @@ -163,14 +182,19 @@ def query_generator() -> Iterator[str]:
try:
yield build_point_in_time_query(
query_context_dict,
left_table_query_string=table_name,
left_table_query_string=left_table_query_string,
entity_df_event_timestamp_col=entity_df_event_timestamp_col,
entity_df_columns=entity_schema.keys(),
query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
full_feature_names=full_feature_names,
entity_select_mode=config.offline_store.entity_select_mode,
)
finally:
if table_name:
# Only cleanup if we created a table
if (
config.offline_store.entity_select_mode
== EntitySelectMode.temp_table
):
with _get_conn(config.offline_store) as conn, conn.cursor() as cur:
cur.execute(
sql.SQL(
Expand Down Expand Up @@ -362,6 +386,7 @@ def build_point_in_time_query(
entity_df_columns: KeysView[str],
query_template: str,
full_feature_names: bool = False,
entity_select_mode: EntitySelectMode = EntitySelectMode.temp_table,
) -> str:
"""Build point-in-time query between each feature view table and the entity dataframe for PostgreSQL"""
template = Environment(loader=BaseLoader()).from_string(source=query_template)
Expand Down Expand Up @@ -389,6 +414,7 @@ def build_point_in_time_query(
"featureviews": feature_view_query_contexts,
"full_feature_names": full_feature_names,
"final_output_feature_names": final_output_feature_names,
"entity_select_mode": entity_select_mode.value,
}

query = template.render(template_context)
Expand Down Expand Up @@ -429,11 +455,15 @@ def _get_entity_schema(
# https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/offline_stores/redshift.py

MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """
WITH
{% if entity_select_mode == "embed_query" %}
entity_query AS ({{ left_table_query_string }}),
{% endif %}
/*
Compute a deterministic hash for the `left_table_query_string` that will be used throughout
all the logic as the field to GROUP BY the data
*/
WITH entity_dataframe AS (
entity_dataframe AS (
SELECT *,
{{entity_df_event_timestamp_col}} AS entity_timestamp
{% for featureview in featureviews %}
Expand All @@ -448,7 +478,12 @@ def _get_entity_schema(
,CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR) AS "{{featureview.name}}__entity_row_unique_id"
{% endif %}
{% endfor %}
FROM {{ left_table_query_string }}
FROM
{% if entity_select_mode == "embed_query" %}
entity_query
{% else %}
{{ left_table_query_string }}
{% endif %}
),

{% for featureview in featureviews %}
Expand Down
Loading
Loading