Skip to content

Commit fe69eaf

Browse files
authored
feat: Enable users to use Entity Query as CTE during historical retrieval (#5202)
* entity query as cte Signed-off-by: Blake <blaketastic2@gmail.com> * formatting Signed-off-by: Blake <blaketastic2@gmail.com> * backward compatible Signed-off-by: Blake <blaketastic2@gmail.com> * updated docs Signed-off-by: Blake <blaketastic2@gmail.com> * jinja2 template fix Signed-off-by: Blake <blaketastic2@gmail.com> * test data Signed-off-by: Blake <blaketastic2@gmail.com> --------- Signed-off-by: Blake <blaketastic2@gmail.com>
1 parent 972ed34 commit fe69eaf

File tree

3 files changed

+433
-5
lines changed
  • docs/reference/offline-stores
  • sdk/python
    • feast/infra/offline_stores/contrib/postgres_offline_store
    • tests/unit/infra/offline_stores/contrib/postgres_offline_store

3 files changed

+433
-5
lines changed

docs/reference/offline-stores/postgres.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ offline_store:
3232
sslkey_path: /path/to/client-key.pem
3333
sslcert_path: /path/to/client-cert.pem
3434
sslrootcert_path: /path/to/server-ca.pem
35+
entity_select_mode: temp_table
3536
online_store:
3637
path: data/online_store.db
3738
```
@@ -40,6 +41,8 @@ online_store:
4041
Note that `sslmode`, `sslkey_path`, `sslcert_path`, and `sslrootcert_path` are optional parameters.
4142
The full set of configuration options is available in [PostgreSQLOfflineStoreConfig](https://rtd.feast.dev/en/master/#feast.infra.offline_stores.contrib.postgres_offline_store.postgres.PostgreSQLOfflineStoreConfig).
4243

44+
Additionally, a new optional parameter `entity_select_mode` was added to tell how Postgres should load the entity data. By default(`temp_table`), a temporary table is created and the entity data frame or sql is loaded into that table. A new value of `embed_query` was added to allow directly loading the SQL query into a CTE, providing improved performance and skipping the need to CREATE and DROP the temporary table.
45+
4346
## Functionality Matrix
4447

4548
The set of functionality supported by offline stores is described in detail [here](overview.md#functionality).

sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/postgres.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import contextlib
22
from dataclasses import asdict
33
from datetime import datetime, timezone
4+
from enum import Enum
45
from typing import (
56
Any,
67
Callable,
@@ -48,8 +49,16 @@
4849
from .postgres_source import PostgreSQLSource
4950

5051

52+
class EntitySelectMode(Enum):
53+
temp_table = "temp_table"
54+
""" Use a temporary table to store the entity DataFrame or SQL query when querying feature data """
55+
embed_query = "embed_query"
56+
""" Use the entity SQL query directly when querying feature data """
57+
58+
5159
class PostgreSQLOfflineStoreConfig(PostgreSQLConfig):
5260
type: Literal["postgres"] = "postgres"
61+
entity_select_mode: EntitySelectMode = EntitySelectMode.temp_table
5362

5463

5564
class PostgreSQLOfflineStore(OfflineStore):
@@ -134,7 +143,17 @@ def get_historical_features(
134143
def query_generator() -> Iterator[str]:
135144
table_name = offline_utils.get_temp_entity_table_name()
136145

137-
_upload_entity_df(config, entity_df, table_name)
146+
# If using CTE and entity_df is a SQL query, we don't need a table
147+
if config.offline_store.entity_select_mode == EntitySelectMode.embed_query:
148+
if isinstance(entity_df, str):
149+
left_table_query_string = entity_df
150+
else:
151+
raise ValueError(
152+
f"Invalid entity select mode: {config.offline_store.entity_select_mode} cannot be used with entity_df as a DataFrame"
153+
)
154+
else:
155+
left_table_query_string = table_name
156+
_upload_entity_df(config, entity_df, table_name)
138157

139158
expected_join_keys = offline_utils.get_expected_join_keys(
140159
project, feature_views, registry
@@ -163,14 +182,19 @@ def query_generator() -> Iterator[str]:
163182
try:
164183
yield build_point_in_time_query(
165184
query_context_dict,
166-
left_table_query_string=table_name,
185+
left_table_query_string=left_table_query_string,
167186
entity_df_event_timestamp_col=entity_df_event_timestamp_col,
168187
entity_df_columns=entity_schema.keys(),
169188
query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
170189
full_feature_names=full_feature_names,
190+
entity_select_mode=config.offline_store.entity_select_mode,
171191
)
172192
finally:
173-
if table_name:
193+
# Only cleanup if we created a table
194+
if (
195+
config.offline_store.entity_select_mode
196+
== EntitySelectMode.temp_table
197+
):
174198
with _get_conn(config.offline_store) as conn, conn.cursor() as cur:
175199
cur.execute(
176200
sql.SQL(
@@ -362,6 +386,7 @@ def build_point_in_time_query(
362386
entity_df_columns: KeysView[str],
363387
query_template: str,
364388
full_feature_names: bool = False,
389+
entity_select_mode: EntitySelectMode = EntitySelectMode.temp_table,
365390
) -> str:
366391
"""Build point-in-time query between each feature view table and the entity dataframe for PostgreSQL"""
367392
template = Environment(loader=BaseLoader()).from_string(source=query_template)
@@ -389,6 +414,7 @@ def build_point_in_time_query(
389414
"featureviews": feature_view_query_contexts,
390415
"full_feature_names": full_feature_names,
391416
"final_output_feature_names": final_output_feature_names,
417+
"entity_select_mode": entity_select_mode.value,
392418
}
393419

394420
query = template.render(template_context)
@@ -429,11 +455,15 @@ def _get_entity_schema(
429455
# https://github.com/feast-dev/feast/blob/master/sdk/python/feast/infra/offline_stores/redshift.py
430456

431457
MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """
458+
WITH
459+
{% if entity_select_mode == "embed_query" %}
460+
entity_query AS ({{ left_table_query_string }}),
461+
{% endif %}
432462
/*
433463
Compute a deterministic hash for the `left_table_query_string` that will be used throughout
434464
all the logic as the field to GROUP BY the data
435465
*/
436-
WITH entity_dataframe AS (
466+
entity_dataframe AS (
437467
SELECT *,
438468
{{entity_df_event_timestamp_col}} AS entity_timestamp
439469
{% for featureview in featureviews %}
@@ -448,7 +478,12 @@ def _get_entity_schema(
448478
,CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR) AS "{{featureview.name}}__entity_row_unique_id"
449479
{% endif %}
450480
{% endfor %}
451-
FROM {{ left_table_query_string }}
481+
FROM
482+
{% if entity_select_mode == "embed_query" %}
483+
entity_query
484+
{% else %}
485+
{{ left_table_query_string }}
486+
{% endif %}
452487
),
453488
454489
{% for featureview in featureviews %}

0 commit comments

Comments
 (0)