-
Notifications
You must be signed in to change notification settings - Fork 1.3k
perf(postgres): Optimizing feast offline Store for date-range multi-FV retrieval #6057
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Vperiodt
wants to merge
16
commits into
feast-dev:master
Choose a base branch
from
Vperiodt:patch-query
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
d1fe30e
Optimizing Date-Range Queries
Vperiodt e7e08fd
Merge branch 'feast-dev:master' into patch-query
Vperiodt 4bde7dd
fix issues
Vperiodt 02e846b
fix lint
Vperiodt cdf91f9
Merge branch 'feast-dev:master' into patch-query
Vperiodt acae663
Merge branch 'feast-dev:master' into patch-query
Vperiodt 0794b11
featureview entities
Vperiodt 4383dba
fix LOCF spine UNION and empty-entity SQL
Vperiodt a6fbf82
Merge branch 'feast-dev:master' into patch-query
Vperiodt 8ee52cd
Merge branch 'feast-dev:master' into patch-query
Vperiodt 4980d80
added fixes
Vperiodt cf3c756
updates test
Vperiodt ae29d2a
fix entity_df range
Vperiodt 7644e10
Merge branch 'feast-dev:master' into patch-query
Vperiodt 84c1314
updated postgres.py
Vperiodt d96cee1
Merge branch 'master' into patch-query
Vperiodt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -139,17 +139,18 @@ def get_historical_features( | |
| end_date = _utc_now() | ||
| else: | ||
| end_date = make_tzaware(end_date) | ||
| # Find the maximum TTL across all feature views to ensure we capture enough data | ||
| max_ttl_seconds = max( | ||
| ( | ||
| int(fv.ttl.total_seconds()) | ||
| for fv in feature_views | ||
| if fv.ttl and isinstance(fv.ttl, timedelta) | ||
| ), | ||
| default=0, | ||
| ) | ||
|
|
||
| # Calculate start_date from TTL if not provided | ||
|
|
||
| if start_date is None: | ||
| # Find the maximum TTL across all feature views to ensure we capture enough data | ||
| max_ttl_seconds = 0 | ||
| for fv in feature_views: | ||
| if fv.ttl and isinstance(fv.ttl, timedelta): | ||
| ttl_seconds = int(fv.ttl.total_seconds()) | ||
| max_ttl_seconds = max(max_ttl_seconds, ttl_seconds) | ||
|
|
||
| if max_ttl_seconds > 0: | ||
| # Start from (end_date - max_ttl) to ensure we capture all relevant features | ||
| start_date = end_date - timedelta(seconds=max_ttl_seconds) | ||
|
|
@@ -161,6 +162,13 @@ def get_historical_features( | |
|
|
||
| entity_df = pd.DataFrame({"event_timestamp": [end_date]}) | ||
|
|
||
| # Single row with end_date | ||
| entity_df = pd.DataFrame({"event_timestamp": [end_date]}) | ||
| skip_entity_upload = True | ||
| else: | ||
| lookback_start_date = None | ||
| skip_entity_upload = False | ||
|
|
||
| entity_schema = _get_entity_schema(entity_df, config) | ||
|
|
||
| entity_df_event_timestamp_col = ( | ||
|
|
@@ -190,7 +198,12 @@ def query_generator() -> Iterator[str]: | |
| and config.offline_store.entity_select_mode | ||
| == EntitySelectMode.embed_query | ||
| ) | ||
| if use_cte: | ||
| if skip_entity_upload: | ||
| # LOCF path never uses left_table | ||
| left_table_query_string = ( | ||
| "(SELECT NULL::timestamptz AS event_timestamp LIMIT 0)" | ||
| ) | ||
| elif use_cte: | ||
| left_table_query_string = entity_df | ||
| else: | ||
| left_table_query_string = table_name | ||
|
|
@@ -231,11 +244,13 @@ def query_generator() -> Iterator[str]: | |
| use_cte=use_cte, | ||
| start_date=start_date, | ||
| end_date=end_date, | ||
| lookback_start_date=lookback_start_date, | ||
| ) | ||
| finally: | ||
| # Only cleanup if we created a table | ||
| # Only cleanup if we created a table (not when skip_entity_upload) | ||
| if ( | ||
| config.offline_store.entity_select_mode | ||
| not skip_entity_upload | ||
| and config.offline_store.entity_select_mode | ||
| == EntitySelectMode.temp_table | ||
| ): | ||
| with _get_conn(config.offline_store) as conn, conn.cursor() as cur: | ||
|
|
@@ -445,6 +460,7 @@ def build_point_in_time_query( | |
| use_cte: bool = False, | ||
| start_date: Optional[datetime] = None, | ||
| end_date: Optional[datetime] = None, | ||
| lookback_start_date: Optional[datetime] = None, | ||
| ) -> str: | ||
| """Build point-in-time query between each feature view table and the entity dataframe for PostgreSQL""" | ||
| template = Environment(loader=BaseLoader()).from_string(source=query_template) | ||
|
|
@@ -475,6 +491,7 @@ def build_point_in_time_query( | |
| "use_cte": use_cte, | ||
| "start_date": start_date, | ||
| "end_date": end_date, | ||
| "lookback_start_date": lookback_start_date, | ||
| } | ||
|
|
||
| query = template.render(template_context) | ||
|
|
@@ -537,89 +554,173 @@ def _get_entity_schema( | |
| AND "{{ featureviews[0].timestamp_field }}" >= '{{ featureviews[0].min_event_timestamp }}' | ||
| {% endif %} | ||
| {% else %} | ||
| {# --- LOCF (Last Observation Carried Forward) path for multi-FV date-range --- #} | ||
|
|
||
| {# Collect deduplicated entity list across all FVs #} | ||
| {% set all_entities = [] %} | ||
| {% for featureview in featureviews %} | ||
| {% for entity in featureview.entities %} | ||
| {% if entity not in all_entities %} | ||
| {% set _ = all_entities.append(entity) %} | ||
| {% endif %} | ||
| {% endfor %} | ||
| {% endfor %} | ||
|
|
||
| WITH | ||
| {# --- Per-FV __data: pull feature rows from lookback_start_date..end_date --- #} | ||
| {% for featureview in featureviews %} | ||
| "{{ featureview.name }}__data" AS ( | ||
| "{{ featureview.name }}__data_raw" AS ( | ||
| SELECT | ||
| "{{ featureview.timestamp_field }}" AS event_timestamp, | ||
| {% if featureview.created_timestamp_column %} | ||
| "{{ featureview.created_timestamp_column }}" AS created_timestamp, | ||
| {% endif %} | ||
| {% for entity in featureview.entities %} | ||
| "{{ entity }}", | ||
| {% endfor %} | ||
| {% for feature in featureview.features %} | ||
| "{{ feature }}" AS {% if full_feature_names %}"{{ featureview.name }}__{{ featureview.field_mapping.get(feature, feature) }}"{% else %}"{{ featureview.field_mapping.get(feature, feature) }}"{% endif %}{% if not loop.last %},{% endif %} | ||
| "{{ feature }}" AS "{% if full_feature_names %}{{ featureview.name }}__{{ featureview.field_mapping.get(feature, feature) }}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}"{% if not loop.last %},{% endif %} | ||
| {% endfor %} | ||
| {% if featureview.created_timestamp_column %} | ||
| , "{{ featureview.created_timestamp_column }}" AS created_timestamp | ||
| {% endif %} | ||
| FROM {{ featureview.table_subquery }} AS sub | ||
| WHERE "{{ featureview.timestamp_field }}" BETWEEN '{{ start_date }}' AND '{{ end_date }}' | ||
| {% if featureview.ttl != 0 and featureview.min_event_timestamp %} | ||
| AND "{{ featureview.timestamp_field }}" >= '{{ featureview.min_event_timestamp }}' | ||
| {% endif %} | ||
| WHERE "{{ featureview.timestamp_field }}" BETWEEN '{{ lookback_start_date or start_date }}' AND '{{ end_date }}' | ||
| ), | ||
| {% if featureview.created_timestamp_column %} | ||
| "{{ featureview.name }}__data" AS ( | ||
| SELECT * FROM ( | ||
| SELECT *, | ||
| ROW_NUMBER() OVER ( | ||
| PARTITION BY {% for entity in featureview.entities %}"{{ entity }}", {% endfor %}event_timestamp | ||
| ORDER BY created_timestamp DESC | ||
| ) AS __rn | ||
| FROM "{{ featureview.name }}__data_raw" | ||
| ) __dedup WHERE __rn = 1 | ||
| ), | ||
| {% endif %} | ||
| {% endfor %} | ||
|
|
||
| -- Create a base query with all unique entity + timestamp combinations | ||
| base_entities AS ( | ||
| {# --- Spine: prediction timeline = distinct (entity, timestamp) in [start_date, end_date] --- #} | ||
| {# Each UNION branch selects all_entities so column count/order match; NULL for entities not in this FV. #} | ||
| spine AS ( | ||
| {% for featureview in featureviews %} | ||
| SELECT DISTINCT | ||
| event_timestamp, | ||
| {% for entity in featureview.entities %} | ||
| "{{ entity }}"{% if not loop.last %},{% endif %} | ||
| {% endfor %} | ||
| FROM "{{ featureview.name }}__data" | ||
| d.event_timestamp{% for entity in all_entities %}, | ||
| {% if entity in featureview.entities %}d."{{ entity }}"{% else %}NULL AS "{{ entity }}"{% endif %}{% endfor %} | ||
| FROM "{{ featureview.name }}__data{% if not featureview.created_timestamp_column %}_raw{% endif %}" d | ||
| WHERE d.event_timestamp BETWEEN '{{ start_date }}' AND '{{ end_date }}' | ||
| {% if not loop.last %} | ||
| UNION | ||
| {% endif %} | ||
| {% endfor %} | ||
| ) | ||
| ), | ||
|
|
||
| SELECT | ||
| base.event_timestamp, | ||
| {% set all_entities = [] %} | ||
| {% for featureview in featureviews %} | ||
| {% for entity in featureview.entities %} | ||
| {% if entity not in all_entities %} | ||
| {% set _ = all_entities.append(entity) %} | ||
| {# --- Per-FV independent LOCF forward-fill pipelines --- #} | ||
| {# Each FV is stacked, grouped, and filled independently to prevent #} | ||
| {# cross-FV interference when multiple FVs share overlapping timestamps. #} | ||
| {% for featureview in featureviews %} | ||
| "{{ featureview.name }}__stacked" AS ( | ||
| SELECT | ||
| s.event_timestamp{% for entity in all_entities %}, | ||
| s."{{ entity }}"{% endfor %}, | ||
| 1 AS is_spine, | ||
| NULL::int AS feature_anchor | ||
| {% for feature in featureview.features %} | ||
| {% if full_feature_names %} | ||
| {% set col_name = featureview.name ~ '__' ~ featureview.field_mapping.get(feature, feature) %} | ||
| {% else %} | ||
| {% set col_name = featureview.field_mapping.get(feature, feature) %} | ||
| {% endif %} | ||
| , NULL AS "{{ col_name }}" | ||
| {% endfor %} | ||
| {% endfor %} | ||
| {% for entity in all_entities %} | ||
| base."{{ entity }}", | ||
| {% endfor %} | ||
| , NULL::timestamptz AS "{{ featureview.name }}__feature_ts" | ||
| FROM spine s | ||
|
|
||
| UNION ALL | ||
|
|
||
| SELECT | ||
| d.event_timestamp{% for entity in all_entities %}, | ||
| {% if entity in featureview.entities %}d."{{ entity }}"{% else %}NULL AS "{{ entity }}"{% endif %}{% endfor %}, | ||
| 0 AS is_spine, | ||
| 1 AS feature_anchor | ||
| {% for feature in featureview.features %} | ||
| {% if full_feature_names %} | ||
| {% set col_name = featureview.name ~ '__' ~ featureview.field_mapping.get(feature, feature) %} | ||
| {% else %} | ||
| {% set col_name = featureview.field_mapping.get(feature, feature) %} | ||
| {% endif %} | ||
| , d."{{ col_name }}" | ||
| {% endfor %} | ||
| , d.event_timestamp AS "{{ featureview.name }}__feature_ts" | ||
| FROM "{{ featureview.name }}__data{% if not featureview.created_timestamp_column %}_raw{% endif %}" d | ||
| ), | ||
|
|
||
| "{{ featureview.name }}__grouped" AS ( | ||
| SELECT *, | ||
| COUNT(feature_anchor) OVER ( | ||
| PARTITION BY {% if featureview.entities %}{% for entity in featureview.entities %}"{{ entity }}"{% if not loop.last %}, {% endif %}{% endfor %}{% else %}(SELECT NULL){% endif %} | ||
| ORDER BY event_timestamp ASC, is_spine ASC | ||
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | ||
| ) AS value_group_id | ||
| FROM "{{ featureview.name }}__stacked" | ||
devin-ai-integration[bot] marked this conversation as resolved.
Show resolved
Hide resolved
devin-ai-integration[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ), | ||
|
|
||
| "{{ featureview.name }}__filled" AS ( | ||
| SELECT | ||
| event_timestamp{% for entity in all_entities %}, | ||
| "{{ entity }}"{% endfor %}, | ||
| is_spine | ||
| {% for feature in featureview.features %} | ||
| {% if full_feature_names %} | ||
| {% set col_name = featureview.name ~ '__' ~ featureview.field_mapping.get(feature, feature) %} | ||
| {% else %} | ||
| {% set col_name = featureview.field_mapping.get(feature, feature) %} | ||
| {% endif %} | ||
| , FIRST_VALUE("{{ col_name }}") OVER ( | ||
| PARTITION BY {% if featureview.entities %}{% for entity in featureview.entities %}"{{ entity }}"{% if not loop.last %}, {% endif %}{% endfor %}, {% endif %}value_group_id | ||
| ORDER BY event_timestamp ASC, is_spine ASC | ||
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | ||
| ) AS "{{ col_name }}" | ||
| {% endfor %} | ||
| , FIRST_VALUE("{{ featureview.name }}__feature_ts") OVER ( | ||
| PARTITION BY {% if featureview.entities %}{% for entity in featureview.entities %}"{{ entity }}"{% if not loop.last %}, {% endif %}{% endfor %}, {% endif %}value_group_id | ||
| ORDER BY event_timestamp ASC, is_spine ASC | ||
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | ||
| ) AS "{{ featureview.name }}__filled_ts" | ||
| FROM "{{ featureview.name }}__grouped" | ||
| ){% if not loop.last %},{% endif %} | ||
| {% endfor %} | ||
|
|
||
| {# --- Final: join per-FV filled results back onto spine --- #} | ||
| SELECT | ||
| spine.event_timestamp{% for entity in all_entities %}, | ||
| spine."{{ entity }}"{% endfor %}, | ||
| {% set total_features = featureviews|map(attribute='features')|map('length')|sum %} | ||
| {% set feature_counter = namespace(count=0) %} | ||
| {% set feat_idx = namespace(count=0) %} | ||
| {% for featureview in featureviews %} | ||
| {% set outer_loop_index = loop.index0 %} | ||
| {% for feature in featureview.features %} | ||
| {% set feature_counter.count = feature_counter.count + 1 %} | ||
| fv_{{ outer_loop_index }}."{% if full_feature_names %}{{ featureview.name }}__{{ featureview.field_mapping.get(feature, feature) }}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}"{% if feature_counter.count < total_features %},{% endif %} | ||
| {% set feat_idx.count = feat_idx.count + 1 %} | ||
| {% if full_feature_names %} | ||
| {% set col_name = featureview.name ~ '__' ~ featureview.field_mapping.get(feature, feature) %} | ||
| {% else %} | ||
| {% set col_name = featureview.field_mapping.get(feature, feature) %} | ||
| {% endif %} | ||
| {% if featureview.ttl != 0 %} | ||
| CASE WHEN (spine.event_timestamp - "{{ featureview.name }}__f"."{{ featureview.name }}__filled_ts") <= make_interval(secs => {{ featureview.ttl }}) | ||
| THEN "{{ featureview.name }}__f"."{{ col_name }}" ELSE NULL END AS "{{ col_name }}"{% if feat_idx.count < total_features %},{% endif %} | ||
| {% else %} | ||
| "{{ featureview.name }}__f"."{{ col_name }}"{% if feat_idx.count < total_features %},{% endif %} | ||
| {% endif %} | ||
| {% endfor %} | ||
| {% endfor %} | ||
| FROM base_entities base | ||
| FROM spine | ||
| {% for featureview in featureviews %} | ||
| {% set outer_loop_index = loop.index0 %} | ||
| LEFT JOIN LATERAL ( | ||
| SELECT DISTINCT ON ({% for entity in featureview.entities %}"{{ entity }}"{% if not loop.last %}, {% endif %}{% endfor %}) | ||
| event_timestamp, | ||
| {% for entity in featureview.entities %} | ||
| "{{ entity }}", | ||
| {% endfor %} | ||
| {% for feature in featureview.features %} | ||
| "{% if full_feature_names %}{{ featureview.name }}__{{ featureview.field_mapping.get(feature, feature) }}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}"{% if not loop.last %},{% endif %} | ||
| {% endfor %} | ||
| FROM "{{ featureview.name }}__data" fv_sub_{{ outer_loop_index }} | ||
| WHERE fv_sub_{{ outer_loop_index }}.event_timestamp <= base.event_timestamp | ||
| {% if featureview.ttl != 0 %} | ||
| AND fv_sub_{{ outer_loop_index }}.event_timestamp >= base.event_timestamp - {{ featureview.ttl }} * interval '1' second | ||
| {% endif %} | ||
| {% for entity in featureview.entities %} | ||
| AND fv_sub_{{ outer_loop_index }}."{{ entity }}" = base."{{ entity }}" | ||
| LEFT JOIN "{{ featureview.name }}__filled" AS "{{ featureview.name }}__f" | ||
| ON spine.event_timestamp = "{{ featureview.name }}__f".event_timestamp | ||
| {% for entity in all_entities %} | ||
| AND spine."{{ entity }}" IS NOT DISTINCT FROM "{{ featureview.name }}__f"."{{ entity }}" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good use of `IS NOT DISTINCT FROM` here. |
||
| {% endfor %} | ||
| ORDER BY {% for entity in featureview.entities %}"{{ entity }}"{% if not loop.last %}, {% endif %}{% endfor %}, event_timestamp DESC | ||
| ) AS fv_{{ outer_loop_index }} ON true | ||
| AND "{{ featureview.name }}__f".is_spine = 1 | ||
| {% endfor %} | ||
| ORDER BY base.event_timestamp | ||
| ORDER BY spine.event_timestamp | ||
| {% endif %} | ||
| {% else %} | ||
| WITH | ||
|
|
@@ -699,7 +800,7 @@ def _get_entity_schema( | |
| FROM {{ featureview.table_subquery }} AS sub | ||
| WHERE "{{ featureview.timestamp_field }}" <= (SELECT MAX(entity_timestamp) FROM entity_dataframe) | ||
| {% if featureview.ttl == 0 %}{% else %} | ||
| AND "{{ featureview.timestamp_field }}" >= (SELECT MIN(entity_timestamp) FROM entity_dataframe) - {{ featureview.ttl }} * interval '1' second | ||
| AND "{{ featureview.timestamp_field }}" >= (SELECT MIN(entity_timestamp) FROM entity_dataframe) - make_interval(secs => {{ featureview.ttl }}) | ||
| {% endif %} | ||
| ), | ||
|
|
||
|
|
@@ -714,7 +815,7 @@ def _get_entity_schema( | |
| AND subquery.event_timestamp <= entity_dataframe.entity_timestamp | ||
|
|
||
| {% if featureview.ttl == 0 %}{% else %} | ||
| AND subquery.event_timestamp >= entity_dataframe.entity_timestamp - {{ featureview.ttl }} * interval '1' second | ||
| AND subquery.event_timestamp >= entity_dataframe.entity_timestamp - make_interval(secs => {{ featureview.ttl }}) | ||
| {% endif %} | ||
|
|
||
| {% for entity in featureview.entities %} | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: When there's no `created_timestamp_column`, this CTE is a no-op passthrough (`SELECT * FROM __data_raw`). Postgres will likely optimize it away, but you could simplify by using `__data_raw` as the alias directly. Then reference
`{{ featureview.name }}__data{% if not featureview.created_timestamp_column %}_raw{% endif %}` downstream. Though I understand the current approach is simpler to maintain.