-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: Enable Vector database and retrieve_online_documents API #4061
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
713768e
58d5d94
2cd73d1
d2e0a59
7079e7f
8c9ee97
513dd39
29d98cd
11eb97f
865baf2
47cd117
3f9f59f
7935071
ba39f93
cf53c71
92046af
d0acd2d
cc45f73
006b5c6
6e0ba03
a2302be
2e6fc55
3cbbf21
ec32764
e2d8008
523d20f
5cd085d
795699e
67b007f
33b46bd
82fe5f1
0618378
7de2016
92fed1d
d4f2639
396d7de
6c38b92
f763dc9
818c055
a51b555
2624b22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1690,6 +1690,72 @@ def _get_online_features( | |
| ) | ||
| return OnlineResponse(online_features_response) | ||
|
|
||
@log_exceptions_and_usage
def retrieve_online_documents(
    self,
    feature: str,
    query: Union[str, List[float]],
    top_k: int,
) -> OnlineResponse:
    """
    Retrieves the top k closest document features. Note, embeddings are a subset of features.

    Args:
        feature: The single document feature to retrieve from the online document store,
            given as a string feature reference of the form "feature_view:feature",
            e.g. "document_fv:document_embeddings".
        query: The query to retrieve the closest document features for. It must already be
            an embedding (a list of floats); a plain string query is rejected.
        top_k: The number of closest document features to retrieve.

    Returns:
        An OnlineResponse containing the retrieved feature values together with a
        "distance" entry for each of them.

    Raises:
        ValueError: If ``query`` is a string instead of an embedding.
    """
    # NOTE(review): how "closest" is measured is decided downstream of this method.
    # Reviewers suggested making the distance metric configurable; that was
    # deferred to a follow-up change.
    return self._retrieve_online_documents(
        feature=feature,
        query=query,
        top_k=top_k,
    )
|
|
||
def _retrieve_online_documents(
    self,
    feature: str,
    query: Union[str, List[float]],
    top_k: int,
) -> OnlineResponse:
    """
    Look up the top k closest documents for an embedded query and wrap them in a response.

    Args:
        feature: Feature reference of the form "feature_view:feature".
        query: The already-embedded query vector. String queries are not supported.
        top_k: The number of closest document features to retrieve.

    Returns:
        An OnlineResponse populated with the requested feature's values and a
        "distance" column.

    Raises:
        ValueError: If ``query`` is a string, if no feature view is returned for
            ``feature``, or if ``feature`` is not of the form "feature_view:feature".
    """
    if isinstance(query, str):
        raise ValueError(
            "Using embedding functionality is not supported for document retrieval. Please embed the query before calling retrieve_online_documents."
        )
    (
        requested_feature_views,
        _,
    ) = self._get_feature_views_to_use(
        features=[feature], allow_cache=True, hide_dummy_entity=False
    )
    if not requested_feature_views:
        # Fail with a descriptive error rather than an IndexError on the lookup below.
        raise ValueError(
            f"No feature view found for feature reference {feature!r}."
        )
    # NOTE(review): assumes the first returned feature view is the one named in
    # `feature` -- confirm that _get_feature_views_to_use filters by the reference.
    feature_view = requested_feature_views[0]

    if isinstance(feature, str):
        ref_parts = feature.split(":")
        if len(ref_parts) < 2:
            raise ValueError(
                f'Feature reference {feature!r} must have the format "feature_view:feature".'
            )
        requested_feature = ref_parts[1]
    else:
        requested_feature = feature

    provider = self._get_provider()
    document_features = self._retrieve_from_online_store(
        provider,
        feature_view,
        requested_feature,
        query,
        top_k,
    )
    # Each row is (timestamp, status, feature value, distance value).
    document_feature_vals = [row[2] for row in document_features]
    document_feature_distance_vals = [row[3] for row in document_features]
    online_features_response = GetOnlineFeaturesResponse(results=[])

    # TODO Refactor to better way of populating result
    # TODO populate entity in the response after returning entity in document_features is supported
    self._populate_result_rows_from_columnar(
        online_features_response=online_features_response,
        data={requested_feature: document_feature_vals},
    )
    self._populate_result_rows_from_columnar(
        online_features_response=online_features_response,
        data={"distance": document_feature_distance_vals},
    )
    return OnlineResponse(online_features_response)
|
|
||
| @staticmethod | ||
| def _get_columnar_entity_values( | ||
| rowise: Optional[List[Dict[str, Any]]], columnar: Optional[Dict[str, List[Any]]] | ||
|
|
@@ -1906,6 +1972,43 @@ def _read_from_online_store( | |
| read_row_protos.append((event_timestamps, statuses, values)) | ||
| return read_row_protos | ||
|
|
||
def _retrieve_from_online_store(
    self,
    provider: Provider,
    table: FeatureView,
    requested_feature: str,
    query: List[float],
    top_k: int,
) -> List[Tuple[Timestamp, "FieldStatus.ValueType", Value, Value]]:
    """
    Search and return document features from the online document store.

    Args:
        provider: Provider whose ``retrieve_online_documents`` performs the search.
        table: Feature view to search in.
        requested_feature: Name of the feature to retrieve.
        query: The embedded query vector.
        top_k: The number of closest documents to retrieve.

    Returns:
        One tuple per retrieved document: (timestamp, status, feature value,
        distance value). When either the feature value or the distance is missing,
        both are replaced by empty ``Value()`` protos and the status is
        ``FieldStatus.NOT_FOUND``; otherwise the status is ``FieldStatus.PRESENT``.
    """
    documents = provider.retrieve_online_documents(
        config=self.config,
        table=table,
        requested_feature=requested_feature,
        query=query,
        top_k=top_k,
    )

    read_row_protos = []
    for row_ts, feature_val, distance_val in documents:
        # Build a fresh Timestamp for every row. Sharing a single instance across
        # iterations would make every returned tuple alias the same proto, so all
        # rows would report the last row's timestamp, and a row without a timestamp
        # would inherit the previous row's value instead of the default.
        row_ts_proto = Timestamp()
        if row_ts is not None:
            row_ts_proto.FromDatetime(row_ts)

        if feature_val is None or distance_val is None:
            feature_val = Value()
            distance_val = Value()
            status = FieldStatus.NOT_FOUND
        else:
            status = FieldStatus.PRESENT

        read_row_protos.append((row_ts_proto, status, feature_val, distance_val))
    return read_row_protos
|
|
||
| @staticmethod | ||
| def _populate_response_from_feature_data( | ||
| feature_data: Iterable[ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,18 @@ | ||
| from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( | ||
| PostgreSQLDataSourceCreator, | ||
| ) | ||
| from tests.integration.feature_repos.integration_test_repo_config import ( | ||
| IntegrationTestRepoConfig, | ||
| ) | ||
| from tests.integration.feature_repos.universal.online_store.postgres import ( | ||
| PGVectorOnlineStoreCreator, | ||
| PostgresOnlineStoreCreator, | ||
| ) | ||
|
|
||
# Repo configurations exercised by the integration tests: first the configuration
# that passes PostgreSQLDataSourceCreator as the online store creator, then one
# configuration per named online store, in the order listed below.
FULL_REPO_CONFIGS = [
    IntegrationTestRepoConfig(online_store_creator=PostgreSQLDataSourceCreator),
    *(
        IntegrationTestRepoConfig(
            online_store=store_name, online_store_creator=store_creator
        )
        for store_name, store_creator in (
            ("postgres", PostgresOnlineStoreCreator),
            ("pgvector", PGVectorOnlineStoreCreator),
        )
    ),
]

# Online store creators keyed by online store name.
# NOTE(review): only pgvector is registered here; a reviewer later questioned
# whether that was the right choice.
AVAILABLE_ONLINE_STORES = dict(pgvector=PGVectorOnlineStoreCreator)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at this now, was this the right choice?