Merged

28 commits
e38a7c0
feat: Adding Docling RAG demo
franciscojavierarceo Mar 1, 2025
f9c7db4
updated demo
franciscojavierarceo Mar 2, 2025
4ba9fce
cleaned up notebook
franciscojavierarceo Mar 2, 2025
6c3ea35
adding chunk id
franciscojavierarceo Mar 2, 2025
410ded0
adding quickstart demo that is WIP and updating docling-demo to expor…
franciscojavierarceo Mar 2, 2025
8f0c663
adding current tentative example repo
franciscojavierarceo Mar 2, 2025
f098674
adding current temporary work
franciscojavierarceo Mar 2, 2025
99dc5f3
updating to handle missed edge case
franciscojavierarceo Mar 3, 2025
29ca46f
linter
franciscojavierarceo Mar 3, 2025
3f23d25
updating demo script to rename things
franciscojavierarceo Mar 3, 2025
d366988
updated quickstart
franciscojavierarceo Mar 3, 2025
f917183
added comment
franciscojavierarceo Mar 3, 2025
e8b9e4a
checking in progress
franciscojavierarceo Mar 6, 2025
521abb5
updating with progress...found large bug
franciscojavierarceo Mar 6, 2025
731d6f1
almost have something working...very close
franciscojavierarceo Mar 7, 2025
f0d396d
have writes behaving, now need reads
franciscojavierarceo Mar 7, 2025
86b09c5
fixed writes and reads and unit test
franciscojavierarceo Mar 7, 2025
cbcbe70
got write test working (not explode though)
franciscojavierarceo Mar 7, 2025
48fba5f
checking in progress for now, still have some issues with vector retr…
franciscojavierarceo Mar 7, 2025
fef8666
okay think i have most things working
franciscojavierarceo Mar 8, 2025
8626762
unit tests are passing...still need to lint but checking in for tonight
franciscojavierarceo Mar 8, 2025
3cd0093
fixed unit test and linter...i believe
franciscojavierarceo Mar 9, 2025
ffac095
removing commenting and unnecessary code
franciscojavierarceo Mar 9, 2025
2e9d2d5
forgot to add file for unit tests
franciscojavierarceo Mar 9, 2025
94c60d6
skipping test for some
franciscojavierarceo Mar 9, 2025
fabd394
removing rag docling demo from this branch
franciscojavierarceo Mar 9, 2025
e545e4b
adding skip test back in for milvus
franciscojavierarceo Mar 9, 2025
548e0ce
removed print statement
franciscojavierarceo Mar 9, 2025
71 changes: 69 additions & 2 deletions sdk/python/feast/feature_store.py
@@ -89,6 +89,8 @@
from feast.saved_dataset import SavedDataset, SavedDatasetStorage, ValidationReference
from feast.ssl_ca_trust_store_setup import configure_ca_trust_store_env_variables
from feast.stream_feature_view import StreamFeatureView
from feast.transformation.pandas_transformation import PandasTransformation
from feast.transformation.python_transformation import PythonTransformation
from feast.utils import _utc_now

warnings.simplefilter("once", DeprecationWarning)
@@ -1546,6 +1548,64 @@ def _get_feature_view_and_df_for_online_write(
df = pd.DataFrame(df)
except Exception as _:
raise DataFrameSerializationError(df)

# Apply transformations if this is an OnDemandFeatureView with write_to_online_store=True
if (
isinstance(feature_view, OnDemandFeatureView)
and feature_view.write_to_online_store
):
if (
feature_view.mode == "python"
and isinstance(
feature_view.feature_transformation, PythonTransformation
)
and df is not None
):
input_dict = (
df.to_dict(orient="records")[0]
if feature_view.singleton
else df.to_dict(orient="list")
)
transformed_data = feature_view.feature_transformation.udf(input_dict)
if feature_view.write_to_online_store:
entities = [
self.get_entity(entity)
for entity in (feature_view.entities or [])
]
join_keys = [entity.join_key for entity in entities if entity]
join_keys = [k for k in join_keys if k in input_dict.keys()]
transformed_df = pd.DataFrame(transformed_data)
input_df = pd.DataFrame(input_dict)
if input_df.shape[0] == transformed_df.shape[0]:
for k in input_dict:
if k not in transformed_data:
transformed_data[k] = input_dict[k]
transformed_df = pd.DataFrame(transformed_data)
else:
transformed_df = pd.merge(
transformed_df,
input_df,
how="left",
on=join_keys,
)
else:
# overwrite any transformed features and update the dictionary
for k in input_dict:
if k not in transformed_data:
transformed_data[k] = input_dict[k]
df = pd.DataFrame(transformed_data)
elif feature_view.mode == "pandas" and isinstance(
feature_view.feature_transformation, PandasTransformation
):
transformed_df = feature_view.feature_transformation.udf(df)
if df is not None:
for col in df.columns:
transformed_df[col] = df[col]
df = transformed_df

else:
raise Exception("Unsupported OnDemandFeatureView mode")

return feature_view, df

def write_to_online_store(
@@ -1887,7 +1947,7 @@ def retrieve_online_documents_v2(

(
available_feature_views,
_,
available_odfv_views,
) = utils._get_feature_views_to_use(
registry=self._registry,
project=self.project,
@@ -1898,13 +1958,20 @@
feature_view_set = set()
for feature in features:
feature_view_name = feature.split(":")[0]
feature_view = self.get_feature_view(feature_view_name)
if feature_view_name in [fv.name for fv in available_odfv_views]:
feature_view: Union[OnDemandFeatureView, FeatureView] = (
self.get_on_demand_feature_view(feature_view_name)
)
else:
feature_view = self.get_feature_view(feature_view_name)
feature_view_set.add(feature_view.name)
if len(feature_view_set) > 1:
raise ValueError("Document retrieval only supports a single feature view.")
requested_features = [
f.split(":")[1] for f in features if isinstance(f, str) and ":" in f
]
if len(available_feature_views) == 0:
available_feature_views.extend(available_odfv_views) # type: ignore[arg-type]

requested_feature_view = available_feature_views[0]
if not requested_feature_view:
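A minimal sketch of the flow these `feature_store.py` changes enable, assuming a hypothetical on-demand feature view named `chunk_embeddings` defined with `mode="python"` and `write_to_online_store=True` (view name, columns, and query vector are illustrative, not taken from this PR):

```python
import pandas as pd

from feast import FeatureStore

store = FeatureStore(repo_path=".")  # hypothetical feature repo

# The write path now applies the view's udf before persisting, so callers can
# push raw inputs and the transformed columns land in the online store merged
# back with the originals.
store.write_to_online_store(
    feature_view_name="chunk_embeddings",
    df=pd.DataFrame([{"document_id": "doc_1", "chunk_text": "Hello world"}]),
)

# retrieve_online_documents_v2 can now resolve an on-demand feature view by
# name instead of only regular feature views.
response = store.retrieve_online_documents_v2(
    features=["chunk_embeddings:embedding"],
    query=[0.1] * 60,
    top_k=3,
)
```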
3 changes: 1 addition & 2 deletions sdk/python/feast/feature_view.py
@@ -348,12 +348,11 @@ def to_proto(self) -> FeatureViewProto:
if self.stream_source:
stream_source_proto = self.stream_source.to_proto()
stream_source_proto.data_source_class_type = f"{self.stream_source.__class__.__module__}.{self.stream_source.__class__.__name__}"

spec = FeatureViewSpecProto(
name=self.name,
entities=self.entities,
entity_columns=[field.to_proto() for field in self.entity_columns],
features=[field.to_proto() for field in self.features],
features=[feature.to_proto() for feature in self.features],
description=self.description,
tags=self.tags,
owner=self.owner,
sdk/python/feast/infra/online_stores/milvus.py
@@ -197,10 +197,14 @@ def _get_or_create_collection(
)
index_params = self.client.prepare_index_params()
for vector_field in schema.fields:
if vector_field.dtype in [
DataType.FLOAT_VECTOR,
DataType.BINARY_VECTOR,
]:
if (
vector_field.dtype
in [
DataType.FLOAT_VECTOR,
DataType.BINARY_VECTOR,
]
and vector_field.name in vector_field_dict
):
metric = vector_field_dict[
vector_field.name
].vector_search_metric
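For context, the new guard in the Milvus online store only builds an index for vector fields that appear in `vector_field_dict`, i.e. fields registered with `vector_index=True`; other float or binary vector columns are now skipped. A sketch of such a declaration (field name and metric are illustrative):

```python
from feast import Field
from feast.types import Array, Float32

# Only fields flagged like this end up in vector_field_dict and get a
# Milvus index built with the requested search metric.
embedding = Field(
    name="embedding",
    dtype=Array(Float32),
    vector_index=True,
    vector_search_metric="COSINE",
)
```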
21 changes: 12 additions & 9 deletions sdk/python/feast/infra/online_stores/sqlite.py
@@ -167,7 +167,10 @@ def online_write_batch(
table_name = _table_id(project, table)
for feature_name, val in values.items():
if config.online_store.vector_enabled:
if feature_type_dict[feature_name] in FEAST_VECTOR_TYPES:
if (
feature_type_dict.get(feature_name, None)
in FEAST_VECTOR_TYPES
):
val_bin = serialize_f32(
val.float_list_val.val, config.online_store.vector_len
) # type: ignore
@@ -226,22 +229,22 @@ def online_read(

result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = []

serialized_entity_keys = [
serialize_entity_key(
entity_key,
entity_key_serialization_version=config.entity_key_serialization_version,
)
for entity_key in entity_keys
]
# Fetch all entities in one go
cur.execute(
f"SELECT entity_key, feature_name, value, event_ts "
f"FROM {_table_id(config.project, table)} "
f"WHERE entity_key IN ({','.join('?' * len(entity_keys))}) "
f"ORDER BY entity_key",
[
serialize_entity_key(
entity_key,
entity_key_serialization_version=config.entity_key_serialization_version,
)
for entity_key in entity_keys
],
serialized_entity_keys,
)
rows = cur.fetchall()

rows = {
k: list(group) for k, group in itertools.groupby(rows, key=lambda r: r[0])
}
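The SQLite read path now serializes every entity key once and issues a single `IN (...)` query instead of one query per key. A self-contained sketch of the same pattern against a throwaway SQLite table (table name and keys are placeholders):

```python
import itertools
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute(
    "CREATE TABLE feast_fv "
    "(entity_key BLOB, feature_name TEXT, value BLOB, event_ts TEXT)"
)

serialized_entity_keys = [b"key-1", b"key-2"]  # stand-ins for serialized keys

# One query for all entities; ORDER BY keeps rows for the same key adjacent.
cur.execute(
    "SELECT entity_key, feature_name, value, event_ts FROM feast_fv "
    f"WHERE entity_key IN ({','.join('?' * len(serialized_entity_keys))}) "
    "ORDER BY entity_key",
    serialized_entity_keys,
)

# groupby needs sorted input, which the ORDER BY above guarantees.
rows = {k: list(g) for k, g in itertools.groupby(cur.fetchall(), key=lambda r: r[0])}
```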
2 changes: 1 addition & 1 deletion sdk/python/feast/infra/passthrough_provider.py
@@ -449,7 +449,7 @@ def materialize_single_feature_view(
def get_historical_features(
self,
config: RepoConfig,
feature_views: List[FeatureView],
feature_views: List[Union[FeatureView, OnDemandFeatureView]],
feature_refs: List[str],
entity_df: Union[pd.DataFrame, str],
registry: BaseRegistry,
2 changes: 1 addition & 1 deletion sdk/python/feast/infra/provider.py
@@ -242,7 +242,7 @@ def materialize_single_feature_view(
def get_historical_features(
self,
config: RepoConfig,
feature_views: List[FeatureView],
feature_views: List[Union[FeatureView, OnDemandFeatureView]],
feature_refs: List[str],
entity_df: Union[pd.DataFrame, str],
registry: BaseRegistry,
67 changes: 67 additions & 0 deletions sdk/python/feast/nlp_test_data.py
@@ -0,0 +1,67 @@
from datetime import datetime
from typing import Dict

import numpy as np
import pandas as pd


def create_document_chunks_df(
documents: Dict[str, str],
start_date: datetime,
end_date: datetime,
embedding_size: int = 60,
) -> pd.DataFrame:
"""
Example df generated by this function:

| event_timestamp | document_id | chunk_id | chunk_text | embedding | created |
|------------------+-------------+----------+------------------+-----------+------------------|
| 2021-03-17 19:31 | doc_1 | chunk-1 | Hello world | [0.1, ...]| 2021-03-24 19:34 |
| 2021-03-17 19:31 | doc_1 | chunk-2 | How are you? | [0.2, ...]| 2021-03-24 19:34 |
| 2021-03-17 19:31 | doc_2 | chunk-1 | This is a test | [0.3, ...]| 2021-03-24 19:34 |
| 2021-03-17 19:31 | doc_2 | chunk-2 | Document chunk | [0.4, ...]| 2021-03-24 19:34 |
"""
df_hourly = pd.DataFrame(
{
"event_timestamp": [
pd.Timestamp(dt, unit="ms").round("ms")
for dt in pd.date_range(
start=start_date,
end=end_date,
freq="1h",
inclusive="left",
tz="UTC",
)
]
+ [
pd.Timestamp(
year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC"
)
]
}
)
df_all_chunks = pd.DataFrame()

for doc_id, doc_text in documents.items():
chunks = doc_text.split(". ") # Simple chunking by sentence
for chunk_id, chunk_text in enumerate(chunks, start=1):
df_hourly_copy = df_hourly.copy()
df_hourly_copy["document_id"] = doc_id
df_hourly_copy["chunk_id"] = f"chunk-{chunk_id}"
df_hourly_copy["chunk_text"] = chunk_text
df_all_chunks = pd.concat([df_hourly_copy, df_all_chunks])

df_all_chunks.reset_index(drop=True, inplace=True)
rows = df_all_chunks["event_timestamp"].count()

# Generate random embeddings for each chunk
df_all_chunks["embedding"] = [
np.random.rand(embedding_size).tolist() for _ in range(rows)
]
df_all_chunks["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))

# Create duplicate rows that should be filtered by created timestamp
late_row = df_all_chunks[rows // 2 : rows // 2 + 1]
df_all_chunks = pd.concat([df_all_chunks, late_row, late_row], ignore_index=True)

return df_all_chunks
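A possible invocation of the new helper (documents and dates are arbitrary):

```python
from datetime import datetime

from feast.nlp_test_data import create_document_chunks_df

df = create_document_chunks_df(
    documents={
        "doc_1": "Hello world. How are you?",
        "doc_2": "This is a test. Document chunk",
    },
    start_date=datetime(2021, 3, 17),
    end_date=datetime(2021, 3, 19),
    embedding_size=60,
)
# Two documents split into two chunks each, one row per hour per chunk,
# plus the duplicate late-arriving rows appended at the end.
print(df[["document_id", "chunk_id", "chunk_text"]].drop_duplicates())
```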
25 changes: 18 additions & 7 deletions sdk/python/feast/on_demand_feature_view.py
@@ -339,7 +339,6 @@ def to_proto(self) -> OnDemandFeatureViewProto:
write_to_online_store=self.write_to_online_store,
singleton=self.singleton if self.singleton else False,
)

return OnDemandFeatureViewProto(spec=spec, meta=meta)

@classmethod
@@ -454,6 +453,8 @@ def from_proto(
Field(
name=feature.name,
dtype=from_value_type(ValueType(feature.value_type)),
vector_index=feature.vector_index,
vector_search_metric=feature.vector_search_metric,
)
for feature in on_demand_feature_view_proto.spec.features
],
@@ -640,13 +641,25 @@ def transform_dict(

def infer_features(self) -> None:
random_input = self._construct_random_input(singleton=self.singleton)
inferred_features = self.feature_transformation.infer_features(random_input)
inferred_features = self.feature_transformation.infer_features(
random_input=random_input, singleton=self.singleton
)

if self.features:
missing_features = []
for specified_feature in self.features:
if specified_feature not in inferred_features:
if (
specified_feature not in inferred_features
and "Array" not in specified_feature.dtype.__str__()
):
missing_features.append(specified_feature)
elif "Array" in specified_feature.dtype.__str__():
if specified_feature.name not in [
f.name for f in inferred_features
]:
missing_features.append(specified_feature)
else:
pass
if missing_features:
raise SpecifiedFeaturesNotPresentError(
missing_features, inferred_features, self.name
@@ -738,6 +751,7 @@ def on_demand_feature_view(
owner: str = "",
write_to_online_store: bool = False,
singleton: bool = False,
explode: bool = False,
):
"""
Creates an OnDemandFeatureView object with the given user function as udf.
@@ -759,6 +773,7 @@
the online store for faster retrieval.
singleton (optional): A boolean that indicates whether the transformation is executed on a singleton
(only applicable when mode="python").
explode (optional): A boolean that indicates whether the transformation explodes the input data into multiple rows.
"""

def mainify(obj) -> None:
@@ -778,10 +793,6 @@ def decorator(user_function):
)
transformation = PandasTransformation(user_function, udf_string)
elif mode == "python":
if return_annotation not in (inspect._empty, dict[str, Any]):
raise TypeError(
f"return signature for {user_function} is {return_annotation} but should be dict[str, Any]"
)
transformation = PythonTransformation(user_function, udf_string)
elif mode == "substrait":
from ibis.expr.types.relations import Table
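A hedged sketch of the decorator surface after this change: Python mode no longer rejects udfs without a `dict[str, Any]` return annotation, and `singleton` (and the new `explode` flag) can be combined with `write_to_online_store`. The request source, schema, and udf below are illustrative:

```python
from feast import Field, RequestSource
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import Array, Float32, String

chunk_request = RequestSource(
    name="chunk_request",
    schema=[Field(name="chunk_text", dtype=String)],
)

@on_demand_feature_view(
    sources=[chunk_request],
    schema=[Field(name="embedding", dtype=Array(Float32))],
    mode="python",
    write_to_online_store=True,  # udf runs on write and its output is persisted
    singleton=True,              # udf sees a single row as a plain dict
)
def embed_chunk(inputs: dict):
    # Placeholder embedding; a real udf would call an embedding model here.
    return {"embedding": [float(len(inputs["chunk_text"]))] * 60}
```

With `singleton=True` the write path unpacks a one-row DataFrame via `df.to_dict(orient="records")[0]`, so the udf receives and returns plain dicts.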
6 changes: 4 additions & 2 deletions sdk/python/feast/transformation/pandas_transformation.py
@@ -1,4 +1,4 @@
from typing import Any, Callable
from typing import Any, Callable, Optional

import dill
import pandas as pd
@@ -40,7 +40,9 @@ def transform_singleton(self, input_df: pd.DataFrame) -> pd.DataFrame:
"PandasTransformation does not support singleton transformations."
)

def infer_features(self, random_input: dict[str, list[Any]]) -> list[Field]:
def infer_features(
self, random_input: dict[str, list[Any]], singleton: Optional[bool]
) -> list[Field]:
df = pd.DataFrame.from_dict(random_input)
output_df: pd.DataFrame = self.transform(df)

12 changes: 9 additions & 3 deletions sdk/python/feast/transformation/python_transformation.py
@@ -1,5 +1,5 @@
from types import FunctionType
from typing import Any
from typing import Any, Optional

import dill
import pyarrow
@@ -45,7 +45,9 @@ def transform_singleton(self, input_dict: dict) -> dict:
output_dict = self.udf.__call__(input_dict)
return {**input_dict, **output_dict}

def infer_features(self, random_input: dict[str, Any]) -> list[Field]:
def infer_features(
self, random_input: dict[str, Any], singleton: Optional[bool] = False
) -> list[Field]:
output_dict: dict[str, Any] = self.transform(random_input)

fields = []
@@ -58,6 +60,10 @@ def infer_features(self, random_input: dict[str, Any]) -> list[Field]:
)
inferred_type = type(feature_value[0])
inferred_value = feature_value[0]
if singleton:
inferred_value = feature_value
inferred_type = None # type: ignore

else:
inferred_type = type(feature_value)
inferred_value = feature_value
@@ -69,7 +75,7 @@ def infer_features(self, random_input: dict[str, Any]) -> list[Field]:
python_type_to_feast_value_type(
feature_name,
value=inferred_value,
type_name=inferred_type.__name__,
type_name=inferred_type.__name__ if inferred_type else None,
)
),
)
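A small illustration of the new `singleton` branch in `infer_features`: a list-valued output is treated as a single array feature value rather than a batch, so the Feast type is inferred from the whole list with `type_name=None` (the udf and input are illustrative):

```python
from feast.transformation.python_transformation import PythonTransformation

def udf(inputs: dict) -> dict:
    # Pretend-embedding derived from the input text.
    return {"embedding": [0.1, 0.2, 0.3]}

transformation = PythonTransformation(udf, udf_string="")
fields = transformation.infer_features(
    random_input={"chunk_text": "hello"},
    singleton=True,
)
print([(f.name, f.dtype) for f in fields])  # embedding inferred as an array type
```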