Merged
29 commits
33f5d7f
feat: Adding Docling RAG demo
franciscojavierarceo Mar 1, 2025
292f2c9
updated demo
franciscojavierarceo Mar 2, 2025
38f8c23
cleaned up notebook
franciscojavierarceo Mar 2, 2025
fea2a5e
adding chunk id
franciscojavierarceo Mar 2, 2025
a33545b
adding quickstart demo that is WIP and updating docling-demo to expor…
franciscojavierarceo Mar 2, 2025
8c7dc00
adding current tentative exmaple repo
franciscojavierarceo Mar 2, 2025
d1f4269
adding current temporary work
franciscojavierarceo Mar 2, 2025
6609cc3
updating demo script to rename things
franciscojavierarceo Mar 3, 2025
48f77e4
updated quickstart
franciscojavierarceo Mar 3, 2025
26919a3
added comment
franciscojavierarceo Mar 3, 2025
fb1bce4
checking in progress
franciscojavierarceo Mar 6, 2025
4340ca6
checking in progress for now, still have some issues with vector retr…
franciscojavierarceo Mar 7, 2025
0564f7d
okay think i have most things working
franciscojavierarceo Mar 8, 2025
637df02
removing commenting and unnecessary code
franciscojavierarceo Mar 9, 2025
31e7f85
updated type mapping for PDFs
franciscojavierarceo Mar 14, 2025
7978df9
updated test case
franciscojavierarceo Mar 14, 2025
b249b87
updated unit test
franciscojavierarceo Mar 17, 2025
fdedcbe
linter
franciscojavierarceo Mar 17, 2025
3a8fa01
limiting test run
franciscojavierarceo Mar 17, 2025
b8de03a
missed import
franciscojavierarceo Mar 17, 2025
f83153c
uploading demo
franciscojavierarceo Mar 17, 2025
1fbfdb1
only running on mac 13
franciscojavierarceo Mar 17, 2025
3c85653
updated implementation to work with docling
franciscojavierarceo Mar 20, 2025
7b1f059
remove print statement
franciscojavierarceo Mar 20, 2025
b99530a
using get_online_features instead
franciscojavierarceo Mar 20, 2025
09c73a7
demo working
franciscojavierarceo Mar 20, 2025
c416bcc
removing files from other PR
franciscojavierarceo Mar 21, 2025
def595e
removing files from other PR
franciscojavierarceo Mar 21, 2025
8379a97
reverting print
franciscojavierarceo Mar 21, 2025
24 changes: 21 additions & 3 deletions sdk/python/feast/feature_store.py
@@ -1566,16 +1566,34 @@ def _get_feature_view_and_df_for_online_write(
if feature_view.singleton
else df.to_dict(orient="list")
)
transformed_data = feature_view.feature_transformation.udf(input_dict)
if feature_view.singleton:
transformed_data = df.apply(
feature_view.feature_transformation.udf, axis=1
)
transformed_data = pd.DataFrame(
transformed_data.to_list()
).applymap(
lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x
)
else:
transformed_data = feature_view.feature_transformation.udf(
input_dict
)
if feature_view.write_to_online_store:
entities = [
self.get_entity(entity)
for entity in (feature_view.entities or [])
]
join_keys = [entity.join_key for entity in entities if entity]
join_keys = [k for k in join_keys if k in input_dict.keys()]
transformed_df = pd.DataFrame(transformed_data)
input_df = pd.DataFrame(input_dict)
transformed_df = (
pd.DataFrame(transformed_data)
if not isinstance(transformed_data, pd.DataFrame)
else transformed_data
)
input_df = pd.DataFrame(
[input_dict] if feature_view.singleton else input_dict
)
if input_df.shape[0] == transformed_df.shape[0]:
for k in input_dict:
if k not in transformed_data:
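Note on the singleton write path above: the new branch applies the transformation UDF to each row and then unwraps single-element lists back into scalars before building the transformed DataFrame. A minimal standalone sketch of that flattening behavior, with a hypothetical UDF and column names (not taken from this PR):

import pandas as pd

# Hypothetical per-row UDF standing in for feature_view.feature_transformation.udf
# when feature_view.singleton is True.
def example_udf(row: pd.Series) -> dict:
    return {"document_id": row["document_id"], "text_length": [len(row["document_text"])]}

df = pd.DataFrame(
    {"document_id": ["doc_1", "doc_2"], "document_text": ["hello", "hi there"]}
)

# Mirror the diff: apply the UDF row by row, then unwrap single-element lists.
transformed = pd.DataFrame(df.apply(example_udf, axis=1).to_list()).applymap(
    lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x
)
print(transformed)
#   document_id  text_length
# 0       doc_1            5
# 1       doc_2            8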
@@ -354,6 +354,10 @@ def online_read(
feature_name_feast_primitive_type_map = {
f.name: f.dtype for f in table.features
}
if getattr(table, "write_to_online_store", False):
feature_name_feast_primitive_type_map.update(
{f.name: f.dtype for f in table.schema}
)
# Build a dictionary mapping composite key -> (res_ts, res)
results_dict: Dict[
str, Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]
@@ -394,6 +398,7 @@ def online_read(
"int64_val",
"float_val",
"double_val",
"string_val",
]:
setattr(
val,
@@ -420,7 +425,7 @@
setattr(val, proto_attr, field_value)
else:
raise ValueError(
f"Unsupported ValueType: {feature_feast_primitive_type} with feature view value {field_value} for feature {field} with value {field_value}"
f"Unsupported ValueType: {feature_feast_primitive_type} with feature view value {field_value} for feature {field} with value type {proto_attr}"
)
# res[field] = val
key_to_use = field.split(":", 1)[-1] if ":" in field else field
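For orientation, the online_read change above widens the set of ValueProto attributes that raw store values can be copied onto (adding "string_val") and folds the feature view schema into the type map when write_to_online_store is set. A minimal sketch of that attribute-based assignment, assuming a simplified type-to-attribute mapping rather than the store's actual logic:

from feast.protos.feast.types.Value_pb2 import Value as ValueProto

# Simplified illustration of copying a plain Python value onto the matching
# ValueProto attribute; the real store derives the attribute from the feature's
# Feast primitive type rather than isinstance checks.
def to_value_proto(field_value) -> ValueProto:
    val = ValueProto()
    if isinstance(field_value, bytes):
        val.bytes_val = field_value
    elif isinstance(field_value, str):
        val.string_val = field_value  # the attribute newly handled above
    elif isinstance(field_value, bool):
        val.bool_val = field_value
    elif isinstance(field_value, int):
        val.int64_val = field_value
    elif isinstance(field_value, float):
        val.double_val = field_value
    else:
        raise ValueError(f"Unsupported value: {field_value!r}")
    return val

print(to_value_proto("hello friends").string_val)  # -> hello friends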
3 changes: 3 additions & 0 deletions sdk/python/feast/on_demand_feature_view.py
@@ -678,6 +678,9 @@ def _construct_random_input(
) -> dict[str, Union[list[Any], Any]]:
rand_dict_value: dict[ValueType, Union[list[Any], Any]] = {
ValueType.BYTES: [str.encode("hello world")],
ValueType.PDF_BYTES: [

Collaborator: Is it necessary to have a new type, maybe just use BYTES?

Member Author: Yes, running type inference on raw bytes will fail when using a transformation expecting PDF content.

b"%PDF-1.3\n3 0 obj\n<</Type /Page\n/Parent 1 0 R\n/Resources 2 0 R\n/Contents 4 0 R>>\nendobj\n4 0 obj\n<</Filter /FlateDecode /Length 115>>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<</Type /Pages\n/Kids [3 0 R ]\n/Count 1\n/MediaBox [0 0 595.28 841.89]\n>>\nendobj\n5 0 obj\n<</Type /Font\n/BaseFont /Helvetica\n/Subtype /Type1\n/Encoding /WinAnsiEncoding\n>>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Francisco Javier Arceo)\n/CreationDate (D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n"
],
ValueType.STRING: ["hello world"],
ValueType.INT32: [1],
ValueType.INT64: [1],
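To make the reasoning in the thread above concrete, the sketch below shows how a PdfBytes request field could feed an on-demand transformation so that type inference sees PDF content rather than generic bytes. The source, view, and parsing step here are illustrative assumptions, not code from this PR; a real transformation would hand the bytes to a PDF parser such as docling.

from typing import Any

from feast import RequestSource
from feast.field import Field
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import PdfBytes, String

# Hypothetical request source carrying raw PDF bytes (names are illustrative).
pdf_request_source = RequestSource(
    name="pdf_request_source",
    schema=[
        Field(name="document_id", dtype=String),
        Field(name="pdf_bytes", dtype=PdfBytes),
    ],
)

@on_demand_feature_view(
    sources=[pdf_request_source],
    schema=[Field(name="document_text", dtype=String)],
    mode="python",
)
def pdf_text_view(inputs: dict[str, Any]) -> dict[str, Any]:
    # Placeholder "parsing": decode a slice of the bytes so the example runs.
    # Declaring the input as PdfBytes lets infer_features feed the sample PDF
    # from _construct_random_input instead of a plain BYTES payload.
    return {
        "document_text": [
            b.decode("latin-1", errors="ignore")[:20] for b in inputs["pdf_bytes"]
        ]
    }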
16 changes: 12 additions & 4 deletions sdk/python/feast/transformation/python_transformation.py
@@ -58,11 +58,19 @@ def infer_features(
f"Failed to infer type for feature '{feature_name}' with value "
+ f"'{feature_value}' since no items were returned by the UDF."
)
inferred_type = type(feature_value[0])
inferred_value = feature_value[0]
if singleton:
inferred_value = feature_value
inferred_type = None # type: ignore
if singleton and isinstance(inferred_value, list):
# If we have a nested list like [[0.5, 0.5, ...]]
if len(inferred_value) > 0:
# Get the actual element type from the inner list
inferred_type = type(inferred_value[0])
else:
raise TypeError(
f"Failed to infer type for nested feature '{feature_name}' - inner list is empty"
)
else:
# For non-nested lists or when singleton is False
inferred_type = type(inferred_value)

else:
inferred_type = type(feature_value)
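A compressed reading of the inference change above: in singleton mode the returned value may itself be the list (for example an embedding), so the element type has to come from inside that list; otherwise the UDF output is column-oriented and the first item's type is used. A simplified standalone sketch of that rule, not the actual Feast implementation:

# Simplified restatement of the singleton-aware inference rule from the diff above.
def infer_element_type(feature_name: str, feature_value, singleton: bool) -> type:
    if singleton and isinstance(feature_value, list):
        # Singleton UDFs can return the feature as a list, e.g. an embedding vector.
        if not feature_value:
            raise TypeError(
                f"Failed to infer type for nested feature '{feature_name}' - inner list is empty"
            )
        return type(feature_value[0])
    if singleton:
        return type(feature_value)
    # Non-singleton outputs are column-oriented lists of values.
    return type(feature_value[0])

assert infer_element_type("vector", [0.5, 0.5], singleton=True) is float
assert infer_element_type("chunk_text", ["a", "b"], singleton=False) is str
assert infer_element_type("document_id", "doc_1", singleton=True) is str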
6 changes: 6 additions & 0 deletions sdk/python/feast/types.py
@@ -23,6 +23,7 @@
PRIMITIVE_FEAST_TYPES_TO_VALUE_TYPES = {
"INVALID": "UNKNOWN",
"BYTES": "BYTES",
"PDF_BYTES": "PDF_BYTES",
"STRING": "STRING",
"INT32": "INT32",
"INT64": "INT64",
@@ -79,6 +80,7 @@ class PrimitiveFeastType(Enum):
FLOAT32 = 6
BOOL = 7
UNIX_TIMESTAMP = 8
PDF_BYTES = 9

def to_value_type(self) -> ValueType:
"""
@@ -102,6 +104,7 @@ def __hash__(self):

Invalid = PrimitiveFeastType.INVALID
Bytes = PrimitiveFeastType.BYTES
PdfBytes = PrimitiveFeastType.PDF_BYTES
String = PrimitiveFeastType.STRING
Bool = PrimitiveFeastType.BOOL
Int32 = PrimitiveFeastType.INT32
@@ -114,6 +117,7 @@ def __hash__(self):
Invalid,
String,
Bytes,
PdfBytes,
Bool,
Int32,
Int64,
@@ -126,6 +130,7 @@ def __hash__(self):
"INVALID": "Invalid",
"STRING": "String",
"BYTES": "Bytes",
"PDF_BYTES": "PdfBytes",
"BOOL": "Bool",
"INT32": "Int32",
"INT64": "Int64",
@@ -168,6 +173,7 @@ def __str__(self):
VALUE_TYPES_TO_FEAST_TYPES: Dict["ValueType", FeastType] = {
ValueType.UNKNOWN: Invalid,
ValueType.BYTES: Bytes,
ValueType.PDF_BYTES: PdfBytes,
ValueType.STRING: String,
ValueType.INT32: Int32,
ValueType.INT64: Int64,
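As a quick sanity check of the wiring added above, the new alias and the value-type mapping should round-trip; a minimal sketch, assuming to_value_type resolves through the updated PRIMITIVE_FEAST_TYPES_TO_VALUE_TYPES name mapping:

from feast.types import VALUE_TYPES_TO_FEAST_TYPES, PdfBytes
from feast.value_type import ValueType

# PdfBytes converts to the new ValueType member, and the reverse mapping agrees.
assert PdfBytes.to_value_type() == ValueType.PDF_BYTES
assert VALUE_TYPES_TO_FEAST_TYPES[ValueType.PDF_BYTES] == PdfBytes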
1 change: 1 addition & 0 deletions sdk/python/feast/value_type.py
@@ -48,6 +48,7 @@ class ValueType(enum.Enum):
BOOL_LIST = 17
UNIX_TIMESTAMP_LIST = 18
NULL = 19
PDF_BYTES = 20


ListType = Union[
196 changes: 195 additions & 1 deletion sdk/python/tests/unit/online_store/test_online_retrieval.py
@@ -4,6 +4,7 @@
import sqlite3
import sys
import time
from typing import Any

import numpy as np
import pandas as pd
@@ -1056,7 +1057,7 @@ def test_local_milvus() -> None:
client.drop_collection(collection_name=COLLECTION_NAME)


def test_milvus_lite_get_online_documents_v2() -> None:
def test_milvus_lite_retrieve_online_documents_v2() -> None:
"""
Test retrieving documents from the online store in local mode.
"""
@@ -1226,6 +1227,199 @@ def test_milvus_lite_get_online_documents_v2() -> None:
assert len(result["distance"]) == len(results[0])


def test_milvus_stored_writes_with_explode() -> None:
"""
Test storing and retrieving exploded document embeddings with Milvus online store.
"""
from feast import (
Entity,
RequestSource,
)
from feast.field import Field
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import (
Array,
Bytes,
Float32,
String,
ValueType,
)

random.seed(42)
vector_length = 10
runner = CliRunner()
with runner.local_repo(
example_repo_py=get_example_repo("example_rag_feature_repo.py"),
offline_store="file",
online_store="milvus",
apply=False,
teardown=False,
) as store:
# Define entities and sources
chunk = Entity(
name="chunk", join_keys=["chunk_id"], value_type=ValueType.STRING
)
document = Entity(
name="document", join_keys=["document_id"], value_type=ValueType.STRING
)

input_explode_request_source = RequestSource(
name="document_source",
schema=[
Field(name="document_id", dtype=String),
Field(name="document_text", dtype=String),
Field(name="document_bytes", dtype=Bytes),
],
)

@on_demand_feature_view(
entities=[chunk, document],
sources=[input_explode_request_source],
schema=[
Field(name="document_id", dtype=String),
Field(name="chunk_id", dtype=String),
Field(name="chunk_text", dtype=String),
Field(
name="vector",
dtype=Array(Float32),
vector_index=True,
vector_search_metric="COSINE", # Use COSINE like in Milvus test
),
],
mode="python",
write_to_online_store=True,
)
def milvus_explode_feature_view(inputs: dict[str, Any]):
output: dict[str, Any] = {
"document_id": ["doc_1", "doc_1", "doc_2", "doc_2"],
"chunk_id": ["chunk-1", "chunk-2", "chunk-1", "chunk-2"],
"chunk_text": [
"hello friends",
"how are you?",
"This is a test.",
"Document chunking example.",
],
"vector": [
[0.1] * vector_length,
[0.2] * vector_length,
[0.3] * vector_length,
[0.4] * vector_length,
],
}
return output

# Apply the feature store configuration
store.apply(
[
chunk,
document,
input_explode_request_source,
milvus_explode_feature_view,
]
)

# Verify feature view registration
odfv_applied = store.get_on_demand_feature_view("milvus_explode_feature_view")
assert odfv_applied.features[1].vector_index
assert odfv_applied.entities == [chunk.name, document.name]
assert odfv_applied.entity_columns[0].name == document.join_key
assert odfv_applied.entity_columns[1].name == chunk.join_key

# Write to online store
odfv_entity_rows_to_write = [
{
"document_id": "document_1",
"document_text": "Hello world. How are you?",
},
{
"document_id": "document_2",
"document_text": "This is a test. Document chunking example.",
},
]
store.write_to_online_store(
feature_view_name="milvus_explode_feature_view",
df=odfv_entity_rows_to_write,
)

# Verify feature retrieval
fv_entity_rows_to_read = [
{
"document_id": "doc_1",
"chunk_id": "chunk-2",
},
{
"document_id": "doc_2",
"chunk_id": "chunk-1",
},
]

online_response = store.get_online_features(
entity_rows=fv_entity_rows_to_read,
features=[
"milvus_explode_feature_view:document_id",
"milvus_explode_feature_view:chunk_id",
"milvus_explode_feature_view:chunk_text",
],
).to_dict()

assert sorted(list(online_response.keys())) == sorted(
[
"chunk_id",
"chunk_text",
"document_id",
]
)

# Test vector search using Milvus
query_embedding = [0.1] * vector_length

# First get Milvus client and search directly
client = store._provider._online_store.client
collection_name = client.list_collections()[0]
search_params = {
"metric_type": "COSINE",
"params": {"nprobe": 10},
}

direct_results = client.search(
collection_name=collection_name,
data=[query_embedding],
anns_field="vector",
search_params=search_params,
limit=2,
output_fields=["document_id", "chunk_id", "chunk_text"],
)

# Then use the Feast API
feast_results = store.retrieve_online_documents_v2(
features=[
"milvus_explode_feature_view:document_id",
"milvus_explode_feature_view:chunk_id",
"milvus_explode_feature_view:chunk_text",
],
query=query_embedding,
top_k=2,
).to_dict()

# Validate vector search results
assert "document_id" in feast_results
assert "chunk_id" in feast_results
assert "chunk_text" in feast_results
assert "distance" in feast_results
assert len(feast_results["distance"]) == 2
assert len(feast_results["document_id"]) == 2
assert (
len(direct_results[0]) == 2
) # Verify both approaches return same number of results
del feast_results["distance"]

assert feast_results == {
"document_id": ["doc_2", "doc_1"],
"chunk_id": ["chunk-1", "chunk-2"],
"chunk_text": ["This is a test.", "how are you?"],
}


def test_milvus_native_from_feast_data() -> None:
import random
from datetime import datetime