Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 93 additions & 2 deletions docs/getting-started/genai.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ The transformation workflow typically involves:
3. **Chunking**: Split documents into smaller, semantically meaningful chunks
4. **Embedding Generation**: Convert text chunks into vector embeddings
5. **Storage**: Store embeddings and metadata in Feast's feature store

### Feature Transformation for LLMs

Feast supports transformations that can be used to:
Expand All @@ -66,6 +65,99 @@ Feast supports transformations that can be used to:
* Normalize and preprocess features before serving to LLMs
* Apply custom transformations to adapt features for specific LLM requirements

## Getting Started with Feast for GenAI

### Installation

To use Feast with vector database support, install with the appropriate extras:

```bash
# For Milvus support
pip install feast[milvus,nlp]

# For Elasticsearch support
pip install feast[elasticsearch]

# For Qdrant support
pip install feast[qdrant]

# For SQLite support (Python 3.10 only)
pip install feast[sqlite_vec]
```

### Configuration

Configure your feature store to use a vector database as the online store:

```yaml
project: genai-project
provider: local
registry: data/registry.db
online_store:
  type: milvus
  path: data/online_store.db
  vector_enabled: true
  embedding_dim: 384  # Adjust based on your embedding model
  index_type: "IVF_FLAT"

offline_store:
  type: file
entity_key_serialization_version: 3
```

### Defining Vector Features

Create feature views with vector index support:

```python
from datetime import timedelta

from feast import FeatureView, Field, Entity
from feast.types import Array, Float32, String

document = Entity(
    name="document_id",
    description="Document identifier",
    join_keys=["document_id"],
)

document_embeddings = FeatureView(
    name="document_embeddings",
    entities=[document],
    schema=[
        Field(
            name="vector",
            dtype=Array(Float32),
            vector_index=True,  # Enable vector search
            vector_search_metric="COSINE",  # Similarity metric
        ),
        Field(name="document_id", dtype=String),
        Field(name="content", dtype=String),
    ],
    source=document_source,  # Your data source, defined separately
    ttl=timedelta(days=30),
)
```

### Retrieving Similar Documents

Use the `retrieve_online_documents_v2` method to find similar documents:

```python
# Generate query embedding
query = "How does Feast support vector databases?"
query_embedding = embed_text(query) # Your embedding function

# Retrieve similar documents
context_data = store.retrieve_online_documents_v2(
    features=[
        "document_embeddings:vector",
        "document_embeddings:document_id",
        "document_embeddings:content",
    ],
    query=query_embedding,
    top_k=3,
    distance_metric='COSINE',
).to_df()
```

## Use Cases

### Document Question-Answering
Expand Down Expand Up @@ -104,7 +196,6 @@ This integration enables:
- Generating embeddings for millions of text chunks
- Efficiently materializing features to vector databases
- Scaling RAG applications to enterprise-level document repositories

## Learn More

For more detailed information and examples:
Expand Down
96 changes: 96 additions & 0 deletions sdk/python/feast/document_labeling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
from typing import Any, Dict, List, Optional

from feast.feature import Feature


class DocumentLabel:
    """A label attached to a single chunk of a document.

    Instances carry ``chunk_id``, ``document_id`` and ``label`` strings, an
    optional ``confidence`` and a ``metadata`` dict (never ``None``).
    """

    def __init__(
        self,
        chunk_id: str,
        document_id: str,
        label: str,
        confidence: Optional[float] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        self.chunk_id = chunk_id
        self.document_id = document_id
        self.label = label
        self.confidence = confidence
        # Falsy metadata (``None`` or an empty dict) is replaced by a new dict
        # so that ``self.metadata`` can always be used as a mapping.
        self.metadata = metadata or {}

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"chunk_id={self.chunk_id!r}, "
            f"document_id={self.document_id!r}, "
            f"label={self.label!r}, "
            f"confidence={self.confidence!r}, "
            f"metadata={self.metadata!r})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of this label.

        ``metadata`` is shallow-copied so that editing the returned dict does
        not silently modify the label it was produced from.
        """
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "label": self.label,
            "confidence": self.confidence,
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLabel":
        """Build a label from a dict shaped like the output of ``to_dict``.

        ``chunk_id``, ``document_id`` and ``label`` are required (``KeyError``
        if absent); ``confidence`` and ``metadata`` are optional. ``metadata``
        is shallow-copied so the new label does not alias ``data``.
        """
        return cls(
            chunk_id=data["chunk_id"],
            document_id=data["document_id"],
            label=data["label"],
            confidence=data.get("confidence"),
            metadata=dict(data.get("metadata") or {}),
        )


def store_document_label(feature: Feature, label: DocumentLabel) -> None:
    """Append ``label`` to the JSON list kept under ``"document_labels"``.

    The list lives in the feature's label mapping: ``feature.labels`` when
    that attribute exists and is not ``None``, otherwise ``feature._labels``
    (created as an empty dict if it is ``None``). If the feature exposes
    neither attribute, the call is a silent no-op.

    Args:
        feature: Object carrying the label mapping.
        label: Label to append; only its ``to_dict()`` method is used.
    """
    labels_key = "document_labels"

    labels_dict = getattr(feature, "labels", None)
    if labels_dict is None:
        if not hasattr(feature, "_labels"):
            return
        # Only create the backing dict when it is missing; an existing
        # mapping must be kept so previously stored labels are not wiped.
        if feature._labels is None:
            feature._labels = {}
        labels_dict = feature._labels

    # The value is a JSON-encoded list; start from "[]" on first use.
    existing_labels = json.loads(labels_dict.get(labels_key, "[]"))
    existing_labels.append(label.to_dict())
    labels_dict[labels_key] = json.dumps(existing_labels)


def get_document_labels(feature: Feature) -> List[DocumentLabel]:
    """Return the document labels stored on ``feature``.

    The label mapping is taken from ``feature.labels`` when that is present
    and non-empty, otherwise from ``feature._labels``. An empty list is
    returned when no mapping is available or it has no ``"document_labels"``
    entry; otherwise the JSON list under that key is decoded into
    ``DocumentLabel`` objects.
    """
    store = getattr(feature, "labels", None) or getattr(feature, "_labels", None)
    if not store or "document_labels" not in store:
        return []

    raw_entries = json.loads(store["document_labels"])
    return [DocumentLabel.from_dict(entry) for entry in raw_entries]


def remove_document_label(feature: Feature, chunk_id: str, document_id: str) -> bool:
    """Delete every stored label matching both ``chunk_id`` and ``document_id``.

    The label mapping is taken from ``feature.labels`` when that is present
    and non-empty, otherwise from ``feature._labels``.

    Returns:
        ``True`` if at least one label was removed and the stored JSON list
        was rewritten; ``False`` if nothing matched or no labels are stored.
    """
    store = getattr(feature, "labels", None) or getattr(feature, "_labels", None)
    if not store or "document_labels" not in store:
        return False

    entries = json.loads(store["document_labels"])
    # Keep an entry unless BOTH identifiers match the requested pair.
    kept = [
        entry
        for entry in entries
        if entry["chunk_id"] != chunk_id or entry["document_id"] != document_id
    ]

    if len(kept) == len(entries):
        return False

    store["document_labels"] = json.dumps(kept)
    return True
19 changes: 19 additions & 0 deletions sdk/python/feast/feature_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ class ChatRequest(BaseModel):
messages: List[ChatMessage]


# Request body for the POST /read-document endpoint.
class ReadDocumentRequest(BaseModel):
    # Path of the file whose text content should be returned; it is passed
    # to open() by the endpoint handler.
    file_path: str


def _get_features(request: GetOnlineFeaturesRequest, store: "feast.FeatureStore"):
if request.feature_service:
feature_service = store.get_feature_service(
Expand Down Expand Up @@ -356,6 +360,21 @@ async def chat(request: ChatRequest):
# For now, just return dummy text
return {"response": "This is a dummy response from the Feast feature server."}

@app.post("/read-document")
async def read_document_endpoint(request: ReadDocumentRequest):
    # Returns {"content", "file_path"} on success and {"error": ...} on any
    # failure, so the response shape seen by clients is unchanged.
    #
    # SECURITY(review): ``file_path`` comes straight from the request body and
    # is opened as-is; no path restriction or access check is visible in this
    # handler. Unless something outside this handler constrains it, a client
    # can read any file the server process can read. Consider resolving the
    # path against an allow-listed base directory before opening it.
    try:
        # Open directly instead of checking os.path.exists() first: the
        # check-then-open sequence can race with the file being removed.
        with open(request.file_path, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        return {"error": f"File not found: {request.file_path}"}
    except Exception as e:
        # Endpoint boundary: report the failure to the client instead of
        # letting it surface as an unhandled server error.
        return {"error": str(e)}

    return {"content": content, "file_path": request.file_path}

@app.get("/chat")
async def chat_ui():
# Serve the chat UI
Expand Down
5 changes: 5 additions & 0 deletions ui/src/FeastUISansProviders.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import FeatureServiceInstance from "./pages/feature-services/FeatureServiceInsta
import DataSourceInstance from "./pages/data-sources/DataSourceInstance";
import RootProjectSelectionPage from "./pages/RootProjectSelectionPage";
import DatasetInstance from "./pages/saved-data-sets/DatasetInstance";
import DocumentLabelingPage from "./pages/document-labeling/DocumentLabelingPage";
import PermissionsIndex from "./pages/permissions/Index";
import LineageIndex from "./pages/lineage/Index";
import NoProjectGuard from "./components/NoProjectGuard";
Expand Down Expand Up @@ -145,6 +146,10 @@ const FeastUISansProvidersInner = ({
path="data-set/:datasetName/*"
element={<DatasetInstance />}
/>
<Route
path="document-labeling/"
element={<DocumentLabelingPage />}
/>
<Route path="permissions/" element={<PermissionsIndex />} />
<Route path="lineage/" element={<LineageIndex />} />
</Route>
Expand Down
1 change: 1 addition & 0 deletions ui/src/custom-tabs/TabsRegistryContext.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ export {
useDataSourceCustomTabs,
useEntityCustomTabs,
useDatasetCustomTabs,

// Routes
useRegularFeatureViewCustomTabRoutes,
useOnDemandFeatureViewCustomTabRoutes,
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
16 changes: 16 additions & 0 deletions ui/src/custom-tabs/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,20 @@ interface DatasetCustomTabRegistrationInterface
}: DatasetCustomTabProps) => JSX.Element;
}

// Type for Document Labeling Custom Tabs
/**
 * Props handed to a Document Labeling custom tab component.
 *
 * NOTE(review): `feastObjectQuery` reuses the regular feature view query
 * return type; confirm that this is the intended data shape for this tab.
 */
interface DocumentLabelingCustomTabProps {
  id: string | undefined;
  feastObjectQuery: RegularFeatureViewQueryReturnType;
}
/**
 * Registration entry for a Document Labeling custom tab: the base custom tab
 * registration plus the component that renders the tab from
 * `DocumentLabelingCustomTabProps`.
 */
interface DocumentLabelingCustomTabRegistrationInterface
  extends CustomTabRegistrationInterface {
  Component: ({
    id,
    feastObjectQuery,
    ...args
  }: DocumentLabelingCustomTabProps) => JSX.Element;
}

export type {
CustomTabRegistrationInterface,
RegularFeatureViewQueryReturnType,
Expand All @@ -157,4 +171,6 @@ export type {
FeatureCustomTabProps,
DatasetCustomTabRegistrationInterface,
DatasetCustomTabProps,
DocumentLabelingCustomTabRegistrationInterface,
DocumentLabelingCustomTabProps,
};
Empty file.
9 changes: 9 additions & 0 deletions ui/src/pages/Sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,15 @@ const SideNav = () => {
renderItem: (props) => <Link {...props} to={`${baseUrl}/data-set`} />,
isSelected: useMatchSubpath(`${baseUrl}/data-set`),
},
{
name: "Document Labeling",
id: htmlIdGenerator("documentLabeling")(),
icon: <EuiIcon type="documentEdit" />,
renderItem: (props) => (
<Link {...props} to={`${baseUrl}/document-labeling`} />
),
isSelected: useMatchSubpath(`${baseUrl}/document-labeling`),
},
{
name: "Permissions",
id: htmlIdGenerator("permissions")(),
Expand Down
Loading