feast-dev · franciscojavierarceo · Jun 2, 2025 · May 24, 2025 · May 24, 2025 · May 24, 2025
@@ -56,7 +56,6 @@ The transformation workflow typically involves:
 3. **Chunking**: Split documents into smaller, semantically meaningful chunks
 4. **Embedding Generation**: Convert text chunks into vector embeddings
 5. **Storage**: Store embeddings and metadata in Feast's feature store
-
 ### Feature Transformation for LLMs
 
 Feast supports transformations that can be used to:
@@ -66,6 +65,99 @@ Feast supports transformations that can be used to:
 * Normalize and preprocess features before serving to LLMs
 * Apply custom transformations to adapt features for specific LLM requirements
 
+## Getting Started with Feast for GenAI
+
+### Installation
+
+To use Feast with vector database support, install with the appropriate extras:
+
+```bash
+# For Milvus support
+pip install feast[milvus,nlp]
+
+# For Elasticsearch support
+pip install feast[elasticsearch]
+
+# For Qdrant support
+pip install feast[qdrant]
+
+# For SQLite support (Python 3.10 only)
+pip install feast[sqlite_vec]
+```
+
+### Configuration
+
+Configure your feature store to use a vector database as the online store:
+
+```yaml
+project: genai-project
+provider: local
+registry: data/registry.db
+online_store:
+  type: milvus
+  path: data/online_store.db
+  vector_enabled: true
+  embedding_dim: 384  # Adjust based on your embedding model
+  index_type: "IVF_FLAT"
+
+offline_store:
+  type: file
+entity_key_serialization_version: 3
+```
+
+### Defining Vector Features
+
+Create feature views with vector index support:
+
+```python
+from datetime import timedelta
+
+from feast import FeatureView, Field, Entity
+from feast.types import Array, Float32, String
+
+document = Entity(
+    name="document_id",
+    description="Document identifier",
+    join_keys=["document_id"],
+)
+
+document_embeddings = FeatureView(
+    name="document_embeddings",
+    entities=[document],
+    schema=[
+        Field(
+            name="vector",
+            dtype=Array(Float32),
+            vector_index=True,  # Enable vector search
+            vector_search_metric="COSINE",  # Similarity metric
+        ),
+        Field(name="document_id", dtype=String),
+        Field(name="content", dtype=String),
+    ],
+    source=document_source,  # A data source you have defined elsewhere (e.g., a FileSource)
+    ttl=timedelta(days=30),
+)
+```
+
+### Retrieving Similar Documents
+
+Use the `retrieve_online_documents_v2` method to find similar documents:
+
+```python
+# Generate query embedding
+query = "How does Feast support vector databases?"
+query_embedding = embed_text(query)  # Your embedding function
+
+# Retrieve similar documents (`store` is your initialized FeatureStore)
+context_data = store.retrieve_online_documents_v2(
+    features=[
+        "document_embeddings:vector",
+        "document_embeddings:document_id",
+        "document_embeddings:content",
+    ],
+    query=query_embedding,
+    top_k=3,
+    distance_metric='COSINE',
+).to_df()
+```
 ## Use Cases
 
 ### Document Question-Answering
@@ -104,7 +196,6 @@ This integration enables:
 - Generating embeddings for millions of text chunks
 - Efficiently materializing features to vector databases
 - Scaling RAG applications to enterprise-level document repositories
-
 ## Learn More
 
 For more detailed information and examples:

@@ -0,0 +1,96 @@
+import json
+from typing import Any, Dict, List, Optional
+
+from feast.feature import Feature
+
+
+class DocumentLabel:
+    """A label attached to one chunk of a document.
+
+    Attributes mirror the constructor arguments. ``metadata`` falls back to a
+    new empty dict when the argument is ``None`` or empty.
+    """
+
+    def __init__(
+        self,
+        chunk_id: str,
+        document_id: str,
+        label: str,
+        confidence: Optional[float] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        self.chunk_id, self.document_id, self.label = chunk_id, document_id, label
+        self.confidence = confidence
+        self.metadata = metadata if metadata else {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the label's fields as a plain dictionary."""
+        field_names = ("chunk_id", "document_id", "label", "confidence", "metadata")
+        return {field: getattr(self, field) for field in field_names}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLabel":
+        """Build a label from a dictionary such as the one ``to_dict`` produces.
+
+        ``chunk_id``, ``document_id`` and ``label`` are required keys (a missing
+        one raises ``KeyError``); ``confidence`` and ``metadata`` are optional.
+        """
+        chunk_id, document_id, label = (
+            data[key] for key in ("chunk_id", "document_id", "label")
+        )
+        return cls(
+            chunk_id=chunk_id,
+            document_id=document_id,
+            label=label,
+            confidence=data.get("confidence"),
+            metadata=data.get("metadata", {}),
+        )
+
+
+def store_document_label(feature: Feature, label: DocumentLabel) -> None:
+    """Append ``label`` to the document labels stored on ``feature``.
+
+    Labels are kept as a JSON-encoded list under the ``"document_labels"`` key
+    of the feature's labels mapping. The mapping is taken from ``feature.labels``
+    when that is not ``None``, otherwise from ``feature._labels`` (initialised to
+    an empty dict only if it is ``None``). If the feature exposes neither
+    attribute the call is a no-op.
+
+    Args:
+        feature: Object whose labels mapping receives the document label.
+        label: The label to store; its ``to_dict()`` output is serialised.
+    """
+    labels_dict = getattr(feature, "labels", None)
+    if labels_dict is None:
+        if not hasattr(feature, "_labels"):
+            # Nowhere to store the label.
+            return
+        # Only create the backing dict when it is missing; never discard
+        # entries that are already stored there.
+        if feature._labels is None:
+            feature._labels = {}
+        # Prefer the public attribute when it now reflects the backing dict,
+        # otherwise write to the backing dict directly.
+        labels_dict = getattr(feature, "labels", None)
+        if labels_dict is None:
+            labels_dict = feature._labels
+
+    labels_key = "document_labels"
+    existing_labels = json.loads(labels_dict.get(labels_key, "[]"))
+    existing_labels.append(label.to_dict())
+    labels_dict[labels_key] = json.dumps(existing_labels)
+
+
+def get_document_labels(feature: Feature) -> List[DocumentLabel]:
+    """Return the document labels stored on ``feature``.
+
+    Reads the JSON list under ``"document_labels"`` from ``feature.labels`` or,
+    when that is missing or empty, from ``feature._labels``. Returns an empty
+    list when no labels mapping or no such key is found.
+    """
+    container = getattr(feature, "labels", None)
+    if not container:
+        # Fall back to the private attribute when the public one is absent/empty.
+        container = getattr(feature, "_labels", None)
+
+    if not container or "document_labels" not in container:
+        return []
+
+    return [
+        DocumentLabel.from_dict(entry)
+        for entry in json.loads(container["document_labels"])
+    ]
+
+
+def remove_document_label(feature: Feature, chunk_id: str, document_id: str) -> bool:
+    """Remove every stored label matching both ``chunk_id`` and ``document_id``.
+
+    The labels mapping is read from ``feature.labels`` or, when that is missing
+    or empty, from ``feature._labels``.
+
+    Returns:
+        ``True`` if at least one label was removed and the stored JSON was
+        rewritten, ``False`` otherwise.
+    """
+    container = getattr(feature, "labels", None) or getattr(feature, "_labels", None)
+    if not container or "document_labels" not in container:
+        return False
+
+    stored = json.loads(container["document_labels"])
+    kept = [
+        entry
+        for entry in stored
+        if entry["chunk_id"] != chunk_id or entry["document_id"] != document_id
+    ]
+
+    # Nothing matched: leave the stored value untouched.
+    if len(kept) >= len(stored):
+        return False
+
+    container["document_labels"] = json.dumps(kept)
+    return True
@@ -101,6 +101,10 @@ class ChatRequest(BaseModel):
     messages: List[ChatMessage]
 
 
+class ReadDocumentRequest(BaseModel):
+    """Request body for the ``/read-document`` endpoint."""
+
+    # Path of the file whose contents should be returned.
+    file_path: str
+
+
 def _get_features(request: GetOnlineFeaturesRequest, store: "feast.FeatureStore"):
     if request.feature_service:
         feature_service = store.get_feature_service(
@@ -356,6 +360,21 @@ async def chat(request: ChatRequest):
         # For now, just return dummy text
         return {"response": "This is a dummy response from the Feast feature server."}
 
+    @app.post("/read-document")
+    async def read_document_endpoint(request: ReadDocumentRequest):
+        """Return the text content of the file at ``request.file_path``.
+
+        On success the response is ``{"content": ..., "file_path": ...}``. If the
+        path does not exist, or any exception occurs while reading, the response
+        is ``{"error": ...}`` instead; errors are returned in the body rather
+        than raised.
+
+        NOTE(review): the client-supplied path is opened as-is and nothing in
+        this handler restricts it to a particular directory. Confirm that access
+        is limited elsewhere before exposing this endpoint.
+        """
+        try:
+            import os
+
+            if not os.path.exists(request.file_path):
+                return {"error": f"File not found: {request.file_path}"}
+
+            # The file is decoded as UTF-8 text; decoding failures fall through
+            # to the generic handler below.
+            with open(request.file_path, "r", encoding="utf-8") as file:
+                content = file.read()
+
+            return {"content": content, "file_path": request.file_path}
+        except Exception as e:
+            # Any failure is reported to the caller as the exception's message.
+            return {"error": str(e)}
+
     @app.get("/chat")
     async def chat_ui():
         # Serve the chat UI

@@ -22,6 +22,7 @@ import FeatureServiceInstance from "./pages/feature-services/FeatureServiceInsta
 import DataSourceInstance from "./pages/data-sources/DataSourceInstance";
 import RootProjectSelectionPage from "./pages/RootProjectSelectionPage";
 import DatasetInstance from "./pages/saved-data-sets/DatasetInstance";
+import DocumentLabelingPage from "./pages/document-labeling/DocumentLabelingPage";
 import PermissionsIndex from "./pages/permissions/Index";
 import LineageIndex from "./pages/lineage/Index";
 import NoProjectGuard from "./components/NoProjectGuard";
@@ -145,6 +146,10 @@ const FeastUISansProvidersInner = ({
                       path="data-set/:datasetName/*"
                       element={<DatasetInstance />}
                     />
+                    <Route
+                      path="document-labeling/"
+                      element={<DocumentLabelingPage />}
+                    />
                     <Route path="permissions/" element={<PermissionsIndex />} />
                     <Route path="lineage/" element={<LineageIndex />} />
                   </Route>

@@ -289,6 +289,7 @@ export {
   useDataSourceCustomTabs,
   useEntityCustomTabs,
   useDatasetCustomTabs,
+
   // Routes
   useRegularFeatureViewCustomTabRoutes,
   useOnDemandFeatureViewCustomTabRoutes,

@@ -136,6 +136,20 @@ interface DatasetCustomTabRegistrationInterface
   }: DatasetCustomTabProps) => JSX.Element;
 }
 
+// Type for Document Labeling Custom Tabs
+/**
+ * Props received by the `Component` of a Document Labeling custom tab.
+ */
+interface DocumentLabelingCustomTabProps {
+  /** Identifier for the tab's subject; may be `undefined`. */
+  id: string | undefined;
+  /** Query result, typed the same as for regular feature view tabs. */
+  feastObjectQuery: RegularFeatureViewQueryReturnType;
+}
+/**
+ * Registration entry for a Document Labeling custom tab: the fields of
+ * `CustomTabRegistrationInterface` plus a `Component` that takes
+ * `DocumentLabelingCustomTabProps` and returns a JSX element.
+ */
+interface DocumentLabelingCustomTabRegistrationInterface
+  extends CustomTabRegistrationInterface {
+  Component: ({
+    id,
+    feastObjectQuery,
+    ...args
+  }: DocumentLabelingCustomTabProps) => JSX.Element;
+}
+
 export type {
   CustomTabRegistrationInterface,
   RegularFeatureViewQueryReturnType,
@@ -157,4 +171,6 @@ export type {
   FeatureCustomTabProps,
   DatasetCustomTabRegistrationInterface,
   DatasetCustomTabProps,
+  DocumentLabelingCustomTabRegistrationInterface,
+  DocumentLabelingCustomTabProps,
 };
@@ -131,6 +131,15 @@ const SideNav = () => {
           renderItem: (props) => <Link {...props} to={`${baseUrl}/data-set`} />,
           isSelected: useMatchSubpath(`${baseUrl}/data-set`),
         },
+        {
+          name: "Document Labeling",
+          id: htmlIdGenerator("documentLabeling")(),
+          icon: <EuiIcon type="documentEdit" />,
+          renderItem: (props) => (
+            <Link {...props} to={`${baseUrl}/document-labeling`} />
+          ),
+          isSelected: useMatchSubpath(`${baseUrl}/document-labeling`),
+        },
         {
           name: "Permissions",
           id: htmlIdGenerator("permissions")(),