19 changes: 19 additions & 0 deletions sdk/python/feast/feature_server.py
@@ -631,6 +631,7 @@ def _add_mcp_support_if_enabled(app, store: "feast.FeatureStore"):

class FeastServeApplication(gunicorn.app.base.BaseApplication):
def __init__(self, store: "feast.FeatureStore", **options):
self._store = store
self._app = get_app(
store=store,
registry_ttl_sec=options["registry_ttl_sec"],
@@ -645,6 +646,24 @@ def load_config(self):

self.cfg.set("worker_class", "uvicorn_worker.UvicornWorker")

# Register post_fork hook for fork-safety with SQL Registry
# This ensures each worker reinitializes database connections
# and background threads after forking
self.cfg.set("post_fork", self._post_fork_hook)

def _post_fork_hook(self, server, worker):
"""
Gunicorn post_fork hook called in each worker after fork.

This is critical for fork-safety when using SQL Registry backends.
SQLAlchemy connection pools and threading.Timer objects are not
fork-safe and must be reinitialized in each worker process.
"""
logger.debug(f"Worker {worker.pid} initializing after fork")
if hasattr(self._store, "registry") and self._store.registry is not None:
self._store.registry.on_worker_init()
logger.debug(f"Worker {worker.pid} registry reinitialized")

def load(self):
return self._app

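For context, the same hook can be expressed in a standalone gunicorn config file: gunicorn looks up a module-level `post_fork(server, worker)` function in `gunicorn.conf.py` and calls it in every worker right after forking. A minimal sketch, not part of this change, assuming a `FeatureStore` built from the current repo:

```python
# gunicorn.conf.py -- illustrative sketch only, not part of this PR
from feast import FeatureStore

store = FeatureStore(repo_path=".")  # assumed repo layout


def post_fork(server, worker):
    # Runs in each worker immediately after fork; reinitialize fork-unsafe
    # registry resources (connection pools, refresh timers) per worker.
    registry = getattr(store, "registry", None)
    if registry is not None:
        registry.on_worker_init()
    worker.log.info("worker %s: registry reinitialized after fork", worker.pid)
```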
162 changes: 160 additions & 2 deletions sdk/python/feast/infra/offline_stores/duckdb.py
@@ -1,7 +1,8 @@
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, List, Literal, Optional, Union

import ibis
import pandas as pd
@@ -26,8 +27,96 @@
from feast.infra.registry.base_registry import BaseRegistry
from feast.repo_config import FeastConfigBaseModel, RepoConfig

logger = logging.getLogger(__name__)

# Track whether S3 has been configured for the current DuckDB connection
_s3_configured = False


def _configure_duckdb_for_s3(config: "DuckDBOfflineStoreConfig") -> None:
"""
Configure the DuckDB connection for S3 access.

This function configures DuckDB's HTTPFS extension with S3 settings.
    It is safe to call repeatedly: S3 settings are applied only once per
    process, tracked via the module-level _s3_configured flag.

Args:
config: DuckDB offline store configuration containing S3 settings.
"""
global _s3_configured

# Check if any S3 settings are configured
has_s3_settings = any(
[
config.s3_url_style,
config.s3_endpoint,
config.s3_access_key_id,
config.s3_secret_access_key,
config.s3_region,
config.s3_use_ssl is not None,
]
)

if not has_s3_settings:
return

if _s3_configured:
return

try:
# Get the default DuckDB connection from ibis
con = ibis.get_backend()

# Install and load httpfs extension for S3 support
con.raw_sql("INSTALL httpfs;")
con.raw_sql("LOAD httpfs;")

# Configure S3 settings
if config.s3_url_style:
con.raw_sql(f"SET s3_url_style='{config.s3_url_style}';")
logger.debug(f"DuckDB S3 url_style set to '{config.s3_url_style}'")

if config.s3_endpoint:
con.raw_sql(f"SET s3_endpoint='{config.s3_endpoint}';")
logger.debug(f"DuckDB S3 endpoint set to '{config.s3_endpoint}'")

if config.s3_access_key_id:
con.raw_sql(f"SET s3_access_key_id='{config.s3_access_key_id}';")
logger.debug("DuckDB S3 access_key_id configured")

if config.s3_secret_access_key:
con.raw_sql(f"SET s3_secret_access_key='{config.s3_secret_access_key}';")
logger.debug("DuckDB S3 secret_access_key configured")

if config.s3_region:
con.raw_sql(f"SET s3_region='{config.s3_region}';")
logger.debug(f"DuckDB S3 region set to '{config.s3_region}'")

if config.s3_use_ssl is not None:
ssl_value = "true" if config.s3_use_ssl else "false"
con.raw_sql(f"SET s3_use_ssl={ssl_value};")
logger.debug(f"DuckDB S3 use_ssl set to {ssl_value}")

_s3_configured = True
logger.info("DuckDB S3 configuration completed successfully")

except Exception as e:
logger.warning(f"Failed to configure DuckDB for S3: {e}")
# Don't raise - let the operation continue and potentially fail with a more specific error


def _is_s3_path(path: str) -> bool:
"""Check if the given path is an S3 path."""
return path.startswith("s3://") or path.startswith("s3a://")


def _read_data_source(data_source: DataSource, repo_path: str) -> Table:
"""
Read data from a FileSource into an ibis Table.

    Note: S3 access must be configured before this function is called; the
    DuckDBOfflineStore methods do so via _configure_duckdb_for_s3().
"""
assert isinstance(data_source, FileSource)

if isinstance(data_source.file_format, ParquetFormat):
@@ -113,12 +202,66 @@ def _write_data_source(


class DuckDBOfflineStoreConfig(FeastConfigBaseModel):
"""Configuration for DuckDB offline store.

Attributes:
type: Offline store type selector. Must be "duckdb".
staging_location: Optional S3 path for staging data during remote exports.
staging_location_endpoint_override: Custom S3 endpoint for staging location.
s3_url_style: S3 URL style - "path" for path-style URLs (required for
MinIO, LocalStack, etc.) or "vhost" for virtual-hosted style.
            Default is None, which uses DuckDB's default (vhost).
s3_endpoint: Custom S3 endpoint URL (e.g., "localhost:9000" for MinIO).
s3_access_key_id: AWS access key ID for S3 authentication.
If not set, uses AWS credential chain.
s3_secret_access_key: AWS secret access key for S3 authentication.
If not set, uses AWS credential chain.
s3_region: AWS region for S3 access (e.g., "us-east-1").
Required for some S3-compatible providers.
s3_use_ssl: Whether to use SSL for S3 connections.
            Default is None, which uses DuckDB's default (true).

Example:
For MinIO or other S3-compatible storage that requires path-style URLs:

.. code-block:: yaml

offline_store:
type: duckdb
s3_url_style: path
s3_endpoint: localhost:9000
s3_access_key_id: minioadmin
s3_secret_access_key: minioadmin
s3_region: us-east-1
s3_use_ssl: false
"""

type: StrictStr = "duckdb"
# """ Offline store type selector"""

staging_location: Optional[str] = None
"""S3 path for staging data during remote exports."""

staging_location_endpoint_override: Optional[str] = None
"""Custom S3 endpoint for staging location."""

# S3 configuration options for DuckDB's HTTPFS extension
s3_url_style: Optional[Literal["path", "vhost"]] = None
"""S3 URL style - 'path' for path-style or 'vhost' for virtual-hosted style."""

s3_endpoint: Optional[str] = None
"""Custom S3 endpoint URL (e.g., 'localhost:9000' for MinIO)."""

s3_access_key_id: Optional[str] = None
"""AWS access key ID. If not set, uses AWS credential chain."""

s3_secret_access_key: Optional[str] = None
"""AWS secret access key. If not set, uses AWS credential chain."""

s3_region: Optional[str] = None
"""AWS region (e.g., 'us-east-1'). Required for some S3-compatible providers."""

s3_use_ssl: Optional[bool] = None
"""Whether to use SSL for S3 connections. Default uses DuckDB's default (true)."""


class DuckDBOfflineStore(OfflineStore):
@@ -133,6 +276,9 @@ def pull_latest_from_table_or_query(
start_date: datetime,
end_date: datetime,
) -> RetrievalJob:
# Configure S3 settings for DuckDB before reading data
_configure_duckdb_for_s3(config.offline_store)

return pull_latest_from_table_or_query_ibis(
config=config,
data_source=data_source,
@@ -158,6 +304,9 @@ def get_historical_features(
project: str,
full_feature_names: bool = False,
) -> RetrievalJob:
# Configure S3 settings for DuckDB before reading data
_configure_duckdb_for_s3(config.offline_store)

return get_historical_features_ibis(
config=config,
feature_views=feature_views,
@@ -183,6 +332,9 @@ def pull_all_from_table_or_query(
start_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
) -> RetrievalJob:
# Configure S3 settings for DuckDB before reading data
_configure_duckdb_for_s3(config.offline_store)

return pull_all_from_table_or_query_ibis(
config=config,
data_source=data_source,
@@ -205,6 +357,9 @@ def offline_write_batch(
table: pyarrow.Table,
progress: Optional[Callable[[int], Any]],
):
# Configure S3 settings for DuckDB before writing data
_configure_duckdb_for_s3(config.offline_store)

offline_write_batch_ibis(
config=config,
feature_view=feature_view,
@@ -221,6 +376,9 @@ def write_logged_features(
logging_config: LoggingConfig,
registry: BaseRegistry,
):
# Configure S3 settings for DuckDB before writing data
_configure_duckdb_for_s3(config.offline_store)

write_logged_features_ibis(
config=config,
data=data,
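As a quick way to sanity-check the settings above outside Feast, the statements issued by `_configure_duckdb_for_s3()` can be reproduced in a plain DuckDB session. A minimal sketch against the MinIO example from the config docstring; the bucket and object names are placeholders:

```python
import duckdb

con = duckdb.connect()
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")
con.execute("SET s3_url_style='path';")            # MinIO needs path-style URLs
con.execute("SET s3_endpoint='localhost:9000';")
con.execute("SET s3_access_key_id='minioadmin';")
con.execute("SET s3_secret_access_key='minioadmin';")
con.execute("SET s3_region='us-east-1';")
con.execute("SET s3_use_ssl=false;")

# With HTTPFS configured, S3 objects can be scanned directly.
df = con.execute(
    "SELECT * FROM read_parquet('s3://my-bucket/driver_stats.parquet')"  # placeholder
).df()
```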
17 changes: 17 additions & 0 deletions sdk/python/feast/infra/registry/base_registry.py
@@ -805,6 +805,23 @@ def refresh(self, project: Optional[str] = None):
"""Refreshes the state of the registry cache by fetching the registry state from the remote registry store."""
raise NotImplementedError

def on_worker_init(self):
"""
Called after a worker process has been forked to reinitialize resources.

This method is critical for fork-safety when using multi-worker servers
like Gunicorn. Resources like database connection pools, threads, and
file handles are not fork-safe and must be reinitialized in child processes.

Subclasses should override this method to:
- Dispose and recreate database connection pools
- Stop and restart background threads
- Reinitialize any other fork-unsafe resources

This is a no-op by default for registries that don't need special handling.
"""
pass

# Lineage operations
def get_registry_lineage(
self,
26 changes: 26 additions & 0 deletions sdk/python/feast/infra/registry/caching_registry.py
@@ -487,3 +487,29 @@ def _start_thread_async_refresh(self, cache_ttl_seconds):

def _exit_handler(self):
self.registry_refresh_thread.cancel()

def on_worker_init(self):
"""
Called after a worker process has been forked to reinitialize resources.

For CachingRegistry, this method:
1. Cancels any inherited refresh thread from the parent process
2. Restarts the refresh thread if cache_mode is "thread"

This ensures each worker has its own independent refresh thread.
"""
# Cancel any inherited timer from parent process
if hasattr(self, "registry_refresh_thread") and self.registry_refresh_thread:
try:
self.registry_refresh_thread.cancel()
except Exception:
pass # Timer may already be invalid after fork

# Restart refresh thread if using thread cache mode
if self.cache_mode == "thread":
cache_ttl_seconds = int(self.cached_registry_proto_ttl.total_seconds())
if cache_ttl_seconds > 0:
self._start_thread_async_refresh(cache_ttl_seconds)
logger.debug(
"CachingRegistry refresh thread restarted after fork"
)
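
The cancel-then-restart dance is needed because threads do not survive `fork()`: the child inherits a copy of the `threading.Timer` object, but not its running thread, so the refresh callback would never fire again. A standalone sketch of the same failure and fix, POSIX-only and independent of Feast:

```python
import os
import threading
import time


def refresh():
    print(f"refresh ran in pid {os.getpid()}")


timer = threading.Timer(1.0, refresh)
timer.start()

pid = os.fork()
if pid == 0:
    # Child: the copied Timer's thread is not running here, so refresh()
    # would never fire. Discard the stale object and start a fresh timer
    # owned by this process.
    timer.cancel()
    timer = threading.Timer(1.0, refresh)
    timer.start()
    time.sleep(2)
    os._exit(0)
else:
    time.sleep(2)   # the parent's original timer fires here
    os.waitpid(pid, 0)
```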
68 changes: 57 additions & 11 deletions sdk/python/feast/infra/registry/sql.py
@@ -250,17 +250,7 @@ def __init__(
), "SqlRegistry needs a valid registry_config"

self.registry_config = registry_config

self.write_engine: Engine = create_engine(
registry_config.path, **registry_config.sqlalchemy_config_kwargs
)
if registry_config.read_path:
self.read_engine: Engine = create_engine(
registry_config.read_path,
**registry_config.sqlalchemy_config_kwargs,
)
else:
self.read_engine = self.write_engine
self._init_engines()
metadata.create_all(self.write_engine)
self.thread_pool_executor_worker_count = (
registry_config.thread_pool_executor_worker_count
@@ -278,6 +268,62 @@ def __init__(
if not self.purge_feast_metadata:
self._maybe_init_project_metadata(project)

def _init_engines(self):
"""Initialize SQLAlchemy engines. Can be called to reinitialize after fork."""
self.write_engine: Engine = create_engine(
self.registry_config.path, **self.registry_config.sqlalchemy_config_kwargs
)
if self.registry_config.read_path:
self.read_engine: Engine = create_engine(
self.registry_config.read_path,
**self.registry_config.sqlalchemy_config_kwargs,
)
else:
self.read_engine = self.write_engine

def reinitialize_engines(self):
"""
Reinitialize SQLAlchemy engines after a process fork.

This method is critical for fork-safety when using multi-worker servers
like Gunicorn. SQLAlchemy's connection pools are not fork-safe - the
internal state (locks, conditions, connections) becomes corrupted when
a process forks. This method disposes the old engines and creates fresh
ones with new connection pools.

Should be called in a post_fork hook when running with multiple workers.
"""
# Dispose existing engines to clean up connection pools
if hasattr(self, "write_engine") and self.write_engine is not None:
self.write_engine.dispose()
if (
hasattr(self, "read_engine")
and self.read_engine is not None
and self.read_engine is not self.write_engine
):
self.read_engine.dispose()

# Reinitialize with fresh engines
self._init_engines()
logger.debug("SqlRegistry engines reinitialized after fork")

def on_worker_init(self):
"""
Called after a worker process has been forked to reinitialize resources.

For SqlRegistry, this method:
1. Reinitializes SQLAlchemy engines (disposes old pools, creates new ones)
2. Calls parent class to handle refresh thread restart

This is critical for fork-safety when using multi-worker servers like
Gunicorn with SQL Registry backends.
"""
# First reinitialize the database engines
self.reinitialize_engines()

# Then call parent to handle refresh thread
super().on_worker_init()

def _sync_feast_metadata_to_projects_table(self):
feast_metadata_projects: dict = {}
        projects_set: set = set()
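
Disposing and recreating the engines mirrors SQLAlchemy's documented guidance for forked processes: the child must not reuse pooled connections copied from the parent, because both processes would otherwise talk over the same sockets. A minimal sketch outside Feast, with a placeholder SQLite URL (the shared-socket problem really shows up with networked databases):

```python
import os

from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///example.db")  # placeholder URL

with engine.connect() as conn:  # parent checks out a connection, populating the pool
    conn.execute(text("SELECT 1"))

pid = os.fork()
if pid == 0:
    # Child: drop the pooled connections copied from the parent. close=False
    # (SQLAlchemy 1.4.33+) leaves the parent's sockets untouched and simply
    # starts this process with an empty pool.
    engine.dispose(close=False)
    with engine.connect() as conn:
        conn.execute(text("SELECT 1"))
    os._exit(0)
else:
    os.waitpid(pid, 0)
```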