Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ The Python SDK documentation is available at: https://docs.feldera.com/python
uv pip install feldera
```

For Arrow IPC query support, install the optional Arrow extra:

```bash
uv pip install 'feldera[arrow]'
```

### Example usage

The Python client interacts with the API server of the Feldera instance.
Expand Down
24 changes: 23 additions & 1 deletion python/feldera/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import deque
from datetime import datetime
from threading import Event
from typing import Any, Callable, Dict, Generator, List, Mapping, Optional
from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Mapping, Optional
from uuid import UUID

import pandas
Expand Down Expand Up @@ -36,6 +36,9 @@
from feldera.stats import InputEndpointStatus, OutputEndpointStatus, PipelineStatistics
from feldera.types import CheckpointMetadata

if TYPE_CHECKING:
import pyarrow as pa


class Pipeline:
def __init__(self, client: FelderaClient):
Expand Down Expand Up @@ -975,6 +978,25 @@ def query_parquet(self, query: str, path: str):

self.client.query_as_parquet(self.name, query, path)

def query_arrow(self, query: str) -> Generator["pa.RecordBatch", None, None]:
    """
    Run an ad-hoc SQL query against this pipeline, streaming the result
    back as PyArrow RecordBatches.

    Note:
        Only materialized tables and views can be ``SELECT``-ed from.

    :param query: The SQL query to execute.

    :raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
        state.
    :raises FelderaAPIError: If a non materialized table or view is queried.
    :raises FelderaAPIError: If the query itself is invalid.

    :return: A generator that yields ``pyarrow.RecordBatch`` objects.
    """
    # Pure delegation: the client owns the HTTP/IPC streaming machinery.
    batches = self.client.query_as_arrow(self.name, query)
    return batches

def query_tabular(self, query: str) -> Generator[str, None, None]:
"""
Executes a SQL query on this pipeline and returns the result as a
Expand Down
47 changes: 46 additions & 1 deletion python/feldera/rest/feldera_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import time
from decimal import Decimal
from typing import Any, Dict, Generator, Mapping, Optional
from typing import TYPE_CHECKING, Any, Dict, Generator, Mapping, Optional
from urllib.parse import quote

import requests
Expand All @@ -18,6 +18,9 @@

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
import pyarrow as pa


def _validate_no_none_keys_in_map(data):
def validate_no_none_keys(d: Dict[Any, Any]) -> None:
Expand All @@ -38,6 +41,17 @@ def _prepare_boolean_input(value: bool) -> str:
return "true" if value else "false"


def _import_pyarrow_ipc():
try:
import pyarrow.ipc as ipc
except ImportError as exc:
raise ImportError(
"pyarrow is required for Arrow IPC queries. Install it with `pip install feldera[arrow]`."
) from exc

return ipc


class FelderaClient:
"""
A client for the Feldera HTTP API.
Expand Down Expand Up @@ -1217,6 +1231,37 @@ def query_as_parquet(self, pipeline_name: str, query: str, path: str):
file.write(chunk)
file.close()

def query_as_arrow(
    self, pipeline_name: str, query: str
) -> Generator["pa.RecordBatch", None, None]:
    """
    Run an ad-hoc query on the given pipeline and stream the result back
    as PyArrow RecordBatches.

    :param pipeline_name: The name of the pipeline to query.
    :param query: The SQL query to be executed.
    :return: A generator that yields each query batch as a ``pyarrow.RecordBatch``.
    """
    # Fail fast with an actionable message if pyarrow is missing, before
    # any network traffic happens.
    arrow_ipc = _import_pyarrow_ipc()

    response: requests.Response = self.http.get(
        path=f"/pipelines/{pipeline_name}/query",
        params={
            "pipeline_name": pipeline_name,
            "sql": query,
            "format": "arrow_ipc",
        },
        stream=True,
    )

    try:
        # Decode the Arrow IPC stream straight off the raw socket so batches
        # are yielded incrementally instead of buffering the whole result.
        with arrow_ipc.open_stream(response.raw) as reader:
            yield from reader
    finally:
        # Release the HTTP connection even if the consumer stops iterating
        # early (generator close) or decoding fails mid-stream.
        response.close()

def query_as_json(
self, pipeline_name: str, query: str
) -> Generator[Mapping[str, Any], None, None]:
Expand Down
9 changes: 8 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ dependencies = [
"ruff>=0.6.9",
"PyJWT>=2.12.0",
]

[project.optional-dependencies]
arrow = [
"pyarrow>=14.0",
]

[project.urls]
Homepage = "https://www.feldera.com"
Documentation = "https://docs.feldera.com/python"
Expand All @@ -43,7 +49,8 @@ dev = [
"sphinx-rtd-theme==2.0.0",
"sphinx==7.3.7",
"simplejson==3.20.1",
"confluent-kafka>=2.2.0"
"confluent-kafka>=2.2.0",
"pyarrow>=14.0",
]

[tool.pytest.ini_options]
Expand Down
33 changes: 32 additions & 1 deletion python/tests/platform/test_shared_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import gzip
import io
import json
import os
Expand All @@ -6,9 +7,9 @@
import time
import unittest
import zipfile
import gzip

import pandas as pd
import pytest

from feldera import Pipeline
from feldera.enums import CompletionTokenStatus, PipelineFieldSelector, PipelineStatus
Expand Down Expand Up @@ -167,6 +168,36 @@ def test_adhoc_query_json(self):
got = list(resp)
self.assertCountEqual(got, expected)

def test_adhoc_query_arrow(self):
    """Arrow IPC ad-hoc queries must agree with the JSON query results."""
    pa = pytest.importorskip("pyarrow")

    self.pipeline.start()
    TEST_CLIENT.push_to_pipeline(self.pipeline.name, "tbl", "csv", "1\n2\n")

    # The JSON code path is the reference: whatever ids it reports, both
    # Arrow entry points must reproduce.
    reference_ids = [
        row["id"]
        for row in TEST_CLIENT.query_as_json(
            self.pipeline.name,
            "SELECT * FROM tbl ORDER BY id",
        )
    ]

    client_table = pa.Table.from_batches(
        list(
            TEST_CLIENT.query_as_arrow(
                self.pipeline.name,
                "SELECT * FROM tbl ORDER BY id",
            )
        )
    )
    assert client_table.column("id").to_pylist() == reference_ids

    pipeline_table = pa.Table.from_batches(
        list(self.pipeline.query_arrow("SELECT * FROM tbl ORDER BY id"))
    )
    assert pipeline_table.column("id").to_pylist() == reference_ids

def test_local(self):
"""
CREATE TABLE students (
Expand Down
143 changes: 143 additions & 0 deletions python/tests/unit/test_query_as_arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Unit tests for FelderaClient.query_as_arrow and Pipeline.query_arrow."""

import builtins
import io
import sys
from unittest.mock import MagicMock

import pytest

from feldera.rest.feldera_client import FelderaClient


def _import_arrow_modules():
    """Return ``(pyarrow, pyarrow.ipc)``, skipping the test when unavailable."""
    arrow = pytest.importorskip("pyarrow")
    arrow_ipc = pytest.importorskip("pyarrow.ipc")
    return arrow, arrow_ipc


def _make_ipc_bytes(table) -> bytes:
    """Encode a ``pyarrow.Table`` as Arrow IPC stream bytes."""
    _, arrow_ipc = _import_arrow_modules()
    sink = io.BytesIO()
    with arrow_ipc.new_stream(sink, table.schema) as writer:
        # An empty table still produces a valid schema-only stream, so only
        # the row data itself is conditional.
        if table.num_rows > 0:
            writer.write_table(table)
    return sink.getvalue()


def _mock_response(ipc_bytes: bytes) -> MagicMock:
"""Return a mock response whose ``raw`` is an Arrow IPC byte stream."""
resp = MagicMock()
resp.raw = io.BytesIO(ipc_bytes)
return resp


@pytest.fixture()
def client() -> FelderaClient:
    """A ``FelderaClient`` whose HTTP layer is fully mocked (no network)."""
    # __new__ sidesteps __init__, so no connection attempt is made.
    instance = FelderaClient.__new__(FelderaClient)
    instance.http = MagicMock()
    return instance


class TestQueryAsArrow:
    """Behavioural tests for ``FelderaClient.query_as_arrow`` over a mocked HTTP layer."""

    def test_non_empty_result_yields_correct_data(self, client: FelderaClient):
        # Round-trip: serialise a known table to IPC bytes, serve them through
        # the mocked response, and check the decoded batches reproduce it.
        pa, _ = _import_arrow_modules()
        schema = pa.schema([("id", pa.int64()), ("name", pa.utf8())])
        expected = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]}, schema=schema)
        client.http.get.return_value = _mock_response(_make_ipc_bytes(expected))

        batches = list(client.query_as_arrow("my_pipeline", "SELECT id, name FROM t"))
        result = pa.Table.from_batches(batches, schema=schema)

        assert len(batches) > 0
        assert result.schema == schema
        assert result.num_rows == 3
        assert result.column("id").to_pylist() == [1, 2, 3]
        assert result.column("name").to_pylist() == ["a", "b", "c"]

    def test_http_called_with_correct_params(self, client: FelderaClient):
        # Pin the exact request shape: path, query params, and streaming mode.
        pa, _ = _import_arrow_modules()
        schema = pa.schema([("id", pa.int64())])
        table = pa.table({"id": [42]}, schema=schema)
        client.http.get.return_value = _mock_response(_make_ipc_bytes(table))

        # query_as_arrow is a generator: list() forces it to run and issue the GET.
        list(client.query_as_arrow("my_pipeline", "SELECT id FROM t"))

        client.http.get.assert_called_once_with(
            path="/pipelines/my_pipeline/query",
            params={
                "pipeline_name": "my_pipeline",
                "sql": "SELECT id FROM t",
                "format": "arrow_ipc",
            },
            stream=True,
        )

    def test_empty_result_yields_no_batches(self, client: FelderaClient):
        # A zero-row table serialises to a schema-only IPC stream; decoding it
        # must yield no RecordBatches at all (not a single empty batch).
        pa, _ = _import_arrow_modules()
        schema = pa.schema([("id", pa.int64()), ("value", pa.float64())])
        empty = pa.table(
            {
                "id": pa.array([], type=pa.int64()),
                "value": pa.array([], type=pa.float64()),
            },
            schema=schema,
        )
        client.http.get.return_value = _mock_response(_make_ipc_bytes(empty))

        result_batches = list(
            client.query_as_arrow("my_pipeline", "SELECT id, value FROM t WHERE false")
        )

        assert result_batches == []

    def test_missing_pyarrow_raises_helpful_import_error(
        self, client: FelderaClient, monkeypatch
    ):
        # Simulate pyarrow being absent by intercepting the import machinery.
        real_import = builtins.__import__

        def _import(name, globals=None, locals=None, fromlist=(), level=0):
            if name == "pyarrow" or name.startswith("pyarrow."):
                raise ImportError("No module named 'pyarrow'")
            return real_import(name, globals, locals, fromlist, level)

        # Order matters: purge any cached pyarrow modules BEFORE patching
        # __import__, otherwise `import pyarrow.ipc` could hit sys.modules
        # and never reach our interceptor.
        monkeypatch.delitem(sys.modules, "pyarrow", raising=False)
        monkeypatch.delitem(sys.modules, "pyarrow.ipc", raising=False)
        monkeypatch.setattr(builtins, "__import__", _import)

        # next() drives the generator far enough to trigger the import check.
        with pytest.raises(ImportError, match="pip install feldera\\[arrow\\]"):
            next(client.query_as_arrow("my_pipeline", "SELECT 1"))

        # The import failure must happen before any network call is attempted.
        client.http.get.assert_not_called()

    def test_response_closed_after_full_consumption(self, client: FelderaClient):
        # The generator's finally-block must release the HTTP response once
        # the stream is fully consumed.
        pa, _ = _import_arrow_modules()
        schema = pa.schema([("id", pa.int64())])
        table = pa.table({"id": [1, 2]}, schema=schema)
        resp = _mock_response(_make_ipc_bytes(table))
        client.http.get.return_value = resp

        list(client.query_as_arrow("my_pipeline", "SELECT id FROM t"))

        resp.close.assert_called_once()


class TestPipelineQueryArrow:
    """Tests for the thin ``Pipeline.query_arrow`` wrapper."""

    def test_query_arrow_delegates_to_client(self):
        """Pipeline.query_arrow must forward to client.query_as_arrow."""
        from feldera.pipeline import Pipeline

        # __new__ avoids Pipeline.__init__, which would need a live client.
        pipeline = Pipeline.__new__(Pipeline)
        pipeline.client = MagicMock()
        pipeline._inner = MagicMock()
        pipeline._inner.name = "pipe1"

        sentinel = object()
        pipeline.client.query_as_arrow.return_value = sentinel

        returned = pipeline.query_arrow("SELECT x FROM v")

        pipeline.client.query_as_arrow.assert_called_once_with(
            "pipe1", "SELECT x FROM v"
        )
        assert returned is sentinel
Loading