Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion python/feldera/_callback_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(
view_name: str,
callback: Callable[[pd.DataFrame, int], None],
queue: Optional[Queue],
case_sensitive: bool,
):
super().__init__()
self.daemon = True
Expand All @@ -30,6 +31,7 @@ def __init__(
self.callback: Callable[[pd.DataFrame, int], None] = callback
self.queue: Optional[Queue] = queue
self.schema: Optional[dict] = None
self.case_sensitive: bool = case_sensitive

def run(self):
"""
Expand Down Expand Up @@ -66,7 +68,10 @@ def run(self):
case _CallbackRunnerInstruction.PipelineStarted:
# listen to the pipeline
gen_obj = self.client.listen_to_pipeline(
self.pipeline_name, self.view_name, format="json"
self.pipeline_name,
self.view_name,
format="json",
case_sensitive=self.case_sensitive,
)

# if there is a queue set up, inform the main thread that the listener has been started, and it can
Expand Down
8 changes: 7 additions & 1 deletion python/feldera/output_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(
pipeline_name: str,
view_name: str,
queue: Optional[Queue],
case_sensitive: bool,
):
"""
Initializes the output handler, but doesn't start it.
Expand All @@ -32,7 +33,12 @@ def callback(df: pd.DataFrame, _: int):

# sets up the callback runner
self.handler = CallbackRunner(
self.client, self.pipeline_name, self.view_name, callback, queue
self.client,
self.pipeline_name,
self.view_name,
callback,
queue,
case_sensitive,
)

def start(self):
Expand Down
69 changes: 64 additions & 5 deletions python/feldera/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from datetime import datetime

import pandas
import warnings
import pyarrow

from typing import List, Dict, Callable, Optional, Generator, Mapping, Any
from typing import List, Dict, Callable, Optional, Generator, Mapping, Any, Tuple
from collections import deque
from queue import Queue

Expand Down Expand Up @@ -204,7 +206,7 @@ def resume_connector(self, table_name: str, connector_name: str):

self.client.resume_connector(self.name, table_name, connector_name)

def listen(self, view_name: str) -> OutputHandler:
def listen(self, view_name: str, case_sensitive: bool = False) -> OutputHandler:
"""
Follow the change stream (i.e., the output) of the provided view.
Returns an output handler to read the changes.
Expand All @@ -215,6 +217,7 @@ def listen(self, view_name: str) -> OutputHandler:
If this method is called once the pipeline has started, you will only get the output from that point onwards.

:param view_name: The name of the view to listen to.
:param case_sensitive: True if the view name is case sensitive.
"""

queue: Optional[Queue] = None
Expand All @@ -223,13 +226,18 @@ def listen(self, view_name: str) -> OutputHandler:
queue = Queue(maxsize=1)
self.views_tx.append({view_name: queue})

handler = OutputHandler(self.client, self.name, view_name, queue)
handler = OutputHandler(
self.client, self.name, view_name, queue, case_sensitive
)
handler.start()

return handler

def foreach_chunk(
self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]
self,
view_name: str,
callback: Callable[[pandas.DataFrame, int], None],
case_sensitive: bool = False,
):
"""
Run the given callback on each chunk of the output of the specified view.
Expand All @@ -244,6 +252,8 @@ def foreach_chunk(
- **seq_no** -> The sequence number. The sequence number is a monotonically increasing integer that
starts from 0. Note that the sequence number is unique for each chunk, but not necessarily contiguous.

:param case_sensitive: True if the view name is case sensitive.

Please note that the callback is run in a separate thread, so it should be thread-safe.
Please note that the callback should not block for a long time, as by default, backpressure is enabled and
will block the pipeline.
Expand All @@ -259,7 +269,9 @@ def foreach_chunk(
queue = Queue(maxsize=1)
self.views_tx.append({view_name: queue})

handler = CallbackRunner(self.client, self.name, view_name, callback, queue)
handler = CallbackRunner(
self.client, self.name, view_name, callback, queue, case_sensitive
)
handler.start()

def wait_for_completion(
Expand Down Expand Up @@ -692,9 +704,56 @@ def query(self, query: str) -> Generator[Mapping[str, Any], None, None]:
:raises FelderaAPIError: If querying a non materialized table or view.
:raises FelderaAPIError: If the query is invalid.
"""
warnings.warn(
"function query is deprecated and will be removed in a future version",
category=DeprecationWarning,
stacklevel=2,
)

return self.client.query_as_json(self.name, query)

def query_pyarrow(self, query: str) -> pyarrow.Table:
    """
    Run an ad-hoc SQL query against this pipeline and collect all of the
    results into a single :class:`.pyarrow.Table`.

    Note:
        You can only ``SELECT`` from materialized tables and views.

    :param query: The SQL query to be executed.

    :return: The table produced by the client's ``query_as_pyarrow`` call
        for this pipeline.

    :raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
        state.
    :raises FelderaAPIError: If querying a non materialized table or view.
    :raises FelderaAPIError: If the query is invalid.
    """

    # Pure delegation: the REST client does the actual work.
    client = self.client
    result = client.query_as_pyarrow(self.name, query)
    return result

def query_pylist(self, query: str) -> List[List[Tuple[str, Any]]]:
    """
    Executes an ad-hoc SQL query on this pipeline and returns the results
    as a list of rows. Each row is represented as a list of
    (column_name, value) tuples.

    Rows are built from (name, value) pairs rather than dictionaries, so
    columns that share a name are all kept, in column order.

    Note:
        You can only ``SELECT`` from materialized tables and views.

    :param query: The SQL query to be executed.

    :return: A list of rows, where each row is a list of
        (column_name, value) tuples. An empty result yields an empty list.

    :raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
        state.
    :raises FelderaAPIError: If querying a non materialized table or view.
    :raises FelderaAPIError: If the query is invalid.
    """

    table = self.query_pyarrow(query)

    # The column names are the same for every row: read them once here
    # instead of once per row inside the comprehension below.
    names = table.schema.names

    # Convert column-wise, then transpose into rows with zip(*columns).
    columns = [col.to_pylist() for col in table.itercolumns()]
    return [list(zip(names, row)) for row in zip(*columns)]

def query_parquet(self, query: str, path: str):
"""
Executes an ad-hoc SQL query on this pipeline and saves the result to the specified path as a parquet file.
Expand Down
54 changes: 50 additions & 4 deletions python/feldera/rest/feldera_client.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import pathlib
from typing import Any, Dict, Optional
import logging
import warnings
import time
import json
from decimal import Decimal
from typing import Generator

import pyarrow

from feldera.rest.config import Config
from feldera.rest.feldera_config import FelderaConfig
from feldera.rest.errors import FelderaTimeoutError
Expand Down Expand Up @@ -340,7 +343,9 @@ def shutdown_pipeline(self, pipeline_name: str, timeout_s: Optional[float] = 300
time.sleep(0.1)

raise FelderaTimeoutError(
f"timeout error: pipeline '{pipeline_name}' did not shutdown in {timeout_s} seconds"
f"timeout error: pipeline '{pipeline_name}' did not shutdown in {
timeout_s
} seconds"
)

def suspend_pipeline(self, pipeline_name: str, timeout_s: Optional[float] = 300):
Expand Down Expand Up @@ -379,7 +384,9 @@ def suspend_pipeline(self, pipeline_name: str, timeout_s: Optional[float] = 300)
time.sleep(0.1)

raise FelderaTimeoutError(
f"timeout error: pipeline '{pipeline_name}' did not suspend in {timeout_s} seconds"
f"timeout error: pipeline '{pipeline_name}' did not suspend in {
timeout_s
} seconds"
)

def checkpoint_pipeline(self, pipeline_name: str) -> int:
Expand Down Expand Up @@ -531,6 +538,7 @@ def listen_to_pipeline(
pipeline_name: str,
table_name: str,
format: str,
case_sensitive: bool = False,
backpressure: bool = True,
array: bool = False,
timeout: Optional[float] = None,
Expand Down Expand Up @@ -559,6 +567,9 @@ def listen_to_pipeline(
if format == "json":
params["array"] = _prepare_boolean_input(array)

if case_sensitive:
table_name = f'"{table_name}"'

resp = self.http.post(
path=f"/pipelines/{pipeline_name}/egress/{table_name}",
params=params,
Expand Down Expand Up @@ -650,6 +661,13 @@ def query_as_json(
:param query: The SQL query to be executed.
:return: A generator that yields each row of the result as a Python dictionary, deserialized from JSON.
"""

warnings.warn(
"function query_as_json is deprecated and will be removed in a future version",
category=DeprecationWarning,
stacklevel=2,
)

params = {
"pipeline_name": pipeline_name,
"sql": query,
Expand All @@ -666,6 +684,30 @@ def query_as_json(
if chunk:
yield json.loads(chunk, parse_float=Decimal)

def query_as_pyarrow(self, pipeline_name: str, query: str) -> pyarrow.Table:
    """
    Run an ad-hoc query on the given pipeline and return every record of
    the result as one pyarrow Table.

    The request asks for the ``arrow_ipc`` format with streaming enabled,
    and the reply body is decoded as an Arrow IPC record-batch stream.

    :param pipeline_name: The name of the pipeline to query.
    :param query: The SQL query to be executed.
    :return: A pyarrow.Table consisting of all the records.
    """
    response = self.http.get(
        path=f"/pipelines/{pipeline_name}/query",
        params={
            "pipeline_name": pipeline_name,
            "sql": query,
            "format": "arrow_ipc",
        },
        stream=True,
    )

    # Decode the stream straight from the raw response object; read_all()
    # gathers every record batch into a single table.
    with pyarrow.ipc.RecordBatchStreamReader(response.raw) as stream_reader:
        table = stream_reader.read_all()
    return table

def pause_connector(self, pipeline_name, table_name, connector_name):
"""
Pause the specified input connector.
Expand All @@ -685,7 +727,9 @@ def pause_connector(self, pipeline_name, table_name, connector_name):
"""

self.http.post(
path=f"/pipelines/{pipeline_name}/tables/{table_name}/connectors/{connector_name}/pause",
path=f"/pipelines/{pipeline_name}/tables/{table_name}/connectors/{
connector_name
}/pause",
)

def resume_connector(
Expand All @@ -709,7 +753,9 @@ def resume_connector(
"""

self.http.post(
path=f"/pipelines/{pipeline_name}/tables/{table_name}/connectors/{connector_name}/start",
path=f"/pipelines/{pipeline_name}/tables/{table_name}/connectors/{
connector_name
}/start",
)

def get_config(self) -> FelderaConfig:
Expand Down
1 change: 1 addition & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies = [
"numpy>=2.2.4",
"pretty-errors",
"ruff>=0.6.9",
"pyarrow>=20.0.0",
]
[project.urls]
Homepage = "https://www.feldera.com"
Expand Down
28 changes: 28 additions & 0 deletions python/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import uuid
import threading

from decimal import Decimal
from tests import TEST_CLIENT

from feldera.rest.pipeline import Pipeline
Expand Down Expand Up @@ -236,6 +237,33 @@ def test_adhoc_query_json(self):
TEST_CLIENT.shutdown_pipeline(name)
TEST_CLIENT.delete_pipeline(name)

def test_adhoc_query_pyarrow(self):
    """
    Check that an ad-hoc query fetched as a pyarrow Table round-trips
    INT, DECIMAL, DOUBLE and MAP columns.

    Two rows are pushed into ``tbl`` and the materialized view ``v0`` is
    queried; rows are compared without regard to order.
    """
    data = "1\n2\n"
    name = str(uuid.uuid4())

    sql = """
CREATE TABLE tbl(id INT) with ('materialized' = 'true');
CREATE MATERIALIZED VIEW v0 AS SELECT id, '3.14'::DECIMAL(5, 2) as d, '3.14159'::DOUBLE as dbl, MAP[1, 10] as m FROM tbl;
"""

    pipeline = Pipeline(name, sql, "", "", {}, {})
    pipeline = TEST_CLIENT.create_pipeline(pipeline)

    # Once the pipeline exists, always tear it down, even if a later step
    # or the assertion raises; otherwise a failing run leaves the
    # pipeline behind on the test server.
    try:
        TEST_CLIENT.start_pipeline(name)

        TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)
        resp = TEST_CLIENT.query_as_pyarrow(pipeline.name, "select * from v0")
        expected = [
            {"id": 2, "d": Decimal("3.14"), "dbl": 3.14159, "m": [(1, 10)]},
            {"id": 1, "d": Decimal("3.14"), "dbl": 3.14159, "m": [(1, 10)]},
        ]
        got = resp.to_pylist()

        # Row order is not asserted, so compare as multisets.
        self.assertCountEqual(got, expected)
    finally:
        TEST_CLIENT.shutdown_pipeline(name)
        TEST_CLIENT.delete_pipeline(name)


if __name__ == "__main__":
unittest.main()
Loading