Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions python/feldera/_callback_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from enum import Enum
from threading import Thread
from typing import Callable, Optional
from queue import Queue, Empty

import pandas as pd
from feldera import FelderaClient
from feldera._helpers import dataframe_from_response


class _CallbackRunnerInstruction(Enum):
    """
    Control messages sent to a `CallbackRunner` thread through its optional queue.
    """
    # The pipeline is (or is about to be) running; the runner should start listening.
    PipelineStarted = 1
    # The pipeline has finished; the runner should acknowledge and return.
    RanToCompletion = 2


class CallbackRunner(Thread):
    """
    A thread that listens to one view of a pipeline and invokes a user-supplied
    callback for every chunk of data received.

    When a queue is supplied, the runner synchronizes with the main thread: it
    waits for a `_CallbackRunnerInstruction` before listening, and acknowledges
    every instruction with `task_done` so the main thread can `join` the queue.
    """

    def __init__(
            self,
            client: FelderaClient,
            pipeline_name: str,
            view_name: str,
            callback: Callable[[pd.DataFrame, int], None],
            queue: Optional[Queue],
    ):
        """
        :param client: Client used to talk to the Feldera pipeline manager.
        :param pipeline_name: Name of the pipeline to listen to.
        :param view_name: Name of the view whose changes are delivered.
        :param callback: Invoked as ``callback(dataframe, sequence_number)`` for
            each chunk of data received from the view.
        :param queue: Optional queue used to synchronize startup and shutdown
            with the main thread; ``None`` means the pipeline is assumed to be
            already running.
        """
        super().__init__()
        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.callback: Callable[[pd.DataFrame, int], None] = callback
        self.queue: Optional[Queue] = queue

    def run(self):
        """
        The main loop of the thread. Listens for data and calls the callback
        function on each chunk of data received.

        :meta private:
        """

        # By default, assume that the pipeline has already been started.
        ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted

        # If there is a queue, we are listening to the pipeline before running
        # it, so wait for the instruction to start; this way all data produced
        # by the pipeline should be received.
        if self.queue:
            ack = self.queue.get()

        if ack == _CallbackRunnerInstruction.RanToCompletion:
            # Pipeline already ran to completion: acknowledge and exit without
            # ever opening a listener.
            if self.queue:
                self.queue.task_done()
            return

        # PipelineStarted: open the change stream for this view.
        gen_obj = self.client.listen_to_pipeline(self.pipeline_name, self.view_name, format="json")

        # If there is a queue set up, inform the main thread that the listener
        # has been started, so it can proceed with starting the pipeline.
        if self.queue:
            # Stop blocking the main thread on `join` for the previous message.
            self.queue.task_done()

        for chunk in gen_obj:
            data: Optional[list[dict]] = chunk.get("json_data")
            seq_no: Optional[int] = chunk.get("sequence_number")

            if data is not None:
                self.callback(dataframe_from_response([data]), seq_no)

            if self.queue:
                try:
                    # In a non-blocking way, check whether the queue has
                    # received further instructions; this should be a
                    # RanToCompletion instruction, meaning the pipeline has
                    # completed.
                    again_ack = self.queue.get_nowait()
                except Empty:
                    # No instruction yet; keep listening.
                    continue

                # Acknowledge whatever was received so the main thread is not
                # blocked on `join`.
                self.queue.task_done()

                if again_ack == _CallbackRunnerInstruction.RanToCompletion:
                    # The pipeline has completed: stop listening and return.
                    return
                # A repeated PipelineStarted shouldn't happen; ignore it and
                # continue listening.
11 changes: 11 additions & 0 deletions python/feldera/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd


def dataframe_from_response(buffer: list[list[dict]]):
    """
    Converts the response from Feldera to a pandas DataFrame.

    Each change record is either ``{"insert": row}`` or ``{"delete": row}``;
    the row is flattened and tagged with an ``insert_delete`` column that is
    ``1`` for inserts and ``-1`` for deletes.
    """
    records = []
    for chunk in buffer:
        for change in chunk:
            if 'insert' in change:
                row = dict(change['insert'])
                row['insert_delete'] = 1
            else:
                row = dict(change['delete'])
                row['insert_delete'] = -1
            records.append(row)
    return pd.DataFrame(records)
77 changes: 23 additions & 54 deletions python/feldera/output_handler.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,41 @@
import pandas as pd


from threading import Thread
from queue import Queue, Empty
from queue import Queue
from feldera import FelderaClient
from enum import Enum

from feldera._callback_runner import CallbackRunner

class _OutputHandlerInstruction(Enum):
PipelineStarted = 1
RanToCompletion = 2


class OutputHandler:
    """
    Collects the output of a pipeline view into an in-memory buffer, using a
    `CallbackRunner` thread to receive the data, and exposes the accumulated
    result as a single pandas DataFrame.
    """

    def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Queue):
        """
        Initializes the output handler, but doesn't start it.
        To start the output handler, call the `.OutputHandler.start` method.

        :param client: Client used to talk to the Feldera pipeline manager.
        :param pipeline_name: Name of the pipeline to listen to.
        :param view_name: Name of the view whose output is collected.
        :param queue: Queue used to synchronize the listener thread with the
            main thread (see `CallbackRunner`).
        """

        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.queue: Queue = queue
        # One DataFrame per received chunk; concatenated in `to_pandas`.
        self.buffer: list[pd.DataFrame] = []

        # The callback that is passed to the `CallbackRunner`: it accumulates
        # each chunk; the sequence number is ignored.
        def callback(df: pd.DataFrame, _: int):
            self.buffer.append(df)

        # Sets up (but does not start) the callback runner thread.
        self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)

    def start(self):
        """
        Starts the output handler in a separate thread.
        """

        self.handler.start()

    def to_pandas(self):
        """
        Returns the output of the pipeline as a pandas DataFrame.

        Blocks until the listener thread has finished.
        """

        self.handler.join()
        # `pd.concat` raises ValueError on an empty sequence; if the pipeline
        # produced no output, return an empty frame instead.
        if not self.buffer:
            return pd.DataFrame()
        return pd.concat(self.buffer)
15 changes: 4 additions & 11 deletions python/feldera/rest/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,17 +485,10 @@ def listen_to_pipeline(

end = time.time() + timeout if timeout else None

old_chunk = b""

for chunk in resp.iter_content(chunk_size=None):
# Using the default chunk size below makes `iter_lines` extremely
# inefficient when dealing with long lines.
for chunk in resp.iter_lines(chunk_size=50000000):
if end and time.time() > end:
break
if chunk:
try:
chunk = old_chunk + chunk
valid_json = json.loads(chunk)
old_chunk = b""
yield valid_json
except json.decoder.JSONDecodeError:
old_chunk += chunk
continue
yield json.loads(chunk)
Loading