Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions python/feldera/_callback_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from enum import Enum
from threading import Thread
from typing import Callable, Optional
from queue import Queue, Empty

import pandas as pd
from feldera import FelderaClient
from feldera._helpers import dataframe_from_response


class _CallbackRunnerInstruction(Enum):
    """
    Control messages sent to a `CallbackRunner` thread through its optional queue.
    """
    # The pipeline is (or is about to be) running; the runner should start listening.
    PipelineStarted = 1
    # The pipeline has finished; the runner should acknowledge and return.
    RanToCompletion = 2


class CallbackRunner(Thread):
    """
    A thread that listens to one view of a pipeline and invokes a user-supplied
    callback for every chunk of data received.

    When a queue is supplied, the runner synchronizes with the main thread: it
    waits for a `_CallbackRunnerInstruction` before listening, and acknowledges
    every instruction with `task_done` so the main thread can `join` the queue.
    """

    def __init__(
            self,
            client: FelderaClient,
            pipeline_name: str,
            view_name: str,
            callback: Callable[[pd.DataFrame, int], None],
            queue: Optional[Queue],
    ):
        """
        :param client: Client used to talk to the Feldera pipeline manager.
        :param pipeline_name: Name of the pipeline to listen to.
        :param view_name: Name of the view whose changes are delivered.
        :param callback: Invoked as ``callback(dataframe, sequence_number)`` for
            each chunk of data received from the view.
        :param queue: Optional queue used to synchronize startup and shutdown
            with the main thread; ``None`` means the pipeline is assumed to be
            already running.
        """
        super().__init__()
        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.callback: Callable[[pd.DataFrame, int], None] = callback
        self.queue: Optional[Queue] = queue

    def run(self):
        """
        The main loop of the thread. Listens for data and calls the callback
        function on each chunk of data received.

        :meta private:
        """

        # By default, assume that the pipeline has already been started.
        ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted

        # If there is a queue, we are listening to the pipeline before running
        # it, so wait for the instruction to start; this way all data produced
        # by the pipeline should be received.
        if self.queue:
            ack = self.queue.get()

        if ack == _CallbackRunnerInstruction.RanToCompletion:
            # Pipeline already ran to completion: acknowledge and exit without
            # ever opening a listener.
            if self.queue:
                self.queue.task_done()
            return

        # PipelineStarted: open the change stream for this view.
        gen_obj = self.client.listen_to_pipeline(self.pipeline_name, self.view_name, format="json")

        # If there is a queue set up, inform the main thread that the listener
        # has been started, so it can proceed with starting the pipeline.
        if self.queue:
            # Stop blocking the main thread on `join` for the previous message.
            self.queue.task_done()

        for chunk in gen_obj:
            data: Optional[list[dict]] = chunk.get("json_data")
            seq_no: Optional[int] = chunk.get("sequence_number")

            if data is not None:
                self.callback(dataframe_from_response([data]), seq_no)

            if self.queue:
                try:
                    # In a non-blocking way, check whether the queue has
                    # received further instructions; this should be a
                    # RanToCompletion instruction, meaning the pipeline has
                    # completed.
                    again_ack = self.queue.get_nowait()
                except Empty:
                    # No instruction yet; keep listening.
                    continue

                # Acknowledge whatever was received so the main thread is not
                # blocked on `join`.
                self.queue.task_done()

                if again_ack == _CallbackRunnerInstruction.RanToCompletion:
                    # The pipeline has completed: stop listening and return.
                    return
                # A repeated PipelineStarted shouldn't happen; ignore it and
                # continue listening.
11 changes: 11 additions & 0 deletions python/feldera/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd


def dataframe_from_response(buffer: list[list[dict]]):
    """
    Converts the response from Feldera to a pandas DataFrame.

    Each change record is either ``{"insert": row}`` or ``{"delete": row}``;
    the row is flattened and tagged with an ``insert_delete`` column that is
    ``1`` for inserts and ``-1`` for deletes.
    """
    records = []
    for chunk in buffer:
        for change in chunk:
            if 'insert' in change:
                row = dict(change['insert'])
                row['insert_delete'] = 1
            else:
                row = dict(change['delete'])
                row['insert_delete'] = -1
            records.append(row)
    return pd.DataFrame(records)
77 changes: 23 additions & 54 deletions python/feldera/output_handler.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,41 @@
import pandas as pd


from threading import Thread
from queue import Queue, Empty
from queue import Queue
from feldera import FelderaClient
from enum import Enum

from feldera._callback_runner import CallbackRunner

class _OutputHandlerInstruction(Enum):
PipelineStarted = 1
RanToCompletion = 2


class OutputHandler:
    """
    Collects the output of a pipeline view into an in-memory buffer, using a
    `CallbackRunner` thread to receive the data, and exposes the accumulated
    result as a single pandas DataFrame.
    """

    def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Queue):
        """
        Initializes the output handler, but doesn't start it.
        To start the output handler, call the `.OutputHandler.start` method.

        :param client: Client used to talk to the Feldera pipeline manager.
        :param pipeline_name: Name of the pipeline to listen to.
        :param view_name: Name of the view whose output is collected.
        :param queue: Queue used to synchronize the listener thread with the
            main thread (see `CallbackRunner`).
        """

        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.queue: Queue = queue
        # One DataFrame per received chunk; concatenated in `to_pandas`.
        self.buffer: list[pd.DataFrame] = []

        # The callback that is passed to the `CallbackRunner`: it accumulates
        # each chunk; the sequence number is ignored.
        def callback(df: pd.DataFrame, _: int):
            self.buffer.append(df)

        # Sets up (but does not start) the callback runner thread.
        self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)

    def start(self):
        """
        Starts the output handler in a separate thread.
        """

        self.handler.start()

    def to_pandas(self):
        """
        Returns the output of the pipeline as a pandas DataFrame.

        Blocks until the listener thread has finished.
        """

        self.handler.join()
        # `pd.concat` raises ValueError on an empty sequence; if the pipeline
        # produced no output, return an empty frame instead.
        if not self.buffer:
            return pd.DataFrame()
        return pd.concat(self.buffer)
15 changes: 4 additions & 11 deletions python/feldera/rest/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,17 +485,10 @@ def listen_to_pipeline(

end = time.time() + timeout if timeout else None

old_chunk = b""

for chunk in resp.iter_content(chunk_size=None):
# Using the default chunk size below makes `iter_lines` extremely
# inefficient when dealing with long lines.
for chunk in resp.iter_lines(chunk_size=50000000):
if end and time.time() > end:
break
if chunk:
try:
chunk = old_chunk + chunk
valid_json = json.loads(chunk)
old_chunk = b""
yield valid_json
except json.decoder.JSONDecodeError:
old_chunk += chunk
continue
yield json.loads(chunk)
Loading