Merged
2 changes: 1 addition & 1 deletion crates/pipeline_manager/src/db/mod.rs
@@ -1432,7 +1432,7 @@ impl ProjectDB {
.get_migrations()
.iter()
.map(|m| m.version())
.fold(std::u32::MIN, |a, b| a.max(b));
.fold(u32::MIN, |a, b| a.max(b));
let migration = runner.get_last_applied_migration_async(&mut **client).await;
if let Ok(Some(m)) = migration {
let v = m.version();
3 changes: 2 additions & 1 deletion demo/project_demo10-FraudDetectionDeltaLake/notebook.ipynb
@@ -333,7 +333,8 @@
"hfeature = sql.listen(\"feature\")\n",
"\n",
"# Process full snapshot of the input tables and compute a dataset with feature vectors.\n",
"sql.run_to_completion()\n",
"sql.start()\n",
"sql.wait_for_completion(shutdown=True)\n",
"\n",
"# Read computed feature vectors into a Pandas dataframe.\n",
"features_pd = hfeature.to_pandas()\n",
3 changes: 2 additions & 1 deletion demo/project_demo10-FraudDetectionDeltaLake/run.py
@@ -121,7 +121,8 @@ def main():

# Process full snapshot of the input tables and compute a dataset
# with feature vectors for use in model training and testing.
sql.run_to_completion()
sql.start()
sql.wait_for_completion(shutdown=True)

features_pd = hfeature.to_pandas()
print(f"Computed {len(features_pd)} feature vectors")
3 changes: 2 additions & 1 deletion docs/use_cases/fraud_detection/fraud_detection.md
@@ -298,7 +298,8 @@ hfeature = sql.listen("feature")

# Process full snapshot of the input tables and compute a dataset
# with feature vectors for use in model training and testing.
sql.run_to_completion()
sql.start()
sql.wait_for_completion(shutdown=True)

features_pd = hfeature.to_pandas()
print(f"Computed {len(features_pd)} feature vectors")
42 changes: 23 additions & 19 deletions python/docs/examples.rst
@@ -1,17 +1,17 @@
Examples
========

Pandas
*******
Using Pandas DataFrames as Input / Output
*******************************************


Working with pandas DataFrames in Feldera is fairly straightforward.
You can use :meth:`.SQLContext.connect_source_pandas` to connect a
You can use :meth:`.SQLContext.input_pandas` to connect a
DataFrame to a feldera table as the data source.

To listen for response from feldera, in the form of DataFrames, it is necessary
to to call :meth:`.SQLContext.listen` before you call
:meth:`.SQLContext.start` or :meth:`.SQLContext.run_to_completion`.
To listen for responses from Feldera, in the form of DataFrames,
call :meth:`.SQLContext.listen`.
To ensure all data is received, start listening before calling
:meth:`.SQLContext.start`.
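The ordering matters because output emitted before a listener is attached is never delivered to it. A toy pub/sub sketch illustrates the principle (plain Python; the ``ToyPipeline`` class is a hypothetical stand-in, not the feldera API):

```python
import queue

class ToyPipeline:
    """Stand-in for a pipeline that pushes results to subscribers.
    Records emitted before a subscriber attaches are simply lost,
    mirroring why listen() should precede start()."""

    def __init__(self):
        self.subscribers = []

    def listen(self):
        # Attach a new subscriber queue and return it to the caller.
        q = queue.Queue()
        self.subscribers.append(q)
        return q

    def start(self, records):
        # Emit all records; only currently-attached subscribers receive them.
        for r in records:
            for q in self.subscribers:
                q.put(r)

pipe = ToyPipeline()
out = pipe.listen()                      # subscribe BEFORE starting
pipe.start([1, 2, 3])
received = [out.get() for _ in range(3)]
print(received)                          # [1, 2, 3]
```

A listener attached after ``start`` would see an empty queue, which is the failure mode the documentation warns about.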
Contributor:
This could also be enforced via a state check in the listen() command?

Contributor Author:
You can also start listening at some arbitrary point after starting a pipeline.

Or listen to an already running pipeline.

Contributor:
True, that's possible; my question is more whether a Python user with SQLContext ever wants specifically that, as there are no guarantees at what point of the stream listening will start. Probably out of scope for this PR.


.. highlight:: python
.. code-block:: python
@@ -47,17 +47,20 @@ to to call :meth:`.SQLContext.listen` before you call
query = f"SELECT name, ((science + maths + art) / 3) as average FROM {TBL_NAMES[0]} JOIN {TBL_NAMES[1]} on id = student_id ORDER BY average DESC"
sql.register_output_view(view_name, query)

# connect the source (a pandas Dataframe in this case) to the tables
sql.connect_source_pandas(TBL_NAMES[0], df_students)
sql.connect_source_pandas(TBL_NAMES[1], df_grades)

# listen for the output of the view here in the notebook
# you do not need to call this if you are forwarding the data to a sink
out = sql.listen(view_name)

# run this to completion
# start the pipeline
sql.start()

# connect the source (a pandas Dataframe in this case) to the tables
sql.input_pandas(TBL_NAMES[0], df_students)
sql.input_pandas(TBL_NAMES[1], df_grades)

# wait for the pipeline to complete
# note that if the source is a stream, this will run indefinitely
sql.run_to_completion()
sql.wait_for_completion(shutdown=True)

# finally, convert the output to a pandas Dataframe
df = out.to_pandas()
@@ -66,8 +69,8 @@ to to call :meth:`.SQLContext.listen` before you call
print(df)


Kafka
******
Using Kafka as Data Source / Sink
***********************************

To setup Kafka as the source use :meth:`.SQLContext.connect_source_kafka` and as the sink use
:meth:`.SQLContext.connect_sink_kafka`.
@@ -115,7 +118,7 @@ Here the only notable difference is:
More on Kafka as the output connector at: https://www.feldera.com/docs/connectors/sinks/kafka

.. warning::
Kafka is a streaming data source, therefore running: :meth:`.SQLContext.run_to_completion` will run forever.
Kafka is a streaming data source; therefore, calling :meth:`.SQLContext.wait_for_completion` will block forever.
Contributor:
Ideally (though not in this PR), an error should be thrown if a streaming data source is defined and the user tries to call wait_for_completion().


.. highlight:: python
.. code-block:: python
@@ -156,8 +159,8 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
df = out.to_pandas()


HTTP GET
*********
Ingesting data from a URL
**************************


Feldera can ingest data from a user-provided URL into a SQL table.
@@ -192,7 +195,8 @@ More on the HTTP GET connector at: https://www.feldera.com/docs/connectors/sourc

out = sql.listen(VIEW_NAME)

sql.run_to_completion()
sql.start()
sql.wait_for_completion(shutdown=True)

df = out.to_pandas()

8 changes: 5 additions & 3 deletions python/docs/introduction.rst
@@ -64,10 +64,12 @@ Key Concepts
the name used in both Feldera Program and Pipeline.
- The second parameter here is :class:`.FelderaClient` that we created above.

* :meth:`.SQLContext.run_to_completion`
- Runs this Feldera pipeline to completion. Normally this means until the EoF
* :meth:`.SQLContext.wait_for_completion`
- Blocks the caller until this Feldera pipeline completes. Normally this means until the end-of-file (EOF)
has been reached for this input source.

- Takes a parameter ``shutdown``; when set, the pipeline is shut down after completion.

- Example:

.. code-block:: python
@@ -100,7 +102,7 @@ Key Concepts

sql.connect_sink_delta_table(view_name, out_con, out_cfg)

sql.run_to_completion()
sql.wait_for_completion(shutdown=True)

- Here, we register a data table which receives data from input sources.
- Then, we register a view that performs operations on this input data.
2 changes: 1 addition & 1 deletion python/feldera/_helpers.py
@@ -40,4 +40,4 @@ def chunk_dataframe(df, chunk_size=1000):
"""

for i in range(0, len(df), chunk_size):
yield df.iloc[i:i + chunk_size]
yield df.iloc[i:i + chunk_size]
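The helper touched by this diff splits a DataFrame into fixed-size row chunks via ``iloc`` slicing (the change above only fixes a missing trailing newline). A minimal, self-contained sketch of the same pattern:

```python
import pandas as pd

def chunk_dataframe(df, chunk_size=1000):
    # Yield consecutive slices of at most chunk_size rows each;
    # the final chunk may be smaller.
    for i in range(0, len(df), chunk_size):
        yield df.iloc[i:i + chunk_size]

df = pd.DataFrame({"x": range(10)})
sizes = [len(chunk) for chunk in chunk_dataframe(df, chunk_size=4)]
print(sizes)  # [4, 4, 2]
```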
150 changes: 150 additions & 0 deletions python/feldera/enums.py
@@ -31,3 +31,153 @@ class BuildMode(Enum):
CREATE = 1
GET = 2
GET_OR_CREATE = 3


class PipelineStatus(Enum):
"""
Represents the state that this pipeline is currently in.

.. code-block:: text

Shutdown ◄────┐
│ │
/deploy │ │
│ ⌛ShuttingDown
▼ ▲
⌛Provisioning │
│ │
Provisioned │
▼ │/shutdown
⌛Initializing │
│ │
┌────────┴─────────┴─┐
│ ▼ │
│ Paused │
│ │ ▲ │
│/start│ │/pause │
│ ▼ │ │
│ Running │
└──────────┬─────────┘
Failed
"""

NOT_FOUND = 1
"""
The pipeline has not been created yet.
"""

SHUTDOWN = 2
"""
Pipeline has not been started or has been shut down.

The pipeline remains in this state until the user triggers
a deployment by invoking the `/deploy` endpoint.
"""

PROVISIONING = 3
"""
The runner triggered a deployment of the pipeline and is
waiting for the pipeline HTTP server to come up.

In this state, the runner provisions a runtime for the pipeline,
starts the pipeline within this runtime and waits for it to start accepting HTTP requests.

The user is unable to communicate with the pipeline during this
time. The pipeline remains in this state until:

1. Its HTTP server is up and running; the pipeline transitions to the
`PipelineStatus.INITIALIZING` state.
2. A pre-defined timeout has passed. The runner performs forced
shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
3. The user cancels the pipeline by invoking the `/shutdown` endpoint.
The manager performs forced shutdown of the pipeline, returns to the
`PipelineStatus.SHUTDOWN` state.

"""

INITIALIZING = 4
"""
The pipeline is initializing its internal state and connectors.

This state is part of the pipeline's deployment process. In this state,
the pipeline's HTTP server is up and running, but its query engine
and input and output connectors are still initializing.

The pipeline remains in this state until:

1. Initialization completes successfully; the pipeline transitions to the
`PipelineStatus.PAUSED` state.
2. Initialization fails; transitions to the `PipelineStatus.FAILED` state.
3. A pre-defined timeout has passed. The runner performs forced
shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
4. The user cancels the pipeline by invoking the `/shutdown` endpoint.
The manager performs forced shutdown of the pipeline; returns to the
`PipelineStatus.SHUTDOWN` state.

"""

PAUSED = 5
"""
The pipeline is fully initialized, but data processing has been paused.

The pipeline remains in this state until:

1. The user starts the pipeline by invoking the `/start` endpoint. The
manager passes the request to the pipeline; transitions to the
`PipelineStatus.RUNNING` state.
2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
The manager passes the shutdown request to the pipeline to perform a
graceful shutdown; transitions to the `PipelineStatus.SHUTTING_DOWN` state.
3. An unexpected runtime error renders the pipeline `PipelineStatus.FAILED`.

"""

RUNNING = 6
"""
The pipeline is processing data.

The pipeline remains in this state until:

1. The user pauses the pipeline by invoking the `/pause` endpoint. The
manager passes the request to the pipeline; transitions to the
`PipelineStatus.PAUSED` state.
2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
The runner passes the shutdown request to the pipeline to perform a
graceful shutdown; transitions to the
`PipelineStatus.SHUTTING_DOWN` state.
3. An unexpected runtime error renders the pipeline
`PipelineStatus.FAILED`.

"""

SHUTTING_DOWN = 7
"""
Graceful shutdown in progress.

In this state, the pipeline finishes any ongoing data processing,
produces final outputs, shuts down input/output connectors and
terminates.

The pipeline remains in this state until:

1. Shutdown completes successfully; transitions to the `PipelineStatus.SHUTDOWN` state.
2. A pre-defined timeout has passed. The manager performs forced shutdown of the pipeline; returns to the
`PipelineStatus.SHUTDOWN` state.

"""

FAILED = 8
"""
    The pipeline remains in this state until the user acknowledges the failure
    by issuing a call to shut down the pipeline; transitions to the
    `PipelineStatus.SHUTDOWN` state.
"""

@staticmethod
def from_str(value):
for member in PipelineStatus:
if member.name.lower() == value.lower():
return member
raise ValueError(f"Unknown value '{value}' for enum {PipelineStatus.__name__}")
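The ``from_str`` helper added above does a case-insensitive lookup over enum member names. The same pattern can be sketched self-contained with a stand-in enum (not the feldera package):

```python
from enum import Enum

class Status(Enum):
    SHUTDOWN = 1
    RUNNING = 2

    @staticmethod
    def from_str(value):
        # Match on the member name, ignoring case.
        for member in Status:
            if member.name.lower() == value.lower():
                return member
        raise ValueError(f"Unknown value '{value}' for enum {Status.__name__}")

print(Status.from_str("Running"))  # Status.RUNNING
```

This keeps the wire format (lowercase status strings from the REST API) decoupled from the Python naming convention for enum members.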
2 changes: 1 addition & 1 deletion python/feldera/formats.py
@@ -169,7 +169,7 @@ def __init__(
self,
config: Optional[dict] = None,
schema: Optional[str] = None,
skip_schema_id: Optional[bool] = None,
skip_schema_id: Optional[bool] = False,
registry_urls: Optional[list[str]] = None,
registry_headers: Optional[Mapping[str, str]] = None,
registry_proxy: Optional[str] = None,
17 changes: 11 additions & 6 deletions python/feldera/output_handler.py
@@ -1,12 +1,13 @@
import pandas as pd
from typing import Optional

from queue import Queue
from feldera import FelderaClient
from feldera._callback_runner import CallbackRunner


class OutputHandler:
def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Queue):
def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Optional[Queue]):
"""
Initializes the output handler, but doesn't start it.
To start the output handler, call the `.OutputHandler.start` method.
@@ -15,7 +16,7 @@ def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, qu
self.client: FelderaClient = client
self.pipeline_name: str = pipeline_name
self.view_name: str = view_name
self.queue: Queue = queue
self.queue: Optional[Queue] = queue
self.buffer: list[pd.DataFrame] = []

# the callback that is passed to the `CallbackRunner`
@@ -33,13 +34,17 @@ def start(self):

self.handler.start()

def to_pandas(self):
def to_pandas(self, clear_buffer: bool = True):
"""
Returns the output of the pipeline as a pandas DataFrame
"""

self.handler.join()
:param clear_buffer: Whether to clear the buffer after getting the output.
"""

if len(self.buffer) == 0:
return pd.DataFrame()
return pd.concat(self.buffer, ignore_index=True)
res = pd.concat(self.buffer, ignore_index=True)
if clear_buffer:
self.buffer.clear()

return res
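With the new ``clear_buffer`` flag, a second call to ``to_pandas()`` returns an empty frame unless the caller opts to retain the buffer. A standalone sketch of that contract (``BufferedOutput`` is a stand-in, not the feldera ``OutputHandler``):

```python
import pandas as pd

class BufferedOutput:
    def __init__(self):
        self.buffer = []

    def to_pandas(self, clear_buffer=True):
        # Concatenate buffered chunks; optionally drain the buffer
        # so repeated calls do not return the same rows twice.
        if len(self.buffer) == 0:
            return pd.DataFrame()
        res = pd.concat(self.buffer, ignore_index=True)
        if clear_buffer:
            self.buffer.clear()
        return res

out = BufferedOutput()
out.buffer.append(pd.DataFrame({"x": [1, 2]}))
first = out.to_pandas()           # two rows, buffer drained
second = out.to_pandas()          # empty DataFrame
print(len(first), len(second))    # 2 0
```

Passing ``clear_buffer=False`` would instead keep the chunks, so the rows accumulate across calls.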