Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [Python] Added support for Kafka connector via Python SDK
([#1807](https://github.com/feldera/feldera/pull/1807))
- [Python] Added support for HTTP GET connector via Python SDK

### Added

Expand Down
1 change: 1 addition & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ docs/feldera.rst
docs/feldera.rest.rst
build
feldera.egg-info
UNKNOWN.egg-info
55 changes: 50 additions & 5 deletions python/docs/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ Kafka
To setup Kafka as the source use :meth:`.SQLContext.connect_source_kafka` and as the sink use
:meth:`.SQLContext.connect_sink_kafka`.

Both of these methods require a ``config`` which is a key-value pair.
Both of these methods require a ``config`` which is a dictionary, and ``fmt`` which is a
`data format configuration <https://www.feldera.com/docs/api/json>`_ that is either a
:class:`.JSONFormat` or :class:`.CSVFormat`.

The input config looks like the following:

Expand Down Expand Up @@ -119,7 +121,7 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
.. code-block:: python

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, UpdateFormat
from feldera.formats import JSONFormat, JSONUpdateFormat

TABLE_NAME = "example"
VIEW_NAME = "example_count"
Expand All @@ -141,13 +143,56 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
"auto.offset.reset": "earliest",
}

kafka_format = JSONFormat().with_update_format(UpdateFormat.InsertDelete).with_array(False)
# Data format configuration
format = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)

sql.connect_source_kafka(TABLE_NAME, "kafka_conn_in", source_config, kafka_format)
sql.connect_sink_kafka(VIEW_NAME, "kafka_conn_out", sink_config, kafka_format)
sql.connect_source_kafka(TABLE_NAME, "kafka_conn_in", source_config, format)
sql.connect_sink_kafka(VIEW_NAME, "kafka_conn_out", sink_config, format)

out = sql.listen(VIEW_NAME)
sql.start()
time.sleep(10)
sql.shutdown()
df = out.to_pandas()


HTTP GET
*********


Feldera can ingest data from a user-provided URL into a SQL table.
The file is fetched using HTTP with the GET method.

More on the HTTP GET connector at: https://www.feldera.com/docs/connectors/sources/http-get

.. note::
   JSON input ingested by Feldera must be in
   `newline-delimited JSON (NDJSON) format <https://www.feldera.com/docs/api/json/#encoding-multiple-changes>`_.


.. highlight:: python
.. code-block:: python

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, JSONUpdateFormat

sql = SQLContext("test_http_get", TEST_CLIENT).get_or_create()

TBL_NAME = "items"
VIEW_NAME = "s"

sql.register_table(TBL_NAME, SQLSchema({"id": "INT", "name": "STRING"}))

sql.register_view(VIEW_NAME, f"SELECT * FROM {TBL_NAME}")

path = "https://feldera-basics-tutorial.s3.amazonaws.com/part.json"

fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
sql.connect_source_url(TBL_NAME, "part", path, fmt)

out = sql.listen(VIEW_NAME)

sql.run_to_completion()

df = out.to_pandas()

9 changes: 9 additions & 0 deletions python/feldera/_helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from feldera.formats import JSONFormat, CSVFormat


def dataframe_from_response(buffer: list[list[dict]]):
Expand All @@ -9,3 +10,11 @@ def dataframe_from_response(buffer: list[list[dict]]):
{**item['insert'], 'insert_delete': 1} if 'insert' in item else {**item['delete'], 'insert_delete': -1}
for sublist in buffer for item in sublist
])


def validate_connector_input_format(fmt: JSONFormat | CSVFormat):
    """
    Validates that ``fmt`` is a usable connector data-format configuration.

    :param fmt: The format object supplied to a connector
        (:class:`JSONFormat` or :class:`CSVFormat`).
    :raises ValueError: If ``fmt`` is not a :class:`JSONFormat` or
        :class:`CSVFormat`, or if a :class:`JSONFormat` has no update
        format configured.
    """

    # isinstance accepts a tuple of types; one check instead of two.
    if not isinstance(fmt, (JSONFormat, CSVFormat)):
        raise ValueError("format must be JSONFormat or CSVFormat")

    # A JSON connector config cannot be interpreted without an update format.
    if isinstance(fmt, JSONFormat) and fmt.config.get("update_format") is None:
        raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
42 changes: 34 additions & 8 deletions python/feldera/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,45 @@
from enum import Enum


class JSONUpdateFormat(Enum):
    """
    Supported JSON data change event formats.

    Each element in a JSON-formatted input stream describes an update to
    one or more records in an input table; the members of this enum select
    how such updates are represented.

    https://www.feldera.com/docs/api/json/#the-insertdelete-format
    """

    InsertDelete = 1
    """
    Insert/delete format.

    Every stream element carries an "insert" or "delete" command together
    with the record to be inserted into, or deleted from, the input table.

    Example: `{"insert": {"id": 1, "name": "Alice"}, "delete": {"id": 2, "name": "Bob"}}`
    Here, `id` and `name` are the columns in the table.
    """

    Raw = 2
    """
    Raw input format.

    Suitable for insert-only streams (no deletions): every stream element
    is a bare record, without any envelope, that gets inserted into the
    input table.

    Example: `{"id": 1, "name": "Alice"}`
    Here, `id` and `name` are the columns in the table.
    """

    def __str__(self):
        # Map each member to the wire-format name Feldera expects.
        wire_names = {
            JSONUpdateFormat.InsertDelete: "insert_delete",
            JSONUpdateFormat.Raw: "raw",
        }
        return wire_names[self]


Expand All @@ -41,15 +51,31 @@ class JSONFormat:
"""

def __init__(self, config: Optional[dict] = None):
"""
Creates a new JSONFormat instance.

:param config: Optional. Configuration for the JSON format.
"""

self.config: dict = config or {
"array": False,
}

def with_update_format(self, update_format: JSONUpdateFormat) -> Self:
    """
    Specifies the format of the data change events in the JSON data stream.

    :param update_format: The :class:`JSONUpdateFormat` to use for this stream.
    :return: This instance, to allow method chaining.
    """

    # str(x) is the idiomatic spelling of x.__str__().
    self.config["update_format"] = str(update_format)
    return self

def with_array(self, array: bool) -> Self:
"""
Set to `True` if updates in this stream are packaged into JSON arrays.

Example: `[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]`
"""

self.config["array"] = array
return self

Expand Down
60 changes: 49 additions & 11 deletions python/feldera/sql_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from feldera.output_handler import OutputHandler
from feldera._callback_runner import CallbackRunner, _CallbackRunnerInstruction
from feldera.formats import JSONFormat, CSVFormat
from feldera._helpers import validate_connector_input_format
from enum import Enum


Expand Down Expand Up @@ -407,10 +408,7 @@ def connect_source_kafka(
if config.get("topics") is None:
raise ValueError("topics is required in the config")

fmt = fmt.to_dict()

if fmt.get("config").get("update_format") is None:
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
validate_connector_input_format(fmt)

connector = Connector(
name=connector_name,
Expand All @@ -419,7 +417,7 @@ def connect_source_kafka(
"name": "kafka_input",
"config": config,
},
"format": fmt,
"format": fmt.to_dict(),
}
)

Expand All @@ -428,7 +426,13 @@ def connect_source_kafka(
else:
self.input_connectors_buffer[table_name] = [connector]

def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict, fmt: JSONFormat | CSVFormat):
def connect_sink_kafka(
self,
view_name: str,
connector_name: str,
config: dict,
fmt: JSONFormat | CSVFormat
):
"""
Associates the specified kafka topic on the specified Kafka server as output sink for the specified view in
Feldera. The topic is populated with changes in the specified view.
Expand All @@ -445,10 +449,7 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
if config.get("topic") is None:
raise ValueError("topic is required in the config")

fmt = fmt.to_dict()

if fmt.get("config").get("update_format") is None:
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
validate_connector_input_format(fmt)

connector = Connector(
name=connector_name,
Expand All @@ -457,7 +458,7 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
"name": "kafka_output",
"config": config,
},
"format": fmt,
"format": fmt.to_dict(),
}
)

Expand All @@ -466,6 +467,43 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
else:
self.output_connectors_buffer[view_name] = [connector]

def connect_source_url(
    self,
    table_name: str,
    connector_name: str,
    path: str,
    fmt: JSONFormat | CSVFormat
):
    """
    Associates the specified URL as input source for the specified table in Feldera.
    Feldera will make a GET request to the specified URL to read the data and populate the table.

    :param table_name: The name of the table.
    :param connector_name: The unique name for this connector.
    :param path: The URL to read the data from.
    :param fmt: The format of the data in the URL.
    """

    validate_connector_input_format(fmt)

    # Transport section of the connector config: an HTTP GET source.
    transport_config = {
        "name": "url_input",
        "config": {"path": path},
    }

    connector = Connector(
        name=connector_name,
        config={
            "transport": transport_config,
            "format": fmt.to_dict(),
        }
    )

    # Queue the connector under its table until the pipeline is assembled.
    self.input_connectors_buffer.setdefault(table_name, []).append(connector)

def run_to_completion(self):
"""
.. _run_to_completion:
Expand Down
27 changes: 25 additions & 2 deletions python/tests/test_wireframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, UpdateFormat
from feldera.formats import JSONFormat, JSONUpdateFormat
from tests import TEST_CLIENT


Expand Down Expand Up @@ -160,7 +160,7 @@ def test_kafka(self):
"auto.offset.reset": "earliest",
}

kafka_format = JSONFormat().with_update_format(UpdateFormat.InsertDelete).with_array(False)
kafka_format = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)

sink_config = {
"topic": OUTPUT_TOPIC,
Expand All @@ -178,6 +178,29 @@ def test_kafka(self):
df = out.to_pandas()
assert df.shape[0] != 0

def test_http_get(self):
    """Ingests NDJSON from a URL via the HTTP GET connector and checks the row count."""
    sql = SQLContext("test_http_get", TEST_CLIENT).get_or_create()

    table_name = "items"
    view_name = "s"

    sql.register_table(table_name, SQLSchema({"id": "INT", "name": "STRING"}))
    sql.register_view(view_name, f"SELECT * FROM {table_name}")

    url = "https://feldera-basics-tutorial.s3.amazonaws.com/part.json"
    fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
    sql.connect_source_url(table_name, "part", url, fmt)

    out = sql.listen(view_name)
    sql.run_to_completion()
    df = out.to_pandas()

    # The tutorial file part.json contains exactly 3 records.
    assert df.shape[0] == 3


if __name__ == '__main__':
unittest.main()