Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [Python] Added support for Kafka connector via Python SDK
([#1807](https://github.com/feldera/feldera/pull/1807))
- [Python] Added support for HTTP GET connector via Python SDK

### Added

Expand Down
1 change: 1 addition & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ docs/feldera.rst
docs/feldera.rest.rst
build
feldera.egg-info
UNKNOWN.egg-info
55 changes: 50 additions & 5 deletions python/docs/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ Kafka
To setup Kafka as the source use :meth:`.SQLContext.connect_source_kafka` and as the sink use
:meth:`.SQLContext.connect_sink_kafka`.

Both of these methods require a ``config`` which is a key-value pair.
Both of these methods require a ``config`` which is a dictionary, and ``fmt`` which is a
`data format configuration <https://www.feldera.com/docs/api/json>`_ that is either a
:class:`.JSONFormat` or :class:`.CSVFormat`.

The input config looks like the following:

Expand Down Expand Up @@ -119,7 +121,7 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
.. code-block:: python

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, UpdateFormat
from feldera.formats import JSONFormat, JSONUpdateFormat

TABLE_NAME = "example"
VIEW_NAME = "example_count"
Expand All @@ -141,13 +143,56 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
"auto.offset.reset": "earliest",
}

kafka_format = JSONFormat().with_update_format(UpdateFormat.InsertDelete).with_array(False)
# Data format configuration
format = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)

sql.connect_source_kafka(TABLE_NAME, "kafka_conn_in", source_config, kafka_format)
sql.connect_sink_kafka(VIEW_NAME, "kafka_conn_out", sink_config, kafka_format)
sql.connect_source_kafka(TABLE_NAME, "kafka_conn_in", source_config, format)
sql.connect_sink_kafka(VIEW_NAME, "kafka_conn_out", sink_config, format)

out = sql.listen(VIEW_NAME)
sql.start()
time.sleep(10)
sql.shutdown()
df = out.to_pandas()


HTTP GET
*********


Feldera can ingest data from a user-provided URL into a SQL table.
The file is fetched using HTTP with the GET method.

More on the HTTP GET connector at: https://www.feldera.com/docs/connectors/sources/http-get

.. note::
   JSON input ingested by Feldera must be in
   `newline-delimited JSON (NDJSON) format <https://www.feldera.com/docs/api/json/#encoding-multiple-changes>`_.


.. highlight:: python
.. code-block:: python

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, JSONUpdateFormat

sql = SQLContext("test_http_get", TEST_CLIENT).get_or_create()

TBL_NAME = "items"
VIEW_NAME = "s"

sql.register_table(TBL_NAME, SQLSchema({"id": "INT", "name": "STRING"}))

sql.register_view(VIEW_NAME, f"SELECT * FROM {TBL_NAME}")

path = "https://feldera-basics-tutorial.s3.amazonaws.com/part.json"

fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
sql.connect_source_url(TBL_NAME, "part", path, fmt)

out = sql.listen(VIEW_NAME)

sql.run_to_completion()

df = out.to_pandas()

9 changes: 9 additions & 0 deletions python/feldera/_helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from feldera.formats import JSONFormat, CSVFormat


def dataframe_from_response(buffer: list[list[dict]]):
Expand All @@ -9,3 +10,11 @@ def dataframe_from_response(buffer: list[list[dict]]):
{**item['insert'], 'insert_delete': 1} if 'insert' in item else {**item['delete'], 'insert_delete': -1}
for sublist in buffer for item in sublist
])


def validate_connector_input_format(fmt: JSONFormat | CSVFormat):
    """
    Validates that ``fmt`` is a usable connector data-format configuration.

    :param fmt: The format object supplied to a connector
        (:class:`JSONFormat` or :class:`CSVFormat`).
    :raises ValueError: If ``fmt`` is not a :class:`JSONFormat` or
        :class:`CSVFormat`, or if a :class:`JSONFormat` has no update
        format configured.
    """

    # isinstance accepts a tuple of types; one check instead of two.
    if not isinstance(fmt, (JSONFormat, CSVFormat)):
        raise ValueError("format must be JSONFormat or CSVFormat")

    # A JSON connector config cannot be interpreted without an update format.
    if isinstance(fmt, JSONFormat) and fmt.config.get("update_format") is None:
        raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
42 changes: 34 additions & 8 deletions python/feldera/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,45 @@
from enum import Enum


class JSONUpdateFormat(Enum):
    """
    Supported JSON data change event formats.

    Each element in a JSON-formatted input stream describes an update to
    one or more records in an input table; the members of this enum select
    how such updates are represented.

    https://www.feldera.com/docs/api/json/#the-insertdelete-format
    """

    InsertDelete = 1
    """
    Insert/delete format.

    Every stream element carries an "insert" or "delete" command together
    with the record to be inserted into, or deleted from, the input table.

    Example: `{"insert": {"id": 1, "name": "Alice"}, "delete": {"id": 2, "name": "Bob"}}`
    Here, `id` and `name` are the columns in the table.
    """

    Raw = 2
    """
    Raw input format.

    Suitable for insert-only streams (no deletions): every stream element
    is a bare record, without any envelope, that gets inserted into the
    input table.

    Example: `{"id": 1, "name": "Alice"}`
    Here, `id` and `name` are the columns in the table.
    """

    def __str__(self):
        # Map each member to the wire-format name Feldera expects.
        wire_names = {
            JSONUpdateFormat.InsertDelete: "insert_delete",
            JSONUpdateFormat.Raw: "raw",
        }
        return wire_names[self]


Expand All @@ -41,15 +51,31 @@ class JSONFormat:
"""

def __init__(self, config: Optional[dict] = None):
"""
Creates a new JSONFormat instance.

:param config: Optional. Configuration for the JSON format.
"""

self.config: dict = config or {
"array": False,
}

def with_update_format(self, update_format: JSONUpdateFormat) -> Self:
    """
    Specifies the format of the data change events in the JSON data stream.

    :param update_format: The :class:`JSONUpdateFormat` to use for this stream.
    :return: This instance, to allow method chaining.
    """

    # str(x) is the idiomatic spelling of x.__str__().
    self.config["update_format"] = str(update_format)
    return self

def with_array(self, array: bool) -> Self:
"""
Set to `True` if updates in this stream are packaged into JSON arrays.

Example: `[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]`
"""

self.config["array"] = array
return self

Expand Down
60 changes: 49 additions & 11 deletions python/feldera/sql_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from feldera.output_handler import OutputHandler
from feldera._callback_runner import CallbackRunner, _CallbackRunnerInstruction
from feldera.formats import JSONFormat, CSVFormat
from feldera._helpers import validate_connector_input_format
from enum import Enum


Expand Down Expand Up @@ -407,10 +408,7 @@ def connect_source_kafka(
if config.get("topics") is None:
raise ValueError("topics is required in the config")

fmt = fmt.to_dict()

if fmt.get("config").get("update_format") is None:
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
validate_connector_input_format(fmt)

connector = Connector(
name=connector_name,
Expand All @@ -419,7 +417,7 @@ def connect_source_kafka(
"name": "kafka_input",
"config": config,
},
"format": fmt,
"format": fmt.to_dict(),
}
)

Expand All @@ -428,7 +426,13 @@ def connect_source_kafka(
else:
self.input_connectors_buffer[table_name] = [connector]

def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict, fmt: JSONFormat | CSVFormat):
def connect_sink_kafka(
self,
view_name: str,
connector_name: str,
config: dict,
fmt: JSONFormat | CSVFormat
):
"""
Associates the specified kafka topic on the specified Kafka server as output sink for the specified view in
Feldera. The topic is populated with changes in the specified view.
Expand All @@ -445,10 +449,7 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
if config.get("topic") is None:
raise ValueError("topic is required in the config")

fmt = fmt.to_dict()

if fmt.get("config").get("update_format") is None:
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
validate_connector_input_format(fmt)

connector = Connector(
name=connector_name,
Expand All @@ -457,7 +458,7 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
"name": "kafka_output",
"config": config,
},
"format": fmt,
"format": fmt.to_dict(),
}
)

Expand All @@ -466,6 +467,43 @@ def connect_sink_kafka(self, view_name: str, connector_name: str, config: dict,
else:
self.output_connectors_buffer[view_name] = [connector]

def connect_source_url(
    self,
    table_name: str,
    connector_name: str,
    path: str,
    fmt: JSONFormat | CSVFormat
):
    """
    Associates the specified URL as input source for the specified table in Feldera.
    Feldera will make a GET request to the specified URL to read the data and populate the table.

    :param table_name: The name of the table.
    :param connector_name: The unique name for this connector.
    :param path: The URL to read the data from.
    :param fmt: The format of the data in the URL.
    """

    validate_connector_input_format(fmt)

    # Transport section of the connector config: an HTTP GET source.
    transport_config = {
        "name": "url_input",
        "config": {"path": path},
    }

    connector = Connector(
        name=connector_name,
        config={
            "transport": transport_config,
            "format": fmt.to_dict(),
        }
    )

    # Queue the connector under its table until the pipeline is assembled.
    self.input_connectors_buffer.setdefault(table_name, []).append(connector)

def run_to_completion(self):
"""
.. _run_to_completion:
Expand Down
27 changes: 25 additions & 2 deletions python/tests/test_wireframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd

from feldera import SQLContext, SQLSchema
from feldera.formats import JSONFormat, UpdateFormat
from feldera.formats import JSONFormat, JSONUpdateFormat
from tests import TEST_CLIENT


Expand Down Expand Up @@ -160,7 +160,7 @@ def test_kafka(self):
"auto.offset.reset": "earliest",
}

kafka_format = JSONFormat().with_update_format(UpdateFormat.InsertDelete).with_array(False)
kafka_format = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)

sink_config = {
"topic": OUTPUT_TOPIC,
Expand All @@ -178,6 +178,29 @@ def test_kafka(self):
df = out.to_pandas()
assert df.shape[0] != 0

def test_http_get(self):
    """Ingests NDJSON from a URL via the HTTP GET connector and checks the row count."""
    sql = SQLContext("test_http_get", TEST_CLIENT).get_or_create()

    table_name = "items"
    view_name = "s"

    sql.register_table(table_name, SQLSchema({"id": "INT", "name": "STRING"}))
    sql.register_view(view_name, f"SELECT * FROM {table_name}")

    url = "https://feldera-basics-tutorial.s3.amazonaws.com/part.json"
    fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
    sql.connect_source_url(table_name, "part", url, fmt)

    out = sql.listen(view_name)
    sql.run_to_completion()
    df = out.to_pandas()

    # The tutorial file part.json contains exactly 3 records.
    assert df.shape[0] == 3


if __name__ == '__main__':
unittest.main()