Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions crates/pipeline-types/src/format/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ pub enum JsonFlavor {
/// JSON format accepted by the Kafka Connect `JsonConverter` class.
#[serde(rename = "kafka_connect_json_converter")]
KafkaConnectJsonConverter,
#[serde(rename = "pandas")]
Pandas,
/// Parquet to-json format.
/// (For internal use only)
#[serde(skip)]
Expand Down
6 changes: 6 additions & 0 deletions crates/pipeline-types/src/serde_with_context/serde_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ impl From<JsonFlavor> for SqlSerdeConfig {
timestamp_format: TimestampFormat::String("%Y-%m-%dT%H:%M:%S%.f%:z"),
decimal_format: DecimalFormat::String,
},
JsonFlavor::Pandas => Self {
time_format: TimeFormat::String("%H:%M:%S%.f"),
date_format: DateFormat::String("%Y-%m-%d"),
timestamp_format: TimestampFormat::MillisSinceEpoch,
decimal_format: DecimalFormat::String,
},
JsonFlavor::ParquetConverter => Self {
time_format: TimeFormat::Nanos,
date_format: DateFormat::String("%Y-%m-%d"),
Expand Down
3 changes: 2 additions & 1 deletion openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -3790,7 +3790,8 @@
"default",
"debezium_mysql",
"snowflake",
"kafka_connect_json_converter"
"kafka_connect_json_converter",
"pandas"
]
},
"JsonParserConfig": {
Expand Down
10 changes: 9 additions & 1 deletion python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,12 @@ sphinx-apidoc -o . ../feldera
make html
```

To clean the build, run `make clean`.
To clean the build, run `make clean`.

## Testing

To run the Python unit tests:

```bash
cd python
python3 -m unittest
```
9 changes: 9 additions & 0 deletions python/feldera/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ def validate_connector_input_format(fmt: Format):

if isinstance(fmt, JSONFormat) and fmt.config.get("update_format") is None:
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")


def chunk_dataframe(df, chunk_size=1000):
    """
    Yield successive ``chunk_size``-row chunks of the given dataframe.

    :param df: the pandas DataFrame to split
    :param chunk_size: maximum number of rows per chunk (default: 1000)
    :raises ValueError: if chunk_size is not a positive integer
    """

    # range() would raise a confusing error for step <= 0; fail loudly with
    # a clear message instead.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # .iloc slicing clamps a stop index past the end of the frame instead of
    # raising, so the final chunk is simply shorter than chunk_size.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start:start + chunk_size]
8 changes: 6 additions & 2 deletions python/feldera/output_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, qu

# the callback that is passed to the `CallbackRunner`
def callback(df: pd.DataFrame, _: int):
self.buffer.append(df)
if not df.empty:
self.buffer.append(df)

# sets up the callback runner
self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)
Expand All @@ -38,4 +39,7 @@ def to_pandas(self):
"""

self.handler.join()
return pd.concat(self.buffer)

if len(self.buffer) == 0:
return pd.DataFrame()
return pd.concat(self.buffer, ignore_index=True)
18 changes: 16 additions & 2 deletions python/feldera/rest/_httprequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from typing import Callable, Optional, Any, Union, Mapping, Sequence, List


def json_serialize(body: Any) -> str:
    """
    Serialize a request body to a JSON string for the HTTP layer.

    An empty string is passed through unchanged (it means "no body");
    every other value — including falsy ones like ``0``, ``False``, ``[]``
    and ``None`` — is encoded with :func:`json.dumps` (``None`` becomes
    ``"null"``).

    :param body: the request payload to serialize
    :return: the JSON-encoded payload, or ``""`` for an empty-string body
    """
    # The previous chained ternary returned "null" for ANY falsy body
    # (0, False, [], {}), silently corrupting those payloads; only the
    # empty string is special-cased now.
    if body == "":
        return ""
    return json.dumps(body)


class HttpRequests:
def __init__(self, config: Config) -> None:
self.config = config
Expand All @@ -28,6 +32,7 @@ def send_request(
content_type: str = "application/json",
params: Optional[Mapping[str, Any]] = None,
stream: bool = False,
serialize: bool = True,
) -> Any:
"""
:param http_method: The HTTP method to use. Takes the equivalent `requests.*` module. (Example: `requests.get`)
Expand All @@ -36,6 +41,7 @@ def send_request(
:param content_type: The value for `Content-Type` HTTP header. "application/json" by default.
:param params: The query parameters part of this request.
:param stream: True if the response is expected to be a HTTP stream.
:param serialize: True if the body needs to be serialized to JSON.
"""
self.headers["Content-Type"] = content_type

Expand Down Expand Up @@ -71,7 +77,7 @@ def send_request(
request_path,
timeout=timeout,
headers=headers,
data=json.dumps(body) if body else "" if body == "" else "null",
data=json_serialize(body) if serialize else body,
params=params,
stream=stream,
)
Expand Down Expand Up @@ -102,8 +108,16 @@ def post(
content_type: Optional[str] = "application/json",
params: Optional[Mapping[str, Any]] = None,
stream: bool = False,
serialize: bool = True,
) -> Any:
return self.send_request(requests.post, path, body, content_type, params, stream=stream)
return self.send_request(
requests.post,
path,
body,
content_type,
params, stream=stream,
serialize=serialize
)

def patch(
self,
Expand Down
23 changes: 20 additions & 3 deletions python/feldera/rest/feldera_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,19 @@ def __init__(
"""
:param url: The url to Feldera API (ex: https://try.feldera.com)
:param api_key: The optional API key for Feldera
:param timeout: (optional) The amount of time in seconds that the cient will wait for a response beforing timing
:param timeout: (optional) The amount of time in seconds that the client will wait for a response before timing
out.
"""

self.config = Config(url, api_key, timeout)
self.http = HttpRequests(self.config)

try:
self.programs()
except Exception as e:
logging.error(f"Failed to connect to Feldera API: {e}")
raise e

def programs(self) -> list[Program]:
"""
Get all programs
Expand Down Expand Up @@ -381,6 +387,8 @@ def push_to_pipeline(
array: bool = False,
force: bool = False,
update_format: str = "raw",
json_flavor: str = None,
serialize: bool = True,
):
"""
Insert data into a pipeline
Expand All @@ -394,8 +402,10 @@ def push_to_pipeline(
:param force: If True, the data will be inserted even if the pipeline is paused
:param update_format: JSON data change event format, used in conjunction with the "json" format,
the default value is "insert_delete", other supported formats: "weighted", "debezium", "snowflake", "raw"

:param json_flavor: JSON encoding used for individual table records, the default value is "default", other supported encodings:
"debezium_mysql", "snowflake", "kafka_connect_json_converter", "pandas"
:param data: The data to insert
:param serialize: If True, the data will be serialized to JSON. True by default
"""

if format not in ["json", "csv"]:
Expand All @@ -404,6 +414,9 @@ def push_to_pipeline(
if update_format not in ["insert_delete", "weighted", "debezium", "snowflake", "raw"]:
raise ValueError("update_format must be one of 'insert_delete', 'weighted', 'debezium', 'snowflake', 'raw'")

if json_flavor is not None and json_flavor not in ["default", "debezium_mysql", "snowflake", "kafka_connect_json_converter", "pandas"]:
raise ValueError("json_flavor must be one of 'default', 'debezium_mysql', 'snowflake', 'kafka_connect_json_converter', 'pandas'")

# python sends `True` which isn't accepted by the backend
array = _prepare_boolean_input(array)
force = _prepare_boolean_input(force)
Expand All @@ -417,6 +430,9 @@ def push_to_pipeline(
params["array"] = array
params["update_format"] = update_format

if json_flavor is not None:
params["json_flavor"] = json_flavor

content_type = "application/json"

if format == "csv":
Expand All @@ -428,6 +444,7 @@ def push_to_pipeline(
params=params,
content_type=content_type,
body=data,
serialize=serialize,
)

def listen_to_pipeline(
Expand Down Expand Up @@ -493,4 +510,4 @@ def listen_to_pipeline(
if end and time.time() > end:
break
if chunk:
yield json.loads(chunk)
yield json.loads(chunk)
18 changes: 14 additions & 4 deletions python/feldera/sql_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import Optional, Dict, Callable

import pandas as pd
from typing_extensions import Self
from queue import Queue

Expand All @@ -18,9 +19,9 @@
from feldera._callback_runner import CallbackRunner, _CallbackRunnerInstruction
from feldera._helpers import ensure_dataframe_has_columns
from feldera.formats import JSONFormat, CSVFormat, AvroFormat
from feldera._helpers import validate_connector_input_format
from feldera.resources import Resources
from feldera.enums import BuildMode, CompilationProfile
from feldera._helpers import validate_connector_input_format, chunk_dataframe


def _table_name_from_sql(ddl: str) -> str:
Expand Down Expand Up @@ -72,7 +73,7 @@ def __init__(
# TODO: to be used for schema inference
self.todo_tables: Dict[str, Optional[SQLTable]] = {}

self.http_input_buffer: list[Dict[str, dict | list[dict] | str]] = []
self.http_input_buffer: list[Dict[str, pd.DataFrame]] = []

# buffer that stores all input connectors to be created
# this is a Mapping[table_name -> list[Connector]]
Expand Down Expand Up @@ -173,7 +174,16 @@ def __push_http_inputs(self):

for input_buffer in self.http_input_buffer:
for tbl_name, data in input_buffer.items():
self.client.push_to_pipeline(self.pipeline_name, tbl_name, "json", data, array=True)
for datum in chunk_dataframe(data):
self.client.push_to_pipeline(
self.pipeline_name,
tbl_name,
"json",
datum.to_json(orient='records', date_format='epoch'),
json_flavor='pandas',
array=True,
serialize=False
)

self.http_input_buffer.clear()

Expand Down Expand Up @@ -273,7 +283,7 @@ def connect_source_pandas(self, table_name: str, df: pandas.DataFrame):

if tbl:
# tbl.validate_schema(df) TODO: something like this would be nice
self.http_input_buffer.append({tbl.name: df.to_dict('records')})
self.http_input_buffer.append({tbl.name: df})
return

tbl = self.todo_tables.get(table_name)
Expand Down
4 changes: 2 additions & 2 deletions python/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,18 @@ def test_listen_to_pipeline(self):
name = str(uuid.uuid4())
self.test_create_pipeline(name, False)

TEST_CLIENT.start_pipeline(name)
TEST_CLIENT.pause_pipeline(name)

t1 = threading.Thread(target=self.__listener, args=(name,))
t1.start()

TEST_CLIENT.start_pipeline(name)
TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)

t1.join()

assert self.result

TEST_CLIENT.pause_pipeline(name)
TEST_CLIENT.shutdown_pipeline(name)
TEST_CLIENT.delete_pipeline(name)

Expand Down
27 changes: 26 additions & 1 deletion python/tests/test_wireframes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import time
import unittest
import pandas as pd
from kafka import KafkaProducer, KafkaConsumer, TopicPartition
from kafka import KafkaProducer, KafkaConsumer
from kafka.admin import KafkaAdminClient, NewTopic

from feldera import SQLContext, SQLSchema
Expand Down Expand Up @@ -324,6 +324,31 @@ def test_pipeline_resource_config(self):

assert TEST_CLIENT.get_pipeline(name).config["resources"] == config

def test_timestamp_pandas(self):
    """Round-trip a dataframe with TIMESTAMP values through a pipeline and
    check that all rows come back out of the view."""

    ctx = SQLContext("test_timestamp_pandas", TEST_CLIENT).get_or_create()

    table_name = "items"
    view_name = "s"

    # backend doesn't support TIMESTAMP of format: "2024-06-06T18:06:28.443"
    ctx.register_table(table_name, SQLSchema({"id": "INT", "name": "STRING", "birthdate": "TIMESTAMP"}))
    ctx.register_view(view_name, f"SELECT * FROM {table_name}")

    source = pd.DataFrame({
        "id": [1, 2, 3],
        "name": ["a", "b", "c"],
        "birthdate": [pd.Timestamp.now(), pd.Timestamp.now(), pd.Timestamp.now()],
    })
    ctx.connect_source_pandas(table_name, source)

    listener = ctx.listen(view_name)
    ctx.run_to_completion()

    result = listener.to_pandas()
    assert result.shape[0] == 3


if __name__ == '__main__':
unittest.main()