forked from feldera/feldera
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_helpers.py
More file actions
117 lines (96 loc) · 2.95 KB
/
_helpers.py
File metadata and controls
117 lines (96 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import uuid
import pandas as pd
from decimal import Decimal
# Lookup table from upper-cased SQL type name to the pandas dtype string.
# A value of None means no dtype is requested for that SQL type.
_SQL_TO_PANDAS_DTYPE = {
    "UUID": None,
    "BOOLEAN": "boolean",
    "TINYINT": "Int8",
    "SMALLINT": "Int16",
    "INTEGER": "Int32",
    "BIGINT": "Int64",
    "REAL": "Float32",
    "DOUBLE": "Float64",
    "DECIMAL": None,
    "CHAR": "str",
    "VARCHAR": "str",
    "DATE": "datetime64[ns]",
    "TIMESTAMP": "datetime64[ns]",
    "TIME": "timedelta64[ns]",
    "INTERVAL": "timedelta64[ns]",
    "ARRAY": None,
    "NULL": None,
    "BINARY": None,
    "VARBINARY": None,
    "STRUCT": None,
    "MAP": None,
}


def sql_type_to_pandas_type(sql_type: str):
    """
    Map a SQL type name to the name of the matching pandas dtype.

    The lookup ignores the case of ``sql_type``. The result is a dtype
    string such as ``"Int32"`` or ``"datetime64[ns]"``, or ``None`` when the
    type has no entry with a dtype (this includes unrecognised type names).
    """
    normalized = sql_type.upper()
    return _SQL_TO_PANDAS_DTYPE.get(normalized)
def ensure_dataframe_has_columns(df: pd.DataFrame):
    """
    Check that the DataFrame carries explicit column names.

    Raises a ValueError when the column labels are exactly the positional
    sequence 0, 1, ..., n-1, which is how an unnamed frame looks. A frame
    with zero columns matches that sequence too and is rejected as well.
    """
    column_count = df.shape[1]
    positional_labels = list(range(column_count))
    if positional_labels != list(df.columns):
        # Labels differ from the bare positions, so names were supplied.
        return

    message = (
        "\nDataFrame has no column names set.\n"
        "Input DataFrame must have column names set and they must be "
        "consistent with the columns in the input table.\n"
    )
    raise ValueError(message)
def dataframe_from_response(buffer: list[list[dict]], schema: dict):
    """
    Converts the response from Feldera to a pandas DataFrame.

    :param buffer: A list of chunks. Each chunk is a list of change records
        shaped like ``{"insert": {...}}`` or ``{"delete": {...}}``.
    :param schema: A dict with a ``"fields"`` list. Each field supplies
        ``"name"``, ``"case_sensitive"`` and ``"columntype"["type"]``.
    :return: A DataFrame with one row per change record. The extra
        ``insert_delete`` column holds ``1`` for inserts and ``-1`` for
        deletes. Columns are cast to the dtypes given by
        ``sql_type_to_pandas_type``. If ``buffer`` contains no records, an
        empty DataFrame with the schema's columns is returned.
    """
    pd_schema = {}
    # Constructors for columns whose values are converted to Python objects
    # row by row (DECIMAL -> Decimal, UUID -> uuid.UUID).
    converters = {}

    for column in schema["fields"]:
        column_name = column["name"]
        if not column["case_sensitive"]:
            column_name = column_name.lower()

        column_type = column["columntype"]["type"]
        # Compare case-insensitively, as sql_type_to_pandas_type does.
        normalized_type = column_type.upper()
        if normalized_type == "DECIMAL":
            converters[column_name] = Decimal
        elif normalized_type == "UUID":
            converters[column_name] = uuid.UUID

        pd_schema[column_name] = sql_type_to_pandas_type(column_type)

    data = [
        {**item["insert"], "insert_delete": 1}
        if "insert" in item
        else {**item["delete"], "insert_delete": -1}
        for sublist in buffer
        for item in sublist
    ]

    if converters:
        for datum in data:
            for col, convert in converters.items():
                if datum[col] is not None:
                    datum[col] = convert(datum[col])

    if data:
        df = pd.DataFrame(data)
    else:
        # With no rows, pd.DataFrame(data) would have no columns and the
        # astype() call below would raise KeyError for every schema column.
        # Build the empty columns explicitly instead.
        empty_columns = {name: pd.Series(dtype="object") for name in pd_schema}
        empty_columns["insert_delete"] = pd.Series(dtype="int64")
        df = pd.DataFrame(empty_columns)

    return df.astype(pd_schema)
def chunk_dataframe(df, chunk_size=1000):
    """
    Lazily split a dataframe into consecutive row slices.

    Each yielded slice holds at most ``chunk_size`` rows; the last one may
    be shorter. Nothing is yielded when the dataframe has no rows.
    """
    total_rows = len(df)
    for start in range(0, total_rows, chunk_size):
        stop = start + chunk_size
        yield df.iloc[start:stop]