forked from feldera/feldera
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_helpers.py
More file actions
100 lines (80 loc) · 2.51 KB
/
_helpers.py
File metadata and controls
100 lines (80 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
from decimal import Decimal
def sql_type_to_pandas_type(sql_type: str):
    """
    Map a SQL type name to the matching pandas dtype string.

    The lookup is case-insensitive. SQL types listed with ``None`` below have
    no pandas dtype assigned here, and any type name that is not listed at all
    also yields ``None``.

    :param sql_type: Name of the SQL type, e.g. ``'INTEGER'`` or ``'varchar'``.
    :return: The pandas dtype string, or ``None`` when there is none.
    """
    dtype_by_sql_type = {
        'BOOLEAN': 'boolean',
        'TINYINT': 'Int8',
        'SMALLINT': 'Int16',
        'INTEGER': 'Int32',
        'BIGINT': 'Int64',
        'REAL': 'Float32',
        'DOUBLE': 'Float64',
        'DECIMAL': None,
        'CHAR': 'str',
        'VARCHAR': 'str',
        'DATE': 'datetime64[ns]',
        'TIMESTAMP': 'datetime64[ns]',
        'TIME': 'timedelta64[ns]',
        'INTERVAL': 'timedelta64[ns]',
        'ARRAY': None,
        'NULL': None,
        'BINARY': None,
        'VARBINARY': None,
        'STRUCT': None,
        'MAP': None,
    }
    normalized = sql_type.upper()
    # dict.get falls back to None, which matches the implicit result for
    # type names that are not in the table.
    return dtype_by_sql_type.get(normalized)
def ensure_dataframe_has_columns(df: pd.DataFrame):
    """
    Verify that the DataFrame carries explicit column names.

    The column labels are compared against the positional sequence
    ``0, 1, ..., n - 1`` (``n`` being the number of columns); when they are
    equal, the frame is treated as having no column names.

    :param df: The DataFrame to check.
    :raises ValueError: If the column labels equal the positional sequence.
    """
    positional_labels = list(range(df.shape[1]))
    if positional_labels != list(df.columns):
        return

    # The message text is kept exactly as-is, so its lines stay unindented
    # inside the literal.
    raise ValueError(
        """
DataFrame has no column names set.
Input DataFrame must have column names set and they must be consistent with the columns in the input table.
"""
    )
def dataframe_from_response(buffer: list[list[dict]], schema: dict) -> pd.DataFrame:
    """
    Converts the response from Feldera to a pandas DataFrame.

    :param buffer: A list of batches. Every batch is a list of dicts, each
        holding one row under either an ``'insert'`` or a ``'delete'`` key.
    :param schema: A dict with a ``'fields'`` list. Each field provides its
        ``'name'`` and its SQL type name under ``['columntype']['type']``.
    :return: A DataFrame with one row per item of ``buffer``, cast to the
        dtypes derived from ``schema``, plus an ``'insert_delete'`` column
        holding ``1`` for inserted rows and ``-1`` for deleted rows. When
        ``buffer`` holds no rows, the result is an empty DataFrame that still
        has the schema columns and ``'insert_delete'``.
    """
    pd_schema = {}
    decimal_cols = []

    for column in schema['fields']:
        column_name = column['name']
        column_type = column['columntype']['type']

        # Compare case-insensitively so that this check agrees with
        # sql_type_to_pandas_type, which upper-cases the type name as well.
        if column_type.upper() == 'DECIMAL':
            decimal_cols.append(column_name)

        pd_schema[column_name] = sql_type_to_pandas_type(column_type)

    data = [
        {**item['insert'], 'insert_delete': 1} if 'insert' in item else {**item['delete'], 'insert_delete': -1}
        for sublist in buffer for item in sublist
    ]

    if not data:
        # A DataFrame built from an empty list has no columns, so casting it
        # with the schema would raise KeyError for every schema column.
        # Build the empty frame with the expected columns instead.
        known_dtypes = {name: dtype for name, dtype in pd_schema.items() if dtype is not None}
        empty = pd.DataFrame(columns=[*pd_schema, 'insert_delete'])
        return empty.astype({**known_dtypes, 'insert_delete': 'int64'})

    # DECIMAL columns have no pandas dtype in the mapping, so their values are
    # converted to decimal.Decimal by hand; None values are left untouched.
    # The row dicts were created above, so the caller's buffer is not mutated.
    if decimal_cols:
        for datum in data:
            for col in decimal_cols:
                if datum[col] is not None:
                    datum[col] = Decimal(datum[col])

    df = pd.DataFrame(data)
    df = df.astype(pd_schema)

    return df
def chunk_dataframe(df, chunk_size=1000):
    """
    Yield successive chunks of at most ``chunk_size`` rows from the given dataframe.

    The chunks are positional slices taken in order, so the last chunk may be
    shorter than ``chunk_size``. An empty dataframe yields no chunks.

    :param df: The dataframe to split.
    :param chunk_size: Maximum number of rows per chunk; must be positive.
    :raises ValueError: If ``chunk_size`` is zero or negative. Because this is
        a generator, the error is raised when iteration starts.
    """
    # A negative step would make range() empty and silently drop every row,
    # so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    for start in range(0, len(df), chunk_size):
        yield df.iloc[start:start + chunk_size]