forked from tableau/document-api-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.py
More file actions
299 lines (257 loc) · 12.6 KB
/
query.py
File metadata and controls
299 lines (257 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
import json
import re
from collections import deque

import numpy as np
import pandas as pd

from tableaudocumentapi.utils import _clean_aggregated_column_names
class Query(object):
    """Query interface over the parsed elements of a Tableau Workbook."""

    def __init__(self, workbook):
        # Keep the parsed XML root and the workbook handle, then build the
        # worksheet -> dashboards lookup once so later queries reuse it.
        self._xml = workbook._workbookRoot
        self._workbook = workbook
        self._worksheet_dashboard_map = self._get_worksheet_dashboard_map()
def _get_worksheet_dashboard_map(self):
worksheet_dashboard_map = {}
for this_worksheet in self._workbook.worksheet_objects:
worksheet_dashboard_map[this_worksheet] = []
for dashboard, worksheets_in_dashboard in self._workbook.dashboard_objects.items():
if this_worksheet in worksheets_in_dashboard.worksheets:
worksheet_dashboard_map[this_worksheet].append(dashboard)
return worksheet_dashboard_map
def get_worksheet_dependencies(self):
worksheet_dependencies = []
for worksheet in self._workbook.worksheet_objects.values():
for dependency in worksheet.datasource_dependencies:
for column_instance in dependency.column_instances.values():
worksheet_dependencies.append({
"Worksheet":worksheet.name,
"Datasource":dependency.datasource,
"Columns":dependency.columns,
"Column_instance":column_instance.get('column'),
"Column_instance_Derivation":column_instance.get('derivation'),
"Column_instance_Name":column_instance.get('name'),
"Column_instance_Pivot":column_instance.get('pivot'),
"Column_instance_Type":column_instance.get('type')
})
return worksheet_dependencies
def normalize_groupfilter(self, filter_json):
if not filter_json:
return []
# Ensure list-of-dicts structure; explode safely
df = pd.json_normalize(filter_json)
if 'children' in df.columns:
df = df.explode('children', ignore_index=True)
return df.to_dict(orient="records")
def normalize_groupfilter(self, filter_json):
if not filter_json:
return []
# Ensure list input
queue = filter_json if isinstance(filter_json, list) else [filter_json]
out = []
stack = [(None, 0, node) for node in queue] # (parent_idx, depth, node)
while stack:
parent_idx, depth, node = stack.pop(0)
if not isinstance(node, dict):
continue
rec = {
'function': node.get('function'),
'level': node.get('level'),
'member': node.get('member'),
'depth': depth,
'parent_index': parent_idx
}
attrs = node.get('attributes') or {}
if isinstance(attrs, dict):
for k, v in attrs.items():
rec[f'attributes.{k}'] = v
# remember our index before appending children
this_index = len(out)
out.append(rec)
for child in (node.get('children') or []):
stack.append((this_index, depth + 1, child))
# flatten attributes.* keys into columns
df = pd.json_normalize(out, sep='.')
return df.to_dict(orient='records')
def normalize_worksheet_filters(self, worksheet_filters):
for wf in worksheet_filters:
gf = wf.get('Groupfilters') or []
wf['normalized_groupfilter'] = self.normalize_groupfilter(gf) if gf else []
# Build frame even if some rows have empty normalized lists
wf2 = pd.DataFrame(worksheet_filters)
# Explode safely (empty lists will repeat row once)
wf2 = wf2.explode('normalized_groupfilter', ignore_index=True)
wf_norm = pd.json_normalize(wf2['normalized_groupfilter'])
wf_norm = wf_norm.add_prefix('groupfilter_')
return wf2.join(wf_norm)
def get_worksheet_filters(self):
worksheet_filters = []
for worksheet in self._workbook.worksheet_objects.values():
for f in worksheet.filters:
worksheet_filters.append({
"Worksheet": worksheet.name,
"Filter_class": f.filter_class,
"Datasource": f.datasource,
"Column": f.column,
"Groupfilters": f.groupfilters
})
return self.normalize_worksheet_filters(worksheet_filters)
def get_worksheet_rows(self):
out = []
for ws in self._workbook.worksheet_objects.values():
for r in ws.rows:
ds_name, row_name = _clean_aggregated_column_names(r) or (None, None)
out.append({"Worksheet": ws.name, "Datasource": ds_name, "Row": row_name})
return out
def get_worksheet_cols(self):
out = []
for ws in self._workbook.worksheet_objects.values():
for c in ws.cols:
ds_name, col_name = _clean_aggregated_column_names(c) or (None, None)
out.append({"Worksheet": ws.name, "Datasource": ds_name, "Col": col_name})
return out
def get_field_objects(self, column, datasource_name = None):
"""Link filter column or worksheets rows/cols to actual Field object from datasource"""
if not isinstance(column, str) or not column:
return None
if datasource_name is None:
# Extract field name from column reference
# '[federated.xxx].[Table]' -> 'Table'
result = re.split(r'(?<=\])\.(?=\[)', column)
if len(result) == 2:
datasource_name = result[0][1:-1]
field_name = result[1]
else:
return None
else:
field_name = column
# Find matching field in datasources
ds = getattr(self._workbook, '_datasource_index', {}).get(datasource_name)
if not ds:
return None
return ds.fields.get(field_name)
def get_workbook_parameters(self):
"""Get all Parameters in workbook and their attributes as a list of dictionaries"""
workbook_parameters = []
for datasource in self._workbook.datasources:
if datasource.name == "Parameters":
for field in datasource.fields:
workbook_parameters.append({
"Alias": datasource.fields[field].alias,
"Aliases": datasource.fields[field].aliases,
"Calculation": datasource.fields[field].calculation,
"Caption": datasource.fields[field].caption,
"Datatype": datasource.fields[field].datatype,
"Name": datasource.fields[field].name,
"Parameter_Domain_Type": datasource.fields[field].param_domain_type,
"Role": datasource.fields[field].role,
"Type": datasource.fields[field].type,
"Value": datasource.fields[field].value,
"Worksheets": datasource.fields[field].worksheets,
"Members": datasource.fields[field].members
})
return workbook_parameters
def get_workbook_fields(self):
"""All non-parameter fields (+ attributes)."""
field_attrs = [
'alias','aliases','calculation','caption','datatype','default_aggregation',
'description','hidden','id','is_nominal','is_ordinal','is_quantitative',
'members','name','param_domain_type','role','table','type','value','worksheets'
]
rows = []
for ds in self._workbook.datasources:
if ds.name == "Parameters":
continue
for key, field in ds.fields.items():
if isinstance(key, str) and key.startswith('[') and key.endswith(']'):
row = {
'datasource': ds.name,
'datasource_caption': getattr(ds, 'caption', None),
'field_key': key,
}
for attr in field_attrs:
row[attr] = getattr(field, attr, None)
rows.append(row)
return rows
    def get_workbook_metadata_table(self):
        """Assemble one wide metadata DataFrame for the whole workbook.

        Starts from the worksheet/datasource/column-instance dependency
        records and left-merges in, when non-empty:
          - the worksheet -> dashboards map (cached in __init__),
          - worksheet filters    (columns prefixed '_filter_'),
          - rows-shelf usage     (prefixed '_rows_'),
          - cols-shelf usage     (prefixed '_cols_'),
          - datasource fields    (prefixed '_fields_').

        Returns the merged pandas DataFrame.
        """
        df_dep = pd.DataFrame(self.get_worksheet_dependencies())
        # use cached map, and stabilize order for diff-friendliness
        ws_db_map = self._worksheet_dashboard_map
        df_wsdb = pd.DataFrame({
            "Worksheet": list(ws_db_map.keys()),
            "Dashboard": [sorted(v) for v in ws_db_map.values()]
        })
        df = df_dep.merge(df_wsdb, how='left', on='Worksheet')
        # Each optional merge below keys on (Datasource, Column_instance,
        # Worksheet) against the prefixed equivalents; empty frames are
        # skipped so merge() does not fail on missing columns.
        df_filters = pd.DataFrame(self.get_worksheet_filters())
        if not df_filters.empty:
            df = df.merge(df_filters.add_prefix('_filter_'),
                left_on=['Datasource','Column_instance','Worksheet'],
                right_on=['_filter_Datasource','_filter_Column','_filter_Worksheet'],
                how='left')
        df_rows = pd.DataFrame(self.get_worksheet_rows())
        if not df_rows.empty:
            df = df.merge(df_rows.add_prefix('_rows_'),
                left_on=['Datasource','Column_instance','Worksheet'],
                right_on=['_rows_Datasource','_rows_Row','_rows_Worksheet'],
                how='left')
        df_cols = pd.DataFrame(self.get_worksheet_cols())
        if not df_cols.empty:
            df = df.merge(df_cols.add_prefix('_cols_'),
                left_on=['Datasource','Column_instance','Worksheet'],
                right_on=['_cols_Datasource','_cols_Col','_cols_Worksheet'],
                how='left')
        # Fields merge keys only on (Datasource, Column_instance): field
        # definitions are per-datasource, not per-worksheet.
        df_fields = pd.DataFrame(self.get_workbook_fields())
        if not df_fields.empty:
            df = df.merge(df_fields.add_prefix('_fields_'),
                left_on=['Datasource','Column_instance'],
                right_on=['_fields_datasource','_fields_field_key'],
                how='left')
        return df
@staticmethod
def json_safe_dataframe(df):
"""Convert columns containing dicts/lists/ndarrays to JSON strings (leaves scalars alone)."""
df = df.copy()
def to_json_if_needed(x):
# Convert JSON-serializable containers
if isinstance(x, (dict, list)):
return json.dumps(x, ensure_ascii=False)
# Convert numpy arrays to lists first
if isinstance(x, np.ndarray):
return json.dumps(x.tolist(), ensure_ascii=False)
# Leave everything else (including None/NaN) unchanged
return x
for col in df.columns:
if df[col].apply(lambda v: isinstance(v, (dict, list, np.ndarray))).any():
df[col] = df[col].apply(to_json_if_needed)
return df
@staticmethod
def compare_workbooks(wb1_filename, wb2_filename, wb1_twb_string=None, wb2_twb_string=None):
from tableaudocumentapi import Workbook
if wb1_twb_string:
d1 = Workbook(twb_xml_string=wb1_twb_string).query.get_workbook_metadata_table()
else:
d1 = Workbook(wb1_filename).query.get_workbook_metadata_table()
if wb2_twb_string:
d2 = Workbook(twb_xml_string=wb2_twb_string).query.get_workbook_metadata_table()
else:
d2 = Workbook(wb2_filename).query.get_workbook_metadata_table()
d1 = Query.json_safe_dataframe(d1)
d2 = Query.json_safe_dataframe(d2)
# Partition columns
def split_cols(cols):
id_cols = [c for c in cols if isinstance(c, str) and c and c[0] != '_']
val_cols = [c for c in cols if isinstance(c, str) and c and c[0] == '_']
# If nothing qualifies as id, keep at least 'Worksheet' and 'Datasource' if present
if not id_cols:
id_cols = [c for c in ['Worksheet','Datasource','Column_instance'] if c in cols]
return id_cols, val_cols
id1, val1 = split_cols(d1.columns)
id2, val2 = split_cols(d2.columns)
d1m = d1.melt(id_vars=id1, value_vars=val1) if val1 else d1
d2m = d2.melt(id_vars=id2, value_vars=val2) if val2 else d2
out = pd.merge(d1m, d2m, how='outer', indicator=True)
out = out.rename(columns={'_merge': 'Workbook_Source'})
out['Workbook_Source'] = out['Workbook_Source'].map({
'left_only': 'wb1',
'right_only': 'wb2',
'both': 'both'
})
return out.drop_duplicates()