-
Notifications
You must be signed in to change notification settings - Fork 237
Expand file tree
/
Copy pathndarray.py
More file actions
291 lines (230 loc) · 8.86 KB
/
ndarray.py
File metadata and controls
291 lines (230 loc) · 8.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
from typing import TYPE_CHECKING, Tuple, Sequence, Optional, List, Any
import numpy as np
if TYPE_CHECKING: # pragma: no cover
from docarray.typing import ArrayType
from docarray import Document, DocumentArray
def unravel(docs: Sequence['Document'], field: str) -> Optional['ArrayType']:
    """Stack ``doc.field`` of every document into one array-like object.

    The framework of the first document's value decides how the values are
    stacked (numpy, tensorflow, torch, paddle, scipy sparse or a plain
    Python list/tuple).

    :param docs: the documents to read ``field`` from
    :param field: the attribute name to collect from each document
    :return: ``None`` if ``docs`` is empty or every value is ``None``; a plain
        list of the raw values if the first value is ``None`` but some other
        value is set; otherwise the stacked array. ``None`` is also returned
        when the framework of the first value has no stacking rule here.
    :raises ValueError: if the first value is set but some other is ``None``
    """
    # Nothing to stack; avoids an IndexError on the first-element lookup.
    if len(docs) == 0:
        return None

    all_fields = [getattr(d, field) for d in docs]
    _first = all_fields[0]
    if _first is None:
        # failed to unravel, return as a list (or None when nothing is set)
        return all_fields if any(v is not None for v in all_fields) else None

    framework, _ = get_array_type(_first)
    cls_type = type(_first)

    none_idx = [idx for idx, v in enumerate(all_fields) if v is None]
    if none_idx:
        raise ValueError(
            f'Document{none_idx}.{field} is None. Can not stack into `{field}s`.'
        )

    if framework == 'python':
        # keep the container type (list or tuple) of the first value
        return cls_type(all_fields)
    if framework == 'numpy':
        return np.stack(all_fields)
    if framework == 'tensorflow':
        import tensorflow as tf

        return tf.stack(all_fields)
    if framework == 'torch':
        import torch

        return torch.stack(all_fields)
    if framework == 'paddle':
        import paddle

        return paddle.stack(all_fields)
    if framework == 'scipy':
        import scipy.sparse

        # vstack may change the sparse format; convert back to the input class
        return cls_type(scipy.sparse.vstack(all_fields))

    # No stacking rule for this framework.
    return None
def ravel(value: 'ArrayType', docs: 'DocumentArray', field: str) -> None:
    """Distribute the rows of :attr:`value` onto ``doc.field`` of each document.

    Documents and rows are paired positionally; pairing stops at the shorter
    of the two.

    :param value: the value to be set on ``doc.field``; a list/tuple is
        consumed item by item, anything else is indexed along its first axis
    :param docs: the docs to set
    :param field: the field of the doc to set
    """
    # Only scipy sparse containers expose ``getformat``.
    sparse_format = value.getformat() if hasattr(value, 'getformat') else None

    if sparse_format in {'bsr', 'coo'}:
        # BSR and COO do not implement ``[j, ...]`` in scipy. ``getrow``
        # hands the row back as CSR, so it is converted back to the original
        # format afterwards. Not very efficient, but the best available.
        converter_name = f'to{sparse_format}'
        n_rows = value.shape[0]
        for doc, row_idx in zip(docs, range(n_rows)):
            csr_row = value.getrow(row_idx)
            docs[doc.id, field] = getattr(csr_row, converter_name)()
        return

    if isinstance(value, (list, tuple)):
        for doc, item in zip(docs, value):
            docs[doc.id, field] = item
        return

    n_rows = value.shape[0]
    for doc, row_idx in zip(docs, range(n_rows)):
        docs[doc.id, field] = value[row_idx, ...]
def get_array_type(
    array: 'ArrayType', raise_error_if_not_array: bool = True
) -> Tuple[str, bool]:
    """Identify the framework of an array without importing that framework.

    Detection relies on the module path and class name of ``array``.

    :param array: any array, scipy, numpy, tf, torch, etc.
    :param raise_error_if_not_array: if set, an unrecognised input raises;
        otherwise it is reported as ``('python', False)``
    :return: a tuple where the first element represents the framework, the
        second represents if it is sparse array
    :raises TypeError: if the input is not ``None``, is not recognised, and
        ``raise_error_if_not_array`` is set
    :raises ValueError: if the input is ``None`` and
        ``raise_error_if_not_array`` is set
    """
    klass = array.__class__
    module_path = klass.__module__.split('.')
    type_name = klass.__name__

    def _lives_in(*packages: str) -> bool:
        # True when every given name is a component of the class's module path
        return all(pkg in module_path for pkg in packages)

    if isinstance(array, (list, tuple)):
        return 'python', False

    if _lives_in('numpy'):
        return 'numpy', False

    # sparse or not is irrelevant for the two docarray containers
    if _lives_in('docarray') and type_name == 'NdArray':
        return 'docarray', False
    if _lives_in('docarray_pb2') and type_name == 'NdArrayProto':
        return 'docarray_proto', False

    if _lives_in('tensorflow'):
        if type_name == 'SparseTensor':
            return 'tensorflow', True
        if type_name in ('Tensor', 'EagerTensor'):
            return 'tensorflow', False

    if _lives_in('torch') and type_name == 'Tensor':
        return 'torch', array.is_sparse

    if _lives_in('paddle') and type_name == 'Tensor':
        # Paddle does not support sparse tensor on 11/8/2021
        # https://github.com/PaddlePaddle/Paddle/issues/36697
        return 'paddle', False

    if _lives_in('scipy', 'sparse'):
        return 'scipy', True

    # Nothing matched: either fall back silently or report the problem.
    if not raise_error_if_not_array:
        return 'python', False

    if array is None:
        raise ValueError(
            f'Empty ndarray. Did you forget to set .embedding/.tensor value and now you are operating on it?'
        )
    raise TypeError(f'can not determine the array type: {module_path}.{type_name}')
def to_numpy_array(value) -> 'np.ndarray':
    """Return the value always in :class:`numpy.ndarray` regardless the framework type.

    Sparse inputs are densified first. Torch tensors are detached from the
    autograd graph and moved to CPU before conversion, because
    ``Tensor.numpy()`` refuses tensors that require grad or live on another
    device.

    :param value: an array from any supported framework, or a list/tuple
    :return: the value in :class:`numpy.ndarray`.
    """
    v = value
    framework, is_sparse = get_array_type(value)

    if is_sparse:
        if hasattr(v, 'todense'):
            # scipy sparse containers
            v = v.todense()
        elif hasattr(v, 'to_dense'):
            # torch sparse tensors
            v = v.to_dense()
        elif framework == 'tensorflow':
            import tensorflow as tf

            if isinstance(v, tf.SparseTensor):
                v = tf.sparse.to_dense(v)

    if framework == 'torch':
        # make ``.numpy()`` safe for grad-tracking and non-CPU tensors
        v = v.detach().cpu()

    if hasattr(v, 'numpy'):
        v = v.numpy()

    if framework == 'python':
        v = np.array(v)
    return v
def to_list(value) -> List[float]:
    """Convert an array-like value into a (possibly nested) Python list.

    :param value: anything accepted by :func:`to_numpy_array`
    :return: the content of ``value`` as a Python list
    :raises TypeError: if the converted value is neither an ndarray nor a list
    """
    converted = to_numpy_array(value)
    if isinstance(converted, np.ndarray):
        return converted.tolist()
    if isinstance(converted, list):
        return converted
    raise TypeError(f'{converted} can not be converted into list')
def get_array_rows(array: 'ArrayType') -> Tuple[int, int]:
    """Get the number of rows of the ndarray without importing all frameworks

    A one-dimensional input (including an empty list/tuple) counts as a
    single row.

    :param array: input array
    :return: (num_rows, ndim)
    :raises ValueError: if the array framework has no row-counting rule here

    Examples

    >>> get_array_rows([1,2,3])
    (1, 1)
    >>> get_array_rows([[1,2,3], [4,5,6]])
    (2, 2)
    >>> get_array_rows([[1,2,3], [4,5,6], [7,8,9]])
    (3, 2)
    >>> get_array_rows(np.array([[1,2,3], [4,5,6], [7,8,9]]))
    (3, 2)
    """
    array_type, _ = get_array_type(array)

    if array_type == 'python':
        # Guard the first-element peek so an empty sequence is treated like
        # any other flat sequence instead of raising IndexError.
        is_nested = len(array) > 0 and isinstance(array[0], (list, tuple))
        if is_nested:
            return len(array), 2
        return 1, 1

    if array_type in ('numpy', 'tensorflow', 'torch', 'paddle', 'scipy'):
        ndim = array.ndim
        num_rows = 1 if ndim == 1 else array.shape[0]
        return num_rows, ndim

    raise ValueError(
        f'can not determine the number of rows of array type `{array_type}`'
    )
def check_arraylike_equality(x: 'ArrayType', y: 'ArrayType'):
    """Check if two array type objects are the same with the supported frameworks.

    Two objects of different frameworks, or one sparse and one dense, are
    never equal. Frameworks without a comparison rule here compare unequal.

    :param x: first array
    :param y: second array
    :return: ``True`` if both hold the same content, ``False`` otherwise

    Examples

    >>> import numpy as np
    >>> x = np.array([[1,2,0,0,3],[1,2,0,0,3]])
    >>> check_arraylike_equality(x,x)
    True

    >>> from scipy import sparse as sp
    >>> x = sp.csr_matrix([[1,2,0,0,3],[1,2,0,0,3]])
    >>> check_arraylike_equality(x,x)
    True

    >>> import torch
    >>> x = torch.tensor([1,2,3])
    >>> check_arraylike_equality(x,x)
    True
    """
    x_type, x_is_sparse = get_array_type(x)
    y_type, y_is_sparse = get_array_type(y)

    if x_type != y_type or x_is_sparse != y_is_sparse:
        return False

    same_array = False
    if x_type == 'python':
        same_array = x == y
    elif x_type == 'numpy':
        # Numpy does not support sparse tensors
        same_array = np.array_equal(x, y)
    elif x_type == 'torch':
        import torch

        if x_is_sparse:
            # torch.equal raises NotImplementedError for sparse tensors, so
            # compare through the difference. Subtraction needs matching
            # shapes, hence the explicit shape check first.
            if x.shape != y.shape:
                same_array = False
            else:
                residual = (x - y).coalesce().values()
                same_array = bool(torch.all(residual == 0))
        else:
            same_array = torch.equal(x, y)
    elif x_type == 'scipy':
        # Not implemented in scipy this should work for all types
        # Note: you can't simply look at nonzero values because they can be in
        # different positions.
        if x.shape != y.shape:
            same_array = False
        else:
            same_array = (x != y).nnz == 0
    elif x_type == 'tensorflow':
        if x_is_sparse:
            same_array = x == y
        else:
            # Does not have equal implemented, only elementwise, therefore reduce .all is needed
            same_array = bool((x == y).numpy().all())
    elif x_type == 'paddle':
        # Paddle does not support sparse tensor on 11/8/2021
        # https://github.com/PaddlePaddle/Paddle/issues/36697
        # Does not have equal implemented, only elementwise, therefore reduce .all is needed
        same_array = bool((x == y).numpy().all())

    return same_array
def detach_tensor_if_present(x: Any) -> Any:
    """Check if input is a dense torch array and detaches the tensor from the current graph.

    A dense torch tensor is replaced by an independent copy that carries no
    autograd history; the copy keeps the dtype and device of the input.
    Every other input (including sparse torch tensors and non-array objects)
    is returned unchanged.

    :param x: any object, possibly a torch tensor
    :return: a detached copy for dense torch tensors, otherwise ``x`` itself
    """
    x_type, x_sparse = get_array_type(x, raise_error_if_not_array=False)
    if x_type == 'torch' and not x_sparse:
        # detach() drops the graph, clone() gives the result its own storage.
        # Unlike a numpy round trip this also works for tensors that are not
        # on CPU or whose dtype numpy cannot represent.
        x = x.detach().clone()
    return x