forked from docarray/docarray
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathndarray.py
More file actions
194 lines (148 loc) · 5.72 KB
/
ndarray.py
File metadata and controls
194 lines (148 loc) · 5.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
from typing import TYPE_CHECKING, Tuple, Sequence, Optional, List
import numpy as np
if TYPE_CHECKING:
from ..types import ArrayType
from .. import Document, DocumentArray
def unravel(docs: Sequence['Document'], field: str) -> Optional['ArrayType']:
    """Gather ``doc.field`` from every document and stack the values into one array.

    :param docs: the docs to read from
    :param field: the field of the doc to read
    :return: ``None`` if ``docs`` is empty or every value is ``None``; a plain list of the raw
        values if the first value is ``None`` but some other value is set; otherwise the
        values stacked with the framework of the first value. A framework that has no
        stacking rule below also yields ``None``.
    """
    if not docs:
        # nothing to stack; without this guard ``docs[0]`` would raise IndexError
        return None

    # read every value exactly once; the first one decides how to stack
    all_fields = [getattr(d, field) for d in docs]
    _first = all_fields[0]

    if _first is None:
        # failed to unravel, return as a list (or None when nothing is set at all)
        if any(_rr is not None for _rr in all_fields):
            return all_fields
        return None

    framework, is_sparse = get_array_type(_first)
    cls_type = type(_first)

    if framework == 'python':
        # rebuild the same container type (list or tuple) around the values
        return cls_type(all_fields)
    if framework == 'numpy':
        return np.stack(all_fields)
    if framework == 'tensorflow':
        import tensorflow as tf

        return tf.stack(all_fields)
    if framework == 'torch':
        import torch

        return torch.stack(all_fields)
    if framework == 'paddle':
        import paddle

        return paddle.stack(all_fields)
    if framework == 'scipy':
        import scipy.sparse

        # vstack may change the sparse format, so convert back to the type of the first value
        return cls_type(scipy.sparse.vstack(all_fields))

    # no stacking rule for this framework
    return None
def ravel(value: 'ArrayType', docs: 'DocumentArray', field: str) -> None:
    """Distribute the entries of :attr:`value` onto ``doc.field`` of each document.

    :param value: the values to assign; a list/tuple is consumed element by element,
        anything else is indexed along its first axis
    :param docs: the docs to set
    :param field: the field of the doc to set
    """
    # ``getformat`` is the scipy-sparse marker used here; everything else has no format
    sparse_format = value.getformat() if hasattr(value, 'getformat') else None

    if sparse_format in {'bsr', 'coo'}:
        # BSR and COO do not implement ``[j, ...]`` in scipy. ``getrow()`` is available but
        # implicitly yields CSR, so each row is converted back to the source format.
        # Not very efficient, but this is the best we can do.
        convert_back = f'to{sparse_format}'
        row_count = value.shape[0]
        for doc, row_idx in zip(docs, range(row_count)):
            docs[doc.id, field] = getattr(value.getrow(row_idx), convert_back)()
        return

    if isinstance(value, (list, tuple)):
        for doc, item in zip(docs, value):
            docs[doc.id, field] = item
        return

    row_count = value.shape[0]
    for doc, row_idx in zip(docs, range(row_count)):
        docs[doc.id, field] = value[row_idx, ...]
def get_array_type(array: 'ArrayType') -> Tuple[str, bool]:
    """Identify the framework an array belongs to, without importing that framework.

    The decision is made from the module path and class name of ``array`` only.

    :param array: any array, scipy, numpy, tf, torch, etc.
    :return: a tuple ``(framework, is_sparse)``: the framework name and whether the array is sparse
    :raises TypeError: if none of the known module/class combinations matches
    """
    array_cls = array.__class__
    module_tags = array_cls.__module__.split('.')
    class_name = array_cls.__name__

    if isinstance(array, (list, tuple)):
        return 'python', False

    if 'numpy' in module_tags:
        return 'numpy', False

    # docarray's own containers: sparse or not is irrelevant for them
    if 'docarray' in module_tags and class_name == 'NdArray':
        return 'docarray', False
    if 'docarray_pb2' in module_tags and class_name == 'NdArrayProto':
        return 'docarray_proto', False

    if 'tensorflow' in module_tags:
        # recognised tensorflow class name -> is it sparse?
        tf_sparsity = {'SparseTensor': True, 'Tensor': False, 'EagerTensor': False}
        if class_name in tf_sparsity:
            return 'tensorflow', tf_sparsity[class_name]

    if class_name == 'Tensor':
        if 'torch' in module_tags:
            return 'torch', array.is_sparse
        if 'paddle' in module_tags:
            # Paddle does not support sparse tensor on 11/8/2021
            # https://github.com/PaddlePaddle/Paddle/issues/36697
            return 'paddle', False

    if 'scipy' in module_tags and 'sparse' in module_tags:
        return 'scipy', True

    raise TypeError(f'can not determine the array type: {module_tags}.{class_name}')
def to_numpy_array(value) -> 'np.ndarray':
    """Convert an array of any recognised framework towards :class:`numpy.ndarray`.

    Sparse input is densified first; then an object exposing ``numpy()`` is converted
    through that method, and a python list/tuple through :func:`numpy.array`.

    :param value: the array to convert
    :return: the converted value
    """
    framework, is_sparse = get_array_type(value)
    result = value

    if is_sparse:
        # use the first densifying method the object offers
        for method_name in ('todense', 'to_dense'):
            if hasattr(result, method_name):
                result = getattr(result, method_name)()
                break
        else:
            # neither method exists: fall back to tensorflow's functional API
            if framework == 'tensorflow':
                import tensorflow as tf

                if isinstance(result, tf.SparseTensor):
                    result = tf.sparse.to_dense(result)

    if hasattr(result, 'numpy'):
        result = result.numpy()

    if framework == 'python':
        result = np.array(result)

    return result
def to_list(value) -> List[float]:
    """Convert an array of any recognised framework into plain python lists.

    :param value: the array to convert
    :return: the content of ``value`` as a (possibly nested) list
    :raises TypeError: if the numpy conversion yields neither an ndarray nor a list
    """
    r = to_numpy_array(value)

    if isinstance(r, np.ndarray):
        return r.tolist()
    if isinstance(r, list):
        # already a list, hand it back untouched
        return r

    raise TypeError(f'{r} can not be converted into list')
def get_array_rows(array: 'ArrayType') -> Tuple[int, int]:
    """Get the number of rows of the ndarray without importing all frameworks

    A one-dimensional input counts as a single row.

    :param array: input array
    :return: (num_rows, ndim)
    :raises ValueError: if the framework of ``array`` has no row-counting rule here

    Examples

    >>> get_array_rows([1,2,3])
    (1, 1)

    >>> get_array_rows([[1,2,3], [4,5,6]])
    (2, 2)

    >>> get_array_rows([[1,2,3], [4,5,6], [7,8,9]])
    (3, 2)

    >>> get_array_rows(np.array([[1,2,3], [4,5,6], [7,8,9]]))
    (3, 2)
    """
    array_type, _ = get_array_type(array)

    if array_type == 'python':
        # Only the first element is inspected to decide between flat and nested.
        # The length check keeps an empty list/tuple from raising IndexError; it is
        # reported as one flat row, the same answer the ``ndim == 1`` branch below
        # gives for an empty 1-D array.
        is_nested = len(array) > 0 and isinstance(array[0], (list, tuple))
        if is_nested:
            return len(array), 2
        return 1, 1

    if array_type in ('numpy', 'tensorflow', 'torch', 'paddle', 'scipy'):
        ndim = array.ndim
        num_rows = 1 if ndim == 1 else array.shape[0]
        return num_rows, ndim

    raise ValueError(f'can not get the number of rows of a {array_type!r} array')