docarray/docarray/math/ndarray.py at main · generall/docarray

History

194 lines (148 loc) · 5.72 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

from typing import TYPE_CHECKING, Tuple, Sequence, Optional, List

import numpy as np

if TYPE_CHECKING:

from ..types import ArrayType

from .. import Document, DocumentArray

def unravel(docs: Sequence['Document'], field: str) -> Optional['ArrayType']:
    """Collect ``field`` from every document and combine the values into one array.

    :param docs: the documents to read from
    :param field: name of the attribute to read on each document
    :return: ``None`` when ``docs`` is empty or when every value is ``None``;
        a plain list of the raw values when the first value is ``None`` but at
        least one other value is set; otherwise the values stacked according to
        the framework that :func:`get_array_type` reports for the first value.
        Frameworks that have no stacking rule in this function yield ``None``.
    """
    if len(docs) == 0:
        # Nothing to inspect; without this guard ``docs[0]`` raises IndexError.
        return None

    # Read every value exactly once; the first one decides how to combine them.
    all_fields = [getattr(d, field) for d in docs]
    _first = all_fields[0]

    if _first is None:
        # failed to unravel, return as a list (or None if nothing is set at all)
        if any(_rr is not None for _rr in all_fields):
            return all_fields
        return None

    framework, _ = get_array_type(_first)
    cls_type = type(_first)

    if framework == 'python':
        # rebuild the same container type (list or tuple) as the first value
        return cls_type(all_fields)
    if framework == 'numpy':
        return np.stack(all_fields)
    if framework == 'tensorflow':
        # imported lazily so the framework is only needed when actually used
        import tensorflow as tf

        return tf.stack(all_fields)
    if framework == 'torch':
        import torch

        return torch.stack(all_fields)
    if framework == 'paddle':
        import paddle

        return paddle.stack(all_fields)
    if framework == 'scipy':
        import scipy.sparse

        # vstack result is converted back to the class of the first value
        return cls_type(scipy.sparse.vstack(all_fields))

    # Any other framework tag is not handled here; keep returning None.
    return None

def ravel(value: 'ArrayType', docs: 'DocumentArray', field: str) -> None:
    """Distribute the rows of ``value`` onto ``doc.field``, one row per document.

    Documents and rows are paired with ``zip``, so pairing stops at the shorter
    of the two.

    :param value: the value to be set on ``doc.field``; a list/tuple is consumed
        element by element, anything else is indexed along its first axis
    :param docs: the docs to set
    :param field: the field of the doc to set
    """
    # only objects exposing ``getformat()`` (scipy sparse) report a format
    sp_format = value.getformat() if hasattr(value, 'getformat') else None

    if sp_format in {'bsr', 'coo'}:
        # BSR and COO do not implement ``[j, ...]``; ``getrow()`` is used
        # instead and each row is converted back to the original format.
        # Not very efficient, but this is the best we can do.
        n_rows = value.shape[0]
        converter = f'to{sp_format}'
        rows = (getattr(value.getrow(idx), converter)() for idx in range(n_rows))
    elif isinstance(value, (list, tuple)):
        rows = value
    else:
        n_rows = value.shape[0]
        rows = (value[idx, ...] for idx in range(n_rows))

    # rows are produced lazily, one per document, right before assignment
    for doc, row in zip(docs, rows):
        docs[doc.id, field] = row

def get_array_type(array: 'ArrayType') -> Tuple[str, bool]:
    """Identify which framework an array comes from, without importing that framework.

    The decision is based only on the module path and class name of ``array``.

    :param array: any array, scipy, numpy, tf, torch, etc.
    :return: ``(framework, is_sparse)``; the first element names the framework,
        the second tells whether the array is sparse
    :raises TypeError: if no rule matches the class of ``array``
    """
    klass = array.__class__
    module_tags = klass.__module__.split('.')
    class_name = klass.__name__

    if isinstance(array, (list, tuple)):
        return 'python', False

    if 'numpy' in module_tags:
        return 'numpy', False

    if 'docarray' in module_tags and class_name == 'NdArray':
        return 'docarray', False  # sparse or not is irrelevant

    if 'docarray_pb2' in module_tags and class_name == 'NdArrayProto':
        return 'docarray_proto', False  # sparse or not is irrelevant

    if 'tensorflow' in module_tags:
        # class name -> is_sparse; other tensorflow classes fall through
        tf_sparsity = {'SparseTensor': True, 'Tensor': False, 'EagerTensor': False}
        if class_name in tf_sparsity:
            return 'tensorflow', tf_sparsity[class_name]

    is_tensor = class_name == 'Tensor'

    if is_tensor and 'torch' in module_tags:
        return 'torch', array.is_sparse

    if is_tensor and 'paddle' in module_tags:
        # Paddle does not support sparse tensor on 11/8/2021
        # https://github.com/PaddlePaddle/Paddle/issues/36697
        return 'paddle', False

    if 'scipy' in module_tags and 'sparse' in module_tags:
        return 'scipy', True

    raise TypeError(f'can not determine the array type: {module_tags}.{class_name}')

def to_numpy_array(value) -> 'np.ndarray':
    """Convert ``value`` to its numpy form, whatever framework it comes from.

    Sparse inputs are densified first, then ``.numpy()`` is applied when the
    object offers it, and python lists/tuples go through ``np.array``.

    :param value: the array to convert; its framework is detected with
        :func:`get_array_type`
    :return: the value in :class:`numpy.ndarray`.
    """
    framework, is_sparse = get_array_type(value)
    result = value

    if is_sparse:
        # try the densify methods in order; the first one available wins
        for method_name in ('todense', 'to_dense'):
            if hasattr(result, method_name):
                result = getattr(result, method_name)()
                break
        else:
            # neither method exists: tensorflow densifies via a module function
            if framework == 'tensorflow':
                import tensorflow as tf

                if isinstance(result, tf.SparseTensor):
                    result = tf.sparse.to_dense(result)

    if hasattr(result, 'numpy'):
        result = result.numpy()

    return np.array(result) if framework == 'python' else result

def to_list(value) -> List[float]:
    """Convert ``value`` into a python list.

    The value is first passed through :func:`to_numpy_array`.

    :param value: the array-like value to convert
    :return: ``ndarray.tolist()`` of the converted value, or the converted
        value itself when it is already a list
    :raises TypeError: if the converted value is neither an ndarray nor a list
    """
    converted = to_numpy_array(value)

    if isinstance(converted, np.ndarray):
        return converted.tolist()
    if isinstance(converted, list):
        return converted

    raise TypeError(f'{converted} can not be converted into list')

def get_array_rows(array: 'ArrayType') -> Tuple[int, int]:
    """Get the number of rows of the ndarray without importing all frameworks

    A one-dimensional input counts as a single row. For python lists/tuples
    only the first element is inspected to decide between one and two
    dimensions; an empty list/tuple is treated as one-dimensional.

    :param array: input array
    :return: (num_rows, ndim)
    :raises ValueError: if the framework reported by :func:`get_array_type`
        has no row-counting rule in this function

    Examples

    >>> get_array_rows([1,2,3])
    (1, 1)
    >>> get_array_rows([[1,2,3], [4,5,6]])
    (2, 2)
    >>> get_array_rows([[1,2,3], [4,5,6], [7,8,9]])
    (3, 2)
    >>> get_array_rows(np.array([[1,2,3], [4,5,6], [7,8,9]]))
    (3, 2)
    """
    array_type, _ = get_array_type(array)

    if array_type == 'python':
        # Check the length first: ``array[0]`` raises IndexError when empty.
        first_element_list_like = len(array) > 0 and isinstance(
            array[0], (list, tuple)
        )
        if first_element_list_like:
            return len(array), 2
        return 1, 1

    if array_type in ('numpy', 'tensorflow', 'torch', 'paddle', 'scipy'):
        ndim = array.ndim
        # a 1-D array is a single row; otherwise rows run along the first axis
        num_rows = 1 if ndim == 1 else array.shape[0]
        return num_rows, ndim

    raise ValueError(f'can not get the number of rows of array type: {array_type}')

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

ndarray.py

Latest commit

History

ndarray.py

File metadata and controls