Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
1addc11
refactor: make abstract tensor parametrization modular
JohannesMessner Nov 29, 2022
b0b8497
feat: add type for torch embedding
JohannesMessner Nov 29, 2022
68f3914
feat: embedding type for ndarray
JohannesMessner Nov 29, 2022
953b690
fix: fix general embedding type
JohannesMessner Nov 29, 2022
766f6b9
test: update tests
JohannesMessner Nov 30, 2022
9dface7
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-rewri…
JohannesMessner Nov 30, 2022
ed328e8
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-rewri…
JohannesMessner Dec 8, 2022
cd7087b
feat: find function
JohannesMessner Dec 9, 2022
9e950e1
feat: batched query input
JohannesMessner Dec 12, 2022
e3d26e6
fix: fix metrics and add tests for them
JohannesMessner Dec 13, 2022
1ed7d93
test: add tests for find
JohannesMessner Dec 13, 2022
0f7068f
test: add test for topk
JohannesMessner Dec 13, 2022
d5c9ef9
feat: add batched find
JohannesMessner Dec 13, 2022
11cb7f2
test: add more tests
JohannesMessner Dec 13, 2022
ba56987
docs: improve docstrings
JohannesMessner Dec 13, 2022
840ef79
docs: improve docstrings
JohannesMessner Dec 13, 2022
6e48cdb
fix: mypy and some comments
JohannesMessner Dec 14, 2022
8dc40d5
Merge branch 'feat-rewrite-v2' into feat-find
JohannesMessner Dec 14, 2022
f28cf79
refactor: add computational backends
JohannesMessner Dec 14, 2022
0177836
refactor: use comp backends in find function
JohannesMessner Dec 14, 2022
fe85baa
Merge branch 'feat-rewrite-v2' into refactor-comp-backends
JohannesMessner Dec 14, 2022
3921ac8
refactor: clean up file structure
JohannesMessner Dec 14, 2022
4597115
fix: some typing issues
JohannesMessner Dec 14, 2022
d61b765
Merge branch 'feat-rewrite-v2' into refactor-comp-backends
JohannesMessner Dec 14, 2022
4a6e1b3
fix: reduce mypy errors
JohannesMessner Dec 15, 2022
650376f
fix: mypy
JohannesMessner Dec 15, 2022
73a4358
fix: mypy
JohannesMessner Dec 15, 2022
4b50d0f
refactor: remove is_tensor flag
JohannesMessner Dec 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docarray/computation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from docarray.computation.abstract_comp_backend import AbstractComputationalBackend

# Names exported by `from docarray.computation import *`.
__all__ = ['AbstractComputationalBackend']
134 changes: 134 additions & 0 deletions docarray/computation/abstract_comp_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import typing
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple, TypeVar, Union

# One type variable per generic class defined below: the backend itself and
# its nested `Retrieval` and `Metrics` classes.
# In practice all of the below will be the same type.
TTensor = TypeVar('TTensor')
TTensorRetrieval = TypeVar('TTensorRetrieval')
TTensorMetrics = TypeVar('TTensorMetrics')


class AbstractComputationalBackend(ABC, typing.Generic[TTensor]):
    """
    Abstract base class for computational backends.
    Every supported tensor/ML framework (numpy, torch etc.) should define its own
    computational backend exposing common functionality expressed in that framework.
    That way, DocArray can leverage native implementations from all frameworks.

    Functionality is grouped into nested abstract classes (`Retrieval`,
    `Metrics`); every operation is declared as a static method.
    """

    @staticmethod
    @abstractmethod
    def stack(
        tensors: Union[List['TTensor'], Tuple['TTensor']], dim: int = 0
    ) -> 'TTensor':
        """
        Stack a list of tensors along a new axis.

        :param tensors: list or tuple of tensors to stack
        :param dim: position of the new axis in the stacked result
        :return: the stacked tensor
        """
        # NOTE(review): `Tuple['TTensor']` types a tuple of exactly one element;
        # `Tuple['TTensor', ...]` is probably what is meant. Changing it here
        # requires updating the overriding signatures in every backend too.
        ...

    class Retrieval(ABC, typing.Generic[TTensorRetrieval]):
        """
        Abstract class for retrieval and ranking functionalities
        """

        @staticmethod
        @abstractmethod
        def top_k(
            values: 'TTensorRetrieval',
            k: int,
            descending: bool = False,
            device: Optional[str] = None,
        ) -> Tuple['TTensorRetrieval', 'TTensorRetrieval']:
            """
            Retrieves the top k smallest values in `values`,
            and returns them alongside their indices in the input `values`.
            Can also be used to retrieve the top k largest values,
            by setting the `descending` flag to True.

            :param values: Tensor of values to rank.
                Should be of shape (n_queries, n_values_per_query).
                Inputs of shape (n_values_per_query,) will be expanded
                to (1, n_values_per_query).
            :param k: number of values to retrieve
            :param descending: retrieve largest values instead of smallest values
            :param device: the computational device to use.
            :return: Tuple containing the retrieved values, and their indices.
                Both are of shape (n_queries, k)
            """
            ...

    class Metrics(ABC, typing.Generic[TTensorMetrics]):
        """
        Abstract base class for metrics (distances and similarities).
        """

        @staticmethod
        @abstractmethod
        def cosine_sim(
            x_mat: 'TTensorMetrics',
            y_mat: 'TTensorMetrics',
            eps: float = 1e-7,
            device: Optional[str] = None,
        ) -> 'TTensorMetrics':
            """Pairwise cosine similarities between all vectors in x_mat and y_mat.

            :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param eps: a small jitter to avoid divide by zero
            :param device: the device to use for computations.
                If not provided, the devices of x_mat and y_mat are used.
            :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise
                cosine similarities.
                The index [i_x, i_y] contains the cosine similarity between
                x_mat[i_x] and y_mat[i_y].
            """
            ...

        @staticmethod
        @abstractmethod
        def euclidean_dist(
            x_mat: 'TTensorMetrics',
            y_mat: 'TTensorMetrics',
            device: Optional[str] = None,
        ) -> 'TTensorMetrics':
            """Pairwise Euclidean distances between all vectors in x_mat and y_mat.

            :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param device: the device to use for computations.
                If not provided, the devices of x_mat and y_mat are used.
            :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise
                Euclidean distances.
                The index [i_x, i_y] contains the Euclidean distance between
                x_mat[i_x] and y_mat[i_y].
            """
            ...

        @staticmethod
        @abstractmethod
        def sqeuclidean_dist(
            x_mat: 'TTensorMetrics',
            y_mat: 'TTensorMetrics',
            device: Optional[str] = None,
        ) -> 'TTensorMetrics':
            """Pairwise squared Euclidean distances between all vectors
            in x_mat and y_mat.

            :param x_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: tensor of shape (n_vectors, n_dim), where n_vectors is the
                number of vectors and n_dim is the number of dimensions of each
                example.
            :param device: the device to use for computations.
                If not provided, the devices of x_mat and y_mat are used.
            :return: Tensor of shape (n_vectors, n_vectors) containing all pairwise
                squared Euclidean distances.
                The index [i_x, i_y] contains the squared Euclidean distance
                between x_mat[i_x] and y_mat[i_y].
            """
            ...
204 changes: 204 additions & 0 deletions docarray/computation/numpy_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import warnings
from typing import List, Optional, Tuple, Union

import numpy as np

from docarray.computation import AbstractComputationalBackend


def _expand_if_single_axis(*matrices: np.ndarray) -> List[np.ndarray]:
"""Expands arrays that only have one axis, at dim 0.
This ensures that all outputs can be treated as matrices, not vectors.

:param matrices: Matrices to be expanded
:return: List of the input matrices,
where single axis matrices are expanded at dim 0.
"""
expanded = []
for m in matrices:
if len(m.shape) == 1:
expanded.append(np.expand_dims(m, axis=0))
else:
expanded.append(m)
return expanded


def _expand_if_scalar(arr: np.ndarray) -> np.ndarray:
if len(arr.shape) == 0: # avoid scalar output
arr = np.expand_dims(arr, axis=0)
return arr


class NumpyCompBackend(AbstractComputationalBackend[np.ndarray]):
    """
    Computational backend for Numpy.
    """

    @staticmethod
    def stack(
        tensors: Union[List['np.ndarray'], Tuple['np.ndarray', ...]], dim: int = 0
    ) -> 'np.ndarray':
        """
        Stack a list of arrays along a new axis.

        :param tensors: list or tuple of arrays to stack
        :param dim: position of the new axis in the stacked result
        :return: the stacked array
        """
        return np.stack(tensors, axis=dim)

    class Retrieval(AbstractComputationalBackend.Retrieval[np.ndarray]):
        """
        Retrieval and ranking functionalities implemented with Numpy.
        """

        @staticmethod
        def top_k(
            values: 'np.ndarray',
            k: int,
            descending: bool = False,
            device: Optional[str] = None,
        ) -> Tuple['np.ndarray', 'np.ndarray']:
            """
            Retrieves the top k smallest values in `values`,
            and returns them alongside their indices in the input `values`.
            Can also be used to retrieve the top k largest values,
            by setting the `descending` flag.

            :param values: np.ndarray of values to rank.
                Should be of shape (n_queries, n_values_per_query).
                Inputs of shape (n_values_per_query,) will be expanded
                to (1, n_values_per_query).
            :param k: number of values to retrieve
            :param descending: retrieve largest values instead of smallest values
            :param device: Not supported for this backend
            :return: Tuple containing the retrieved values, and their indices.
                Both are of shape (n_queries, k). If `k` is not smaller than
                n_values_per_query, all values are returned (sorted), i.e. the
                shape is (n_queries, n_values_per_query).
            """
            if device is not None:
                warnings.warn('`device` is not supported for numpy operations')

            if len(values.shape) == 1:
                values = np.expand_dims(values, axis=0)

            # Ranking the negated values ascending equals ranking the original
            # values descending; the sign is restored before returning.
            if descending:
                values = -values

            if k >= values.shape[1]:
                # Everything is requested: a full sort is all that is needed.
                idx = values.argsort(axis=1)[:, :k]
                values = np.take_along_axis(values, idx, axis=1)
            else:
                # Select the k smallest entries per row (in arbitrary order) ...
                idx_ps = values.argpartition(kth=k, axis=1)[:, :k]
                values = np.take_along_axis(values, idx_ps, axis=1)
                # ... then sort only those k entries and map the sort order
                # back to indices into the original input.
                idx_fs = values.argsort(axis=1)
                idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
                values = np.take_along_axis(values, idx_fs, axis=1)

            if descending:
                values = -values

            return values, idx

    class Metrics(AbstractComputationalBackend.Metrics[np.ndarray]):
        """
        Metrics (distances and similarities) implemented with Numpy.
        """

        @staticmethod
        def cosine_sim(
            x_mat: np.ndarray,
            y_mat: np.ndarray,
            eps: float = 1e-7,
            device: Optional[str] = None,
        ) -> np.ndarray:
            """Pairwise cosine similarities between all vectors in x_mat and y_mat.

            :param x_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param eps: a small jitter to avoid divide by zero; it is added to
                both the numerator and the denominator
            :param device: Not supported for this backend
            :return: np.ndarray of shape (n_vectors, n_vectors) containing all
                pairwise cosine similarities, clipped to [-1, 1].
                The index [i_x, i_y] contains the cosine similarity between
                x_mat[i_x] and y_mat[i_y].
                Axes of length one are squeezed out of the result, but at
                least one axis is always kept.
            """
            if device is not None:
                warnings.warn('`device` is not supported for numpy operations')

            x_mat, y_mat = _expand_if_single_axis(x_mat, y_mat)

            sims = np.clip(
                (np.dot(x_mat, y_mat.T) + eps)
                / (
                    np.outer(
                        np.linalg.norm(x_mat, axis=1), np.linalg.norm(y_mat, axis=1)
                    )
                    + eps
                ),
                -1,
                1,
            ).squeeze()
            return _expand_if_scalar(sims)

        @classmethod
        def euclidean_dist(
            cls, x_mat: np.ndarray, y_mat: np.ndarray, device: Optional[str] = None
        ) -> np.ndarray:
            """Pairwise Euclidean distances between all vectors in x_mat and y_mat.

            Computed as the square root of `sqeuclidean_dist`.

            :param x_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param device: Not supported for this backend
            :return: np.ndarray of shape (n_vectors, n_vectors) containing all
                pairwise Euclidean distances.
                The index [i_x, i_y] contains the Euclidean distance between
                x_mat[i_x] and y_mat[i_y].
                Axes of length one are squeezed out of the result, but at
                least one axis is always kept.
            """
            if device is not None:
                warnings.warn('`device` is not supported for numpy operations')

            x_mat, y_mat = _expand_if_single_axis(x_mat, y_mat)

            # `device` is deliberately not forwarded, so the warning above is
            # emitted only once.
            return _expand_if_scalar(
                np.sqrt(cls.sqeuclidean_dist(x_mat, y_mat)).squeeze()
            )

        @staticmethod
        def sqeuclidean_dist(
            x_mat: np.ndarray,
            y_mat: np.ndarray,
            device: Optional[str] = None,
        ) -> np.ndarray:
            """Pairwise squared Euclidean distances between all vectors in
            x_mat and y_mat.

            :param x_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param y_mat: np.ndarray of shape (n_vectors, n_dim), where n_vectors is
                the number of vectors and n_dim is the number of dimensions of each
                example.
            :param device: Not supported for this backend
            :return: np.ndarray of shape (n_vectors, n_vectors) containing all
                pairwise squared Euclidean distances; never negative.
                The index [i_x, i_y] contains the squared Euclidean distance
                between x_mat[i_x] and y_mat[i_y].
                Axes of length one are squeezed out of the result, but at
                least one axis is always kept.
            """
            if device is not None:
                warnings.warn('`device` is not supported for numpy operations')

            x_mat, y_mat = _expand_if_single_axis(x_mat, y_mat)

            # ||x - y||^2 = ||y||^2 + ||x||^2 - 2 * <x, y>, broadcast so that
            # rows index x_mat and columns index y_mat.
            dists = (
                np.sum(y_mat**2, axis=1)
                + np.sum(x_mat**2, axis=1)[:, np.newaxis]
                - 2 * np.dot(x_mat, y_mat.T)
            )

            # A squared distance cannot be negative, so any negative entry is a
            # cancellation artifact of the expansion above. Its magnitude grows
            # with the magnitude of the inputs, so clamp at zero instead of
            # relying on a fixed tolerance; otherwise `euclidean_dist` would
            # return NaN from the square root.
            dists = np.maximum(dists, 0).squeeze()
            return _expand_if_scalar(dists)
Loading