forked from docarray/docarray
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfind.py
More file actions
117 lines (96 loc) · 3.59 KB
/
find.py
File metadata and controls
117 lines (96 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from typing import (
TYPE_CHECKING,
TypeVar,
Sequence,
List,
Union,
)
import numpy as np
from .... import Document, DocumentArray
from ....math import ndarray
from ....math.helper import EPSILON
from ....math.ndarray import to_numpy_array
from ....score import NamedScore
from ....array.mixins.find import FindMixin as BaseFindMixin
# Names needed only for type annotations. Everything under this guard is
# skipped at runtime, so tensorflow and torch are not imported when the
# module is loaded.
if TYPE_CHECKING:
    import tensorflow
    import torch

    # Constrained type variable listing the query types accepted by the
    # search methods below; they reference it as the string annotation
    # 'ElasticArrayType' because the name does not exist at runtime.
    ElasticArrayType = TypeVar(
        'ElasticArrayType',
        np.ndarray,
        tensorflow.Tensor,
        torch.Tensor,
        Sequence[float],
    )
class FindMixin(BaseFindMixin):
    """Search helpers that query an index through ``self._client``.

    The index name is read from ``self._config.index_name``. Vector queries go
    through ``knn_search`` on the ``embedding`` field; text queries go through
    a ``match`` query on a caller-chosen field.
    """

    def _find_similar_vectors(self, query: 'ElasticArrayType', limit=10):
        """Run a kNN search on the ``embedding`` field for one query vector.

        :param query: a single query vector; it is converted with ``to_numpy_array`` before being sent
        :param limit: value passed as ``k`` in the kNN clause
        :return: DocumentArray built from the returned hits; every Document carries the hit's ``_score``
            under ``scores['score']`` and the embedding stored in the hit's ``_source``
        """
        vector = to_numpy_array(query)
        if np.all(vector == 0):
            # An all-zero query is shifted by EPSILON before searching.
            # NOTE(review): presumably because the similarity is undefined for
            # a zero vector -- confirm against the index mapping.
            vector = vector + EPSILON

        response = self._client.knn_search(
            index=self._config.index_name,
            knn={
                'field': 'embedding',
                'query_vector': vector,
                'k': limit,
                'num_candidates': 10000,
            },
        )

        matches = DocumentArray()
        for hit in response['hits']['hits']:
            stored = hit['_source']
            # The full Document is stored base64-encoded in the `blob` field.
            match = Document.from_base64(stored['blob'])
            match.scores['score'] = NamedScore(value=hit['_score'])
            match.embedding = stored['embedding']
            matches.append(match)
        return matches

    def _find_similar_documents_from_text(
        self, query: str, index: str = 'text', limit: int = 10
    ):
        """Return keyword matches for a single text query.

        :param query: text used for keyword search
        :param index: name of the field the ``match`` query is run against
        :param limit: number of items to be retrieved
        :return: DocumentArray with the matched Documents, each scored with the hit's ``_score``
        """
        response = self._client.search(
            index=self._config.index_name,
            query={'match': {index: query}},
            source=['id', 'blob', 'text'],
            size=limit,
        )

        matches = DocumentArray()
        # The request already asks for `size=limit`; the slice additionally
        # caps how many hits are consumed here.
        for hit in response['hits']['hits'][:limit]:
            match = Document.from_base64(hit['_source']['blob'])
            match.scores['score'] = NamedScore(value=hit['_score'])
            matches.append(match)
        return matches

    def _find_by_text(
        self, query: Union[str, List[str]], index: str = 'text', limit: int = 10
    ):
        """Run a keyword search for one text query or for each query in a list.

        :param query: a single string or a list of strings
        :param index: name of the field the ``match`` query is run against
        :param limit: number of items to be retrieved per query
        :return: list with one DocumentArray per query; a single string yields a one-element list
        """
        texts = [query] if isinstance(query, str) else query

        per_query_matches = []
        for text in texts:
            per_query_matches.append(
                self._find_similar_documents_from_text(text, index=index, limit=limit)
            )
        return per_query_matches

    def _find(
        self,
        query: 'ElasticArrayType',
        limit: int = 10,
        **kwargs,
    ) -> List['DocumentArray']:
        """Run one kNN search per row of a batch of query vectors.

        :param query: one query vector or a batch of them; any of
            '[np.ndarray, tensorflow.Tensor, torch.Tensor, Sequence[float]]'
        :param limit: number of retrieved items per query
        :param kwargs: accepted but not used by this implementation
        :return: list with one DocumentArray per query row
        """
        batch = np.array(query)
        # NOTE(review): the second value is compared against 2 below, so it is
        # presumably the array's number of dimensions -- confirm in
        # `ndarray.get_array_rows`.
        num_rows, n_dim = ndarray.get_array_rows(batch)
        if n_dim != 2:
            # Bring the input into a (num_rows, -1) layout so that iterating
            # over it yields one query vector at a time.
            batch = batch.reshape((num_rows, -1))

        per_query_matches = []
        for row in batch:
            per_query_matches.append(self._find_similar_vectors(row, limit=limit))
        return per_query_matches