forked from lancedb/lancedb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbase.py
More file actions
181 lines (149 loc) · 5.67 KB
/
base.py
File metadata and controls
181 lines (149 loc) · 5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from abc import ABC, abstractmethod
from typing import List, Union
import numpy as np
import pyarrow as pa
from pydantic import BaseModel, Field, PrivateAttr
from .utils import TEXT, retry_with_exponential_backoff
class EmbeddingFunction(BaseModel, ABC):
"""
An ABC for embedding functions.
All concrete embedding functions must implement the following:
1. compute_query_embeddings() which takes a query and returns a list of embeddings
2. get_source_embeddings() which returns a list of embeddings for the source column
For text data, the two will be the same. For multi-modal data, the source column
might be images and the vector column might be text.
3. ndims method which returns the number of dimensions of the vector column
"""
__slots__ = ("__weakref__",) # pydantic 1.x compatibility
max_retries: int = (
7 # Setting 0 disables retires. Maybe this should not be enabled by default,
)
_ndims: int = PrivateAttr()
@classmethod
def create(cls, **kwargs):
"""
Create an instance of the embedding function
"""
return cls(**kwargs)
@abstractmethod
def compute_query_embeddings(self, *args, **kwargs) -> list[Union[np.array, None]]:
"""
Compute the embeddings for a given user query
Returns
-------
A list of embeddings for each input. The embedding of each input can be None
when the embedding is not valid.
"""
pass
@abstractmethod
def compute_source_embeddings(self, *args, **kwargs) -> list[Union[np.array, None]]:
"""Compute the embeddings for the source column in the database
Returns
-------
A list of embeddings for each input. The embedding of each input can be None
when the embedding is not valid.
"""
pass
def compute_query_embeddings_with_retry(
self, *args, **kwargs
) -> list[Union[np.array, None]]:
"""Compute the embeddings for a given user query with retries
Returns
-------
A list of embeddings for each input. The embedding of each input can be None
when the embedding is not valid.
"""
return retry_with_exponential_backoff(
self.compute_query_embeddings, max_retries=self.max_retries
)(
*args,
**kwargs,
)
def compute_source_embeddings_with_retry(
self, *args, **kwargs
) -> list[Union[np.array, None]]:
"""Compute the embeddings for the source column in the database with retries.
Returns
-------
A list of embeddings for each input. The embedding of each input can be None
when the embedding is not valid.
"""
return retry_with_exponential_backoff(
self.compute_source_embeddings, max_retries=self.max_retries
)(*args, **kwargs)
def sanitize_input(self, texts: TEXT) -> Union[List[str], np.ndarray]:
"""
Sanitize the input to the embedding function.
"""
if isinstance(texts, str):
texts = [texts]
elif isinstance(texts, pa.Array):
texts = texts.to_pylist()
elif isinstance(texts, pa.ChunkedArray):
texts = texts.combine_chunks().to_pylist()
return texts
def safe_model_dump(self):
from ..pydantic import PYDANTIC_VERSION
if PYDANTIC_VERSION.major < 2:
return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
return self.model_dump(
exclude={
field_name
for field_name in self.model_fields
if field_name.startswith("_")
}
)
@abstractmethod
def ndims(self):
"""
Return the dimensions of the vector column
"""
pass
def SourceField(self, **kwargs):
"""
Creates a pydantic Field that can automatically annotate
the source column for this embedding function
"""
return Field(json_schema_extra={"source_column_for": self}, **kwargs)
def VectorField(self, **kwargs):
"""
Creates a pydantic Field that can automatically annotate
the target vector column for this embedding function
"""
return Field(json_schema_extra={"vector_column_for": self}, **kwargs)
def __eq__(self, __value: object) -> bool:
if not hasattr(__value, "__dict__"):
return False
return vars(self) == vars(__value)
def __hash__(self) -> int:
return hash(frozenset(vars(self).items()))
class EmbeddingFunctionConfig(BaseModel):
"""
This model encapsulates the configuration for a embedding function
in a lancedb table. It holds the embedding function, the source column,
and the vector column
"""
vector_column: str
source_column: str
function: EmbeddingFunction
class TextEmbeddingFunction(EmbeddingFunction):
"""
A callable ABC for embedding functions that take text as input
"""
def compute_query_embeddings(
self, query: str, *args, **kwargs
) -> list[Union[np.array, None]]:
return self.compute_source_embeddings(query, *args, **kwargs)
def compute_source_embeddings(
self, texts: TEXT, *args, **kwargs
) -> list[Union[np.array, None]]:
texts = self.sanitize_input(texts)
return self.generate_embeddings(texts)
@abstractmethod
def generate_embeddings(
self, texts: Union[List[str], np.ndarray], *args, **kwargs
) -> list[Union[np.array, None]]:
"""Generate the embeddings for the given texts"""
pass