forked from lancedb/lancedb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopenai.py
More file actions
133 lines (111 loc) · 3.82 KB
/
openai.py
File metadata and controls
133 lines (111 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from functools import cached_property
from typing import TYPE_CHECKING, List, Optional, Union
import logging
from ..util import attempt_import_or_raise
from .base import TextEmbeddingFunction
from .registry import register
if TYPE_CHECKING:
import numpy as np
@register("openai")
class OpenAIEmbeddings(TextEmbeddingFunction):
"""
An embedding function that uses the OpenAI API
https://platform.openai.com/docs/guides/embeddings
This can also be used for open source models that
are compatible with the OpenAI API.
Notes
-----
If you're running an Ollama server locally,
you can just override the `base_url` parameter
and provide the Ollama embedding model you want
to use (https://ollama.com/library):
```python
from lancedb.embeddings import get_registry
openai = get_registry().get("openai")
embedding_function = openai.create(
name="<ollama-embedding-model-name>",
base_url="http://localhost:11434",
)
```
"""
name: str = "text-embedding-ada-002"
dim: Optional[int] = None
base_url: Optional[str] = None
default_headers: Optional[dict] = None
organization: Optional[str] = None
api_key: Optional[str] = None
# Set true to use Azure OpenAI API
use_azure: bool = False
def ndims(self):
return self._ndims
@staticmethod
def model_names():
return [
"text-embedding-ada-002",
"text-embedding-3-large",
"text-embedding-3-small",
]
@cached_property
def _ndims(self):
if self.name == "text-embedding-ada-002":
return 1536
elif self.name == "text-embedding-3-large":
return self.dim or 3072
elif self.name == "text-embedding-3-small":
return self.dim or 1536
else:
raise ValueError(f"Unknown model name {self.name}")
def generate_embeddings(
self, texts: Union[List[str], "np.ndarray"]
) -> List["np.array"]:
"""
Get the embeddings for the given texts
Parameters
----------
texts: list[str] or np.ndarray (of str)
The texts to embed
"""
openai = attempt_import_or_raise("openai")
valid_texts = []
valid_indices = []
for idx, text in enumerate(texts):
if text:
valid_texts.append(text)
valid_indices.append(idx)
# TODO retry, rate limit, token limit
try:
kwargs = {
"input": valid_texts,
"model": self.name,
}
if self.name != "text-embedding-ada-002":
kwargs["dimensions"] = self.dim
rs = self._openai_client.embeddings.create(**kwargs)
valid_embeddings = {
idx: v.embedding for v, idx in zip(rs.data, valid_indices)
}
except openai.BadRequestError:
logging.exception("Bad request: %s", texts)
return [None] * len(texts)
except Exception:
logging.exception("OpenAI embeddings error")
raise
return [valid_embeddings.get(idx, None) for idx in range(len(texts))]
@cached_property
def _openai_client(self):
openai = attempt_import_or_raise("openai")
kwargs = {}
if self.base_url:
kwargs["base_url"] = self.base_url
if self.default_headers:
kwargs["default_headers"] = self.default_headers
if self.organization:
kwargs["organization"] = self.organization
if self.api_key:
kwargs["api_key"] = self.api_key
if self.use_azure:
return openai.AzureOpenAI(**kwargs)
else:
return openai.OpenAI(**kwargs)