forked from lancedb/lancedb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_guide_index.py
More file actions
169 lines (152 loc) · 5.78 KB
/
test_guide_index.py
File metadata and controls
169 lines (152 loc) · 5.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# --8<-- [start:import-lancedb]
import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-lancedb-ivfpq]
from lancedb.index import IvfPq
# --8<-- [end:import-lancedb-ivfpq]
# --8<-- [start:import-lancedb-btree-bitmap]
from lancedb.index import BTree, Bitmap
# --8<-- [end:import-lancedb-btree-bitmap]
# --8<-- [start:import-numpy]
import numpy as np
# --8<-- [end:import-numpy]
import pytest
def test_ann_index():
    """Run the synchronous ANN-index guide snippets end to end.

    Builds a table of 5,000 random 32-dimensional float32 vectors, creates a
    vector index on it, then issues a plain search, a filtered search and a
    search restricted to the ``vector`` column. The search results are
    discarded; the test passes as long as no call raises.

    The ``# --8<--`` comments delimit named snippet sections (presumably
    extracted into the documentation -- confirm against the docs build), so
    they are kept exactly as they are.
    """
    # --8<-- [start:create_ann_index]
    uri = "data/sample-lancedb"

    # Create 5,000 sample vectors
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
    ]

    db = lancedb.connect(uri)
    # Add the vectors to a table, replacing any copy left by an earlier run so
    # the example can be re-executed against the same on-disk database
    tbl = db.create_table("my_vectors", data=data, mode="overwrite")
    # Create and train the index - you need to have enough data in the table
    # for an effective training step
    tbl.create_index(num_partitions=2, num_sub_vectors=4)
    # --8<-- [end:create_ann_index]
    # --8<-- [start:vector_search]
    tbl.search(np.random.random((32))).limit(2).nprobes(20).refine_factor(
        10
    ).to_pandas()
    # --8<-- [end:vector_search]
    # --8<-- [start:vector_search_with_filter]
    tbl.search(np.random.random((32))).where("item != 'item 1141'").to_pandas()
    # --8<-- [end:vector_search_with_filter]
    # --8<-- [start:vector_search_with_select]
    tbl.search(np.random.random((32))).select(["vector"]).to_pandas()
    # --8<-- [end:vector_search_with_select]
@pytest.mark.asyncio
async def test_ann_index_async():
    """Run the asynchronous ANN-index guide snippets end to end.

    Builds a table of 5,000 random 32-dimensional float32 vectors through the
    async connection, creates an ``IvfPq`` index on the ``vector`` column,
    then issues a plain nearest-neighbour query, a filtered query and a query
    restricted to the ``vector`` column. The query results are discarded; the
    test passes as long as no call raises.

    The ``# --8<--`` comments delimit named snippet sections (presumably
    extracted into the documentation -- confirm against the docs build), so
    they are kept exactly as they are.
    """
    # --8<-- [start:create_ann_index_async]
    uri = "data/sample-lancedb"

    # Create 5,000 sample vectors
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
    ]

    async_db = await lancedb.connect_async(uri)
    # Add the vectors to a table, replacing any copy left by an earlier run so
    # the example can be re-executed against the same on-disk database
    async_tbl = await async_db.create_table(
        "my_vectors_async", data=data, mode="overwrite"
    )
    # Create and train the index - you need to have enough data in the table
    # for an effective training step
    await async_tbl.create_index(
        "vector", config=IvfPq(num_partitions=2, num_sub_vectors=4)
    )
    # --8<-- [end:create_ann_index_async]
    # --8<-- [start:vector_search_async]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .limit(2)
        .nprobes(20)
        .refine_factor(10)
        .to_pandas()
    )
    # --8<-- [end:vector_search_async]
    # --8<-- [start:vector_search_async_with_filter]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .where("item != 'item 1141'")
        .to_pandas()
    )
    # --8<-- [end:vector_search_async_with_filter]
    # --8<-- [start:vector_search_async_with_select]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .select(["vector"])
        .to_pandas()
    )
    # --8<-- [end:vector_search_async_with_select]
def test_scalar_index():
    """Run the synchronous scalar-index guide snippets end to end.

    Creates a three-row ``books`` table with scalar indices on ``book_id``
    and ``publisher``, filters it by ``book_id``, then creates a second table
    ``book_with_embeddings`` and runs a prefiltered vector search on it.
    Results are discarded; the test passes as long as no call raises.

    Note that the final add/optimize step runs against
    ``book_with_embeddings`` (the table most recently bound to ``table``),
    not against ``books``.

    The ``# --8<--`` comments delimit named snippet sections (presumably
    extracted into the documentation -- confirm against the docs build), so
    they are kept exactly as they are.
    """
    # --8<-- [start:basic_scalar_index]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    books = [
        {
            "book_id": 1,
            "publisher": "plenty of books",
            "tags": ["fantasy", "adventure"],
        },
        {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
        {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
    ]
    # Replace any copy left by an earlier run so the example can be
    # re-executed against the same on-disk database
    table = db.create_table("books", books, mode="overwrite")
    table.create_scalar_index("book_id")  # BTree by default
    table.create_scalar_index("publisher", index_type="BITMAP")
    # --8<-- [end:basic_scalar_index]
    # --8<-- [start:search_with_scalar_index]
    table = db.open_table("books")
    table.search().where("book_id = 2").to_pandas()
    # --8<-- [end:search_with_scalar_index]
    # --8<-- [start:vector_search_with_scalar_index]
    data = [
        {"book_id": 1, "vector": [1.0, 2]},
        {"book_id": 2, "vector": [3.0, 4]},
        {"book_id": 3, "vector": [5.0, 6]},
    ]

    table = db.create_table("book_with_embeddings", data, mode="overwrite")

    (table.search([1, 2]).where("book_id != 3", prefilter=True).to_pandas())
    # --8<-- [end:vector_search_with_scalar_index]
    # --8<-- [start:update_scalar_index]
    table.add([{"vector": [7, 8], "book_id": 4}])
    table.optimize()
    # --8<-- [end:update_scalar_index]
@pytest.mark.asyncio
async def test_scalar_index_async():
    """Run the asynchronous scalar-index guide snippets end to end.

    Creates a three-row ``books_async`` table with a ``BTree`` index on
    ``book_id`` and a ``Bitmap`` index on ``publisher``, filters it by
    ``book_id``, then creates a second table ``book_with_embeddings_async``
    and runs a filtered nearest-neighbour query on it. Results are discarded;
    the test passes as long as no call raises.

    Note that the final add/optimize step runs against
    ``book_with_embeddings_async`` (the table most recently bound to
    ``async_tbl``), not against ``books_async``.

    The ``# --8<--`` comments delimit named snippet sections (presumably
    extracted into the documentation -- confirm against the docs build), so
    they are kept exactly as they are.
    """
    # --8<-- [start:basic_scalar_index_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    books = [
        {
            "book_id": 1,
            "publisher": "plenty of books",
            "tags": ["fantasy", "adventure"],
        },
        {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
        {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
    ]
    # Replace any copy left by an earlier run so the example can be
    # re-executed against the same on-disk database
    async_tbl = await async_db.create_table("books_async", books, mode="overwrite")
    await async_tbl.create_index("book_id", config=BTree())  # BTree by default
    await async_tbl.create_index("publisher", config=Bitmap())
    # --8<-- [end:basic_scalar_index_async]
    # --8<-- [start:search_with_scalar_index_async]
    async_tbl = await async_db.open_table("books_async")
    await async_tbl.query().where("book_id = 2").to_pandas()
    # --8<-- [end:search_with_scalar_index_async]
    # --8<-- [start:vector_search_with_scalar_index_async]
    data = [
        {"book_id": 1, "vector": [1.0, 2]},
        {"book_id": 2, "vector": [3.0, 4]},
        {"book_id": 3, "vector": [5.0, 6]},
    ]
    async_tbl = await async_db.create_table(
        "book_with_embeddings_async", data, mode="overwrite"
    )
    (await async_tbl.query().where("book_id != 3").nearest_to([1, 2]).to_pandas())
    # --8<-- [end:vector_search_with_scalar_index_async]
    # --8<-- [start:update_scalar_index_async]
    await async_tbl.add([{"vector": [7, 8], "book_id": 4}])
    await async_tbl.optimize()
    # --8<-- [end:update_scalar_index_async]