# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from __future__ import annotations
import inspect
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta
from functools import cached_property
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Literal,
Optional,
Tuple,
Union,
overload,
)
from urllib.parse import urlparse
import lance
from lancedb.background_loop import LOOP
from .dependencies import _check_for_pandas
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.fs as pa_fs
from lance import LanceDataset
from lance.dependencies import _check_for_hugging_face
from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
from .index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
from .merge import LanceMergeInsertBuilder
from .pydantic import LanceModel, model_to_dict
from .query import (
AsyncQuery,
AsyncVectorQuery,
LanceEmptyQueryBuilder,
LanceFtsQueryBuilder,
LanceHybridQueryBuilder,
LanceQueryBuilder,
LanceVectorQueryBuilder,
Query,
)
from .util import (
add_note,
fs_from_uri,
get_uri_scheme,
infer_vector_column_name,
join_uri,
safe_import_pandas,
safe_import_polars,
value_to_sql,
)
from .index import lang_mapping
if TYPE_CHECKING:
from ._lancedb import Table as LanceDBTable, OptimizeStats, CompactionStats
from .db import LanceDBConnection
from .index import IndexConfig
from lance.dataset import CleanupStats, ReaderLike
import pandas
import PIL
pd = safe_import_pandas()
pl = safe_import_polars()
QueryType = Literal["vector", "fts", "hybrid", "auto"]
def _into_pyarrow_table(data) -> pa.Table:
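    """
    Coerce supported input types (HuggingFace datasets, lists of dicts or
    LanceModels, pandas/polars frames, Arrow tables, batches, readers,
    datasets, and iterables of any of these) into a ``pa.Table``.

    A minimal sketch of the conversion for a list of record batches:

    >>> batch = pa.record_batch([pa.array([1, 2])], names=["a"])
    >>> _into_pyarrow_table([batch]).num_rows
    2
    """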
if _check_for_hugging_face(data):
# Huggingface datasets
from lance.dependencies import datasets
if isinstance(data, datasets.Dataset):
schema = data.features.arrow_schema
return pa.Table.from_batches(data.data.to_batches(), schema=schema)
elif isinstance(data, datasets.dataset_dict.DatasetDict):
            # `schema` starts as None; _schema_from_hf checks every split
            schema = _schema_from_hf(data, None)
return pa.Table.from_batches(_to_batches_with_split(data), schema=schema)
if isinstance(data, LanceModel):
raise ValueError("Cannot add a single LanceModel to a table. Use a list.")
if isinstance(data, dict):
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
if isinstance(data, list):
# convert to list of dict if data is a bunch of LanceModels
if isinstance(data[0], LanceModel):
schema = data[0].__class__.to_arrow_schema()
data = [model_to_dict(d) for d in data]
return pa.Table.from_pylist(data, schema=schema)
elif isinstance(data[0], pa.RecordBatch):
return pa.Table.from_batches(data)
else:
return pa.Table.from_pylist(data)
elif _check_for_pandas(data) and isinstance(data, pd.DataFrame):
table = pa.Table.from_pandas(data, preserve_index=False)
# Do not serialize Pandas metadata
meta = table.schema.metadata if table.schema.metadata is not None else {}
meta = {k: v for k, v in meta.items() if k != b"pandas"}
return table.replace_schema_metadata(meta)
elif isinstance(data, pa.Table):
return data
elif isinstance(data, pa.RecordBatch):
return pa.Table.from_batches([data])
elif isinstance(data, LanceDataset):
return data.scanner().to_table()
elif isinstance(data, pa.dataset.Dataset):
return data.to_table()
elif isinstance(data, pa.dataset.Scanner):
return data.to_table()
elif isinstance(data, pa.RecordBatchReader):
return data.read_all()
elif (
type(data).__module__.startswith("polars")
and data.__class__.__name__ == "DataFrame"
):
return data.to_arrow()
elif (
type(data).__module__.startswith("polars")
and data.__class__.__name__ == "LazyFrame"
):
return data.collect().to_arrow()
elif isinstance(data, Iterable):
return _iterator_to_table(data)
else:
raise TypeError(
f"Unknown data type {type(data)}. "
"Please check "
"https://lancedb.github.io/lancedb/python/python/ "
"to see supported types."
)
def _iterator_to_table(data: Iterable) -> pa.Table:
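    """
    Concatenate an iterable of batch-like items into one ``pa.Table``, casting
    every batch to the schema of the first.

    A minimal sketch, assuming all batches share a schema:

    >>> def batches():
    ...     yield [{"a": 1}]
    ...     yield [{"a": 2}]
    >>> _iterator_to_table(batches()).num_rows
    2
    """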
batches = []
schema = None # Will get schema from first batch
for batch in data:
batch_table = _into_pyarrow_table(batch)
if schema is not None:
if batch_table.schema != schema:
try:
batch_table = batch_table.cast(schema)
except pa.lib.ArrowInvalid:
raise ValueError(
f"Input iterator yielded a batch with schema that "
f"does not match the schema of other batches.\n"
f"Expected:\n{schema}\nGot:\n{batch_table.schema}"
)
else:
# Use the first schema for the remainder of the batches
schema = batch_table.schema
batches.append(batch_table)
if batches:
return pa.concat_tables(batches)
else:
raise ValueError("Input iterable is empty")
def _sanitize_data(
data: "DATA",
target_schema: Optional[pa.Schema] = None,
metadata: Optional[dict] = None, # embedding metadata
on_bad_vectors: Literal["error", "drop", "fill", "null"] = "error",
fill_value: float = 0.0,
*,
allow_subschema: bool = False,
) -> pa.Table:
"""
Handle input data, applying all standard transformations.
This includes:
* Converting the data to a PyArrow Table
* Adding vector columns defined in the metadata
* Adding embedding metadata into the schema
* Casting the table to the target schema
* Handling bad vectors
Parameters
----------
    target_schema : Optional[pa.Schema], default None
        The schema to cast the table to. This is typically the schema of the table
        if it already exists. Otherwise it might be a user-requested schema.
    metadata : Optional[dict], default None
        The embedding metadata to add to the schema.
    on_bad_vectors : Literal["error", "drop", "fill", "null"], default "error"
        What to do if any of the vectors are not the same size or contain NaNs.
    fill_value : float, default 0.0
        The value to use when filling vectors. Only used if on_bad_vectors="fill".
        All entries in the vector will be set to this value.
    allow_subschema : bool, default False
        If True, the input table is allowed to omit columns from the target schema.
        The target schema will be filtered to only include columns that are present
        in the input table before casting.
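
    Examples
    --------
    A minimal sketch with no target schema and no embedding metadata; the
    fixed-size-list vector type is inferred from the data (column names here
    are illustrative):

    >>> tbl = _sanitize_data([{"vector": [1.0, 2.0]}, {"vector": [3.0, 4.0]}])
    >>> tbl.num_rows
    2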
"""
# At this point, the table might not match the schema we are targeting:
# 1. There might be embedding columns missing that will be added
# in the add_embeddings step.
    # 2. If `allow_subschema` is True, there might be columns missing.
table = _into_pyarrow_table(data)
table = _append_vector_columns(table, target_schema, metadata=metadata)
# This happens before the cast so we can fix vector columns with
# incorrect lengths before they are cast to FSL.
table = _handle_bad_vectors(
table,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
if target_schema is None:
target_schema = _infer_target_schema(table)
if metadata:
new_metadata = target_schema.metadata or {}
        new_metadata.update(metadata)  # dict.update mutates in place and returns None
target_schema = target_schema.with_metadata(new_metadata)
_validate_schema(target_schema)
table = _cast_to_target_schema(table, target_schema, allow_subschema)
return table
def _cast_to_target_schema(
table: pa.Table,
target_schema: pa.Schema,
allow_subschema: bool = False,
) -> pa.Table:
# pa.Table.cast expects field order not to be changed.
# Lance doesn't care about field order, so we don't need to rearrange fields
# to match the target schema. We just need to correctly cast the fields.
if table.schema == target_schema:
# Fast path when the schemas are already the same
return table
fields = []
for field in table.schema:
        # Schema.field raises KeyError for unknown names, so check membership first
        if target_schema.get_field_index(field.name) == -1:
            raise ValueError(f"Field {field.name} not found in target schema")
        fields.append(target_schema.field(field.name))
reordered_schema = pa.schema(fields, metadata=target_schema.metadata)
if not allow_subschema and len(reordered_schema) != len(target_schema):
raise ValueError(
"Input table has different number of columns than target schema"
)
if allow_subschema and len(reordered_schema) != len(target_schema):
fields = _infer_subschema(
list(iter(table.schema)), list(iter(reordered_schema))
)
subschema = pa.schema(fields, metadata=target_schema.metadata)
return table.cast(subschema)
else:
return table.cast(reordered_schema)
def _infer_subschema(
schema: List[pa.Field],
reference_fields: List[pa.Field],
) -> List[pa.Field]:
"""
Transform the list of fields so the types match the reference_fields.
The order of the fields is preserved.
``schema`` may have fewer fields than `reference_fields`, but it may not have
more fields.
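
    A minimal sketch: an ``int64`` input field is replaced by the matching
    ``int32`` reference field.

    >>> ref = [pa.field("a", pa.int32()), pa.field("b", pa.string())]
    >>> _infer_subschema([pa.field("a", pa.int64())], ref)
    [pyarrow.Field<a: int32>]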
"""
fields = []
lookup = {f.name: f for f in reference_fields}
for field in schema:
reference = lookup.get(field.name)
if reference is None:
raise ValueError("Unexpected field in schema: {}".format(field))
if pa.types.is_struct(reference.type):
new_type = pa.struct(
_infer_subschema(
field.type.fields,
reference.type.fields,
)
)
new_field = pa.field(
field.name,
new_type,
reference.nullable,
)
else:
new_field = reference
fields.append(new_field)
return fields
def sanitize_create_table(
data,
schema: Union[pa.Schema, LanceModel],
metadata=None,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
):
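    """
    Normalize the ``data``/``schema`` pair for table creation and return both.

    A minimal sketch with the schema inferred from the data:

    >>> data, schema = sanitize_create_table([{"id": 1}], schema=None)
    >>> schema.names
    ['id']
    """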
if inspect.isclass(schema) and issubclass(schema, LanceModel):
# convert LanceModel to pyarrow schema
# note that it's possible this contains
# embedding function metadata already
schema: pa.Schema = schema.to_arrow_schema()
if data is not None:
if metadata is None and schema is not None:
metadata = schema.metadata
data = _sanitize_data(
data,
schema,
metadata=metadata,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
schema = data.schema
else:
if schema is not None:
data = pa.Table.from_pylist([], schema)
if schema is None:
if data is None:
raise ValueError("Either data or schema must be provided")
elif hasattr(data, "schema"):
schema = data.schema
if metadata:
schema = schema.with_metadata(metadata)
# Need to apply metadata to the data as well
data = data.replace_schema_metadata(metadata)
return data, schema
def _schema_from_hf(data, schema):
"""
Extract pyarrow schema from HuggingFace DatasetDict
and validate that they're all the same schema between
splits
"""
for dataset in data.values():
if schema is None:
schema = dataset.features.arrow_schema
elif schema != dataset.features.arrow_schema:
msg = "All datasets in a HuggingFace DatasetDict must have the same schema"
raise TypeError(msg)
return schema
def _to_batches_with_split(data):
"""
Return a generator of RecordBatches from a HuggingFace DatasetDict
with an extra `split` column
"""
for key, dataset in data.items():
for batch in dataset.data.to_batches():
table = pa.Table.from_batches([batch])
if "split" not in table.column_names:
table = table.append_column(
"split", pa.array([key] * batch.num_rows, pa.string())
)
for b in table.to_batches():
yield b
def _append_vector_columns(
data: pa.Table,
schema: Optional[pa.Schema] = None,
*,
metadata: Optional[dict] = None,
) -> pa.Table:
"""
Use the embedding function to automatically embed the source columns and add the
vector columns to the table.
"""
if schema is None:
metadata = metadata or {}
else:
metadata = schema.metadata or metadata or {}
functions = EmbeddingFunctionRegistry.get_instance().parse_functions(metadata)
for vector_column, conf in functions.items():
func = conf.function
no_vector_column = vector_column not in data.column_names
if no_vector_column or pc.all(pc.is_null(data[vector_column])).as_py():
col_data = func.compute_source_embeddings_with_retry(
data[conf.source_column]
)
if schema is not None:
dtype = schema.field(vector_column).type
else:
dtype = pa.list_(pa.float32(), len(col_data[0]))
if no_vector_column:
data = data.append_column(
pa.field(vector_column, type=dtype), pa.array(col_data, type=dtype)
)
else:
data = data.set_column(
data.column_names.index(vector_column),
pa.field(vector_column, type=dtype),
pa.array(col_data, type=dtype),
)
return data
def _table_path(base: str, table_name: str) -> str:
"""
Get a table path that can be used in PyArrow FS.
Removes any weird schemes (such as "s3+ddb") and drops any query params.
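
    For example (a sketch with an illustrative bucket layout),
    ``_table_path("s3+ddb://bucket/db", "events")`` yields
    ``"s3://bucket/db/events.lance"``.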
"""
uri = _table_uri(base, table_name)
# Parse as URL
parsed = urlparse(uri)
# If scheme is s3+ddb, convert to s3
if parsed.scheme == "s3+ddb":
parsed = parsed._replace(scheme="s3")
# Remove query parameters
return parsed._replace(query=None).geturl()
def _table_uri(base: str, table_name: str) -> str:
return join_uri(base, f"{table_name}.lance")
class Table(ABC):
"""
A Table is a collection of Records in a LanceDB Database.
Examples
--------
Create using [DBConnection.create_table][lancedb.DBConnection.create_table]
(more examples in that method's documentation).
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}])
>>> table.head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
b: int64
----
vector: [[[1.1,1.2]]]
b: [[2]]
Can append new data with [Table.add()][lancedb.table.Table.add].
>>> table.add([{"vector": [0.5, 1.3], "b": 4}])
Can query the table with [Table.search][lancedb.table.Table.search].
>>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
b vector _distance
0 4 [0.5, 1.3] 0.82
1 2 [1.1, 1.2] 1.13
Search queries are much faster when an index is created. See
[Table.create_index][lancedb.table.Table.create_index].
"""
@property
@abstractmethod
def name(self) -> str:
"""The name of this Table"""
raise NotImplementedError
@property
@abstractmethod
def version(self) -> int:
"""The version of this Table"""
raise NotImplementedError
@property
@abstractmethod
def schema(self) -> pa.Schema:
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
of this Table
"""
raise NotImplementedError
@property
@abstractmethod
def embedding_functions(self) -> Dict[str, EmbeddingFunctionConfig]:
"""
        Get a mapping from vector column name to its configured embedding function.
"""
@abstractmethod
def count_rows(self, filter: Optional[str] = None) -> int:
"""
Count the number of rows in the table.
Parameters
----------
filter: str, optional
A SQL where clause to filter the rows to count.
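
        Examples
        --------
        A sketch (``table`` is an open table; the filter is illustrative):

        >>> table.count_rows("id > 10")  # doctest: +SKIP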
"""
raise NotImplementedError
def to_pandas(self) -> "pandas.DataFrame":
"""Return the table as a pandas DataFrame.
Returns
-------
pd.DataFrame
"""
return self.to_arrow().to_pandas()
@abstractmethod
def to_arrow(self) -> pa.Table:
"""Return the table as a pyarrow Table.
Returns
-------
pa.Table
"""
raise NotImplementedError
def create_index(
self,
metric="L2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name: str = VECTOR_COLUMN_NAME,
replace: bool = True,
accelerator: Optional[str] = None,
index_cache_size: Optional[int] = None,
*,
index_type: Literal[
"IVF_FLAT", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ"
] = "IVF_PQ",
num_bits: int = 8,
max_iterations: int = 50,
sample_rate: int = 256,
m: int = 20,
ef_construction: int = 300,
):
"""Create an index on the table.
Parameters
----------
metric: str, default "L2"
The distance metric to use when creating the index.
Valid values are "L2", "cosine", "dot", or "hamming".
L2 is euclidean distance.
Hamming is available only for binary vectors.
num_partitions: int, default 256
The number of IVF partitions to use when creating the index.
Default is 256.
num_sub_vectors: int, default 96
The number of PQ sub-vectors to use when creating the index.
Default is 96.
vector_column_name: str, default "vector"
The vector column name to create the index.
replace: bool, default True
- If True, replace the existing index if it exists.
- If False, raise an error if duplicate index exists.
accelerator: str, default None
If set, use the given accelerator to create the index.
            Only "cuda" is supported for now.
index_cache_size : int, optional
The size of the index cache in number of entries. Default value is 256.
num_bits: int
The number of bits to encode sub-vectors. Only used with the IVF_PQ index.
Only 4 and 8 are supported.
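
        Examples
        --------
        A sketch against a hypothetical table; the connection path and
        parameter values are illustrative:

        >>> import lancedb  # doctest: +SKIP
        >>> db = lancedb.connect("/data/lance")  # doctest: +SKIP
        >>> img_table = db.open_table("images")  # doctest: +SKIP
        >>> img_table.create_index(metric="cosine",  # doctest: +SKIP
        ...     num_partitions=256, num_sub_vectors=96)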
"""
raise NotImplementedError
def drop_index(self, name: str) -> None:
"""
Drop an index from the table.
Parameters
----------
name: str
The name of the index to drop.
Notes
-----
        This does not delete the index from disk; it just removes it from the table.
To delete the index, run [optimize][lancedb.table.Table.optimize]
after dropping the index.
Use [list_indices][lancedb.table.Table.list_indices] to find the names of
the indices.
"""
raise NotImplementedError
@abstractmethod
def create_scalar_index(
self,
column: str,
*,
replace: bool = True,
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
):
"""Create a scalar index on a column.
Parameters
----------
column : str
The column to be indexed. Must be a boolean, integer, float,
or string column.
replace : bool, default True
Replace the existing index if it exists.
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE"
The type of index to create.
Examples
--------
Scalar indices, like vector indices, can be used to speed up scans. A scalar
index can speed up scans that contain filter expressions on the indexed column.
For example, the following scan will be faster if the column ``my_col`` has
a scalar index:
>>> import lancedb # doctest: +SKIP
>>> db = lancedb.connect("/data/lance") # doctest: +SKIP
>>> img_table = db.open_table("images") # doctest: +SKIP
>>> my_df = img_table.search().where("my_col = 7", # doctest: +SKIP
... prefilter=True).to_pandas()
Scalar indices can also speed up scans containing a vector search and a
prefilter:
>>> import lancedb # doctest: +SKIP
>>> db = lancedb.connect("/data/lance") # doctest: +SKIP
>>> img_table = db.open_table("images") # doctest: +SKIP
>>> img_table.search([1, 2, 3, 4], vector_column_name="vector") # doctest: +SKIP
... .where("my_col != 7", prefilter=True)
... .to_pandas()
        Scalar indices can only speed up scans for basic filters using
        equality, comparison, range (e.g. ``my_col BETWEEN 0 AND 100``), and set
        membership (e.g. ``my_col IN (0, 1, 2)``).
        Scalar indices can be used if the filter contains multiple indexed columns and
        the filter criteria are AND'd or OR'd together
        (e.g. ``my_col < 0 AND other_col > 100``).
Scalar indices may be used if the filter contains non-indexed columns but,
depending on the structure of the filter, they may not be usable. For example,
if the column ``not_indexed`` does not have a scalar index then the filter
``my_col = 0 OR not_indexed = 1`` will not be able to use any scalar index on
``my_col``.
"""
raise NotImplementedError
def create_fts_index(
self,
field_names: Union[str, List[str]],
*,
ordering_field_names: Optional[Union[str, List[str]]] = None,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
"""Create a full-text search index on the table.
        Warning: this API is highly experimental and likely to change
        in the future.
Parameters
----------
field_names: str or list of str
The name(s) of the field to index.
            For now, this must be a single str when use_tantivy=True.
replace: bool, default False
If True, replace the existing index if it exists. Note that this is
not yet an atomic operation; the index will be temporarily
unavailable while the new index is being created.
        writer_heap_size: int, default 1GB
            Only available with use_tantivy=True.
        ordering_field_names:
            A list of unsigned type fields to index to optionally order
            results on at search time.
            Only available with use_tantivy=True.
        tokenizer_name: str, default "default"
            The tokenizer to use for the index. Can be "raw", "default", or the
            two-letter language code followed by "_stem". For English this would
            be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
with_position: bool, default True
Only available with use_tantivy=False
If False, do not store the positions of the terms in the text.
This can reduce the size of the index and improve indexing speed.
But it will raise an exception for phrase queries.
base_tokenizer : str, default "simple"
The base tokenizer to use for tokenization. Options are:
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
language : str, default "English"
The language to use for tokenization.
max_token_length : int, default 40
The maximum token length to index. Tokens longer than this length will be
ignored.
lower_case : bool, default True
Whether to convert the token to lower case. This makes queries
case-insensitive.
stem : bool, default False
Whether to stem the token. Stemming reduces words to their root form.
For example, in English "running" and "runs" would both be reduced to "run".
remove_stop_words : bool, default False
Whether to remove stop words. Stop words are common words that are often
removed from text before indexing. For example, in English "the" and "and".
ascii_folding : bool, default False
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
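
        Examples
        --------
        A sketch using the native (use_tantivy=False) implementation; the
        table and column names are illustrative:

        >>> import lancedb  # doctest: +SKIP
        >>> db = lancedb.connect("/data/lance")  # doctest: +SKIP
        >>> table = db.open_table("documents")  # doctest: +SKIP
        >>> table.create_fts_index("text", use_tantivy=False)  # doctest: +SKIP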
"""
raise NotImplementedError
@abstractmethod
def add(
self,
data: DATA,
mode: str = "append",
on_bad_vectors: str = "error",
fill_value: float = 0.0,
):
"""Add more data to the [Table](Table).
Parameters
----------
data: DATA
The data to insert into the table. Acceptable types are:
- list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
mode: str
The mode to use when writing the data. Valid values are
"append" and "overwrite".
on_bad_vectors: str, default "error"
            What to do if any of the vectors are not the same size or contain NaNs.
One of "error", "drop", "fill".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
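
        Examples
        --------
        A sketch; assumes ``table`` already exists with a compatible schema:

        >>> table.add([{"vector": [0.5, 1.3], "b": 4}])  # doctest: +SKIP
        >>> # Overwrite all existing rows instead of appending
        >>> table.add([{"vector": [0.1, 0.2], "b": 1}],  # doctest: +SKIP
        ...     mode="overwrite")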
"""
raise NotImplementedError
def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
"""
Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder]
that can be used to create a "merge insert" operation
This operation can add rows, update rows, and remove rows all in a single
transaction. It is a very generic tool that can be used to create
behaviors like "insert if not exists", "update or insert (i.e. upsert)",
or even replace a portion of existing data with new data (e.g. replace
all data where month="january")
The merge insert operation works by combining new data from a
**source table** with existing data in a **target table** by using a
join. There are three categories of records.
"Matched" records are records that exist in both the source table and
the target table. "Not matched" records exist only in the source table
(e.g. these are new data) "Not matched by source" records exist only
in the target table (this is old data)
The builder returned by this method can be used to customize what
should happen for each category of data.
Please note that the data may appear to be reordered as part of this
operation. This is because updated rows will be deleted from the
dataset and then reinserted at the end with the new values.
Parameters
----------
on: Union[str, Iterable[str]]
A column (or columns) to join on. This is how records from the
source table and target table are matched. Typically this is some
kind of key or id column.
Examples
--------
        >>> import lancedb
        >>> import pyarrow as pa
        >>> data = pa.table({"a": [2, 1, 3], "b": ["a", "b", "c"]})
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data)
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
        >>> # Perform an "upsert" operation
>>> table.merge_insert("a") \\
... .when_matched_update_all() \\
... .when_not_matched_insert_all() \\
... .execute(new_data)
>>> # The order of new rows is non-deterministic since we use
>>> # a hash-join as part of this operation and so we sort here
>>> table.to_arrow().sort_by("a").to_pandas()
a b
0 1 b
1 2 x
2 3 y
3 4 z
"""
        on = [on] if isinstance(on, str) else list(on)
return LanceMergeInsertBuilder(self, on)
@abstractmethod
def search(
self,
query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
vector_column_name: Optional[str] = None,
query_type: QueryType = "auto",
ordering_field_name: Optional[str] = None,
fts_columns: Optional[Union[str, List[str]]] = None,
) -> LanceQueryBuilder:
"""Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search]
and [full-text search][experimental-full-text-search].
All query options are defined in [Query][lancedb.query.Query].
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> data = [
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
... ]
>>> table = db.create_table("my_table", data)
>>> query = [0.4, 1.4, 2.4]
>>> (table.search(query)
... .where("original_width > 1000", prefilter=True)
... .select(["caption", "original_width", "vector"])
... .limit(2)
... .to_pandas())
caption original_width vector _distance
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
1 test 3000 [0.3, 6.2, 2.6] 23.089996
Parameters
----------
query: list/np.ndarray/str/PIL.Image.Image, default None
            The targeted vector to search for.
- *default None*.
Acceptable types are: list, np.ndarray, PIL.Image.Image
- If None then the select/where/limit clauses are applied to filter
the table
vector_column_name: str, optional
The name of the vector column to search.
The vector column needs to be a pyarrow fixed size list type
- If not specified then the vector column is inferred from
the table schema
- If the table has multiple vector columns then the *vector_column_name*
needs to be specified. Otherwise, an error is raised.
query_type: str
*default "auto"*.
Acceptable types are: "vector", "fts", "hybrid", or "auto"
- If "auto" then the query type is inferred from the query;
- If `query` is a list/np.ndarray then the query type is
"vector";
- If `query` is a PIL.Image.Image then either do vector search,
or raise an error if no corresponding embedding function is found.
            - If `query` is a string, then the query type is "vector" if the
              table has embedding functions; otherwise the query type is "fts".
Returns
-------
LanceQueryBuilder
A query builder object representing the query.
Once executed, the query returns
- selected columns
- the vector
- and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
raise NotImplementedError
@abstractmethod
def _execute_query(
self, query: Query, batch_size: Optional[int] = None
) -> pa.RecordBatchReader: ...
@abstractmethod
def _do_merge(
self,
merge: LanceMergeInsertBuilder,
new_data: DATA,
on_bad_vectors: str,
fill_value: float,
): ...
@abstractmethod
def delete(self, where: str):
"""Delete rows from the table.
This can be used to delete a single row, many rows, all rows, or
sometimes no rows (if your predicate matches nothing).
Parameters
----------
where: str
The SQL where clause to use when deleting rows.
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
The filter must not be empty, or it will error.
Examples
--------
>>> import lancedb
>>> data = [
... {"x": 1, "vector": [1.0, 2]},
... {"x": 2, "vector": [3.0, 4]},
... {"x": 3, "vector": [5.0, 6]}
... ]
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", data)
>>> table.to_pandas()
x vector
0 1 [1.0, 2.0]
1 2 [3.0, 4.0]
2 3 [5.0, 6.0]
>>> table.delete("x = 2")
>>> table.to_pandas()
x vector
0 1 [1.0, 2.0]
1 3 [5.0, 6.0]
If you have a list of values to delete, you can combine them into a
stringified list and use the `IN` operator:
>>> to_remove = [1, 5]
>>> to_remove = ", ".join([str(v) for v in to_remove])
>>> to_remove
'1, 5'
>>> table.delete(f"x IN ({to_remove})")
>>> table.to_pandas()
x vector
0 3 [5.0, 6.0]
"""
raise NotImplementedError
@abstractmethod
def update(
self,
where: Optional[str] = None,
values: Optional[dict] = None,
*,
values_sql: Optional[Dict[str, str]] = None,
):
"""
This can be used to update zero to all rows depending on how many
rows match the where clause. If no where clause is provided, then
all rows will be updated.
Either `values` or `values_sql` must be provided. You cannot provide