forked from lancedb/lancedb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrrf.py
More file actions
117 lines (101 loc) · 4.59 KB
/
rrf.py
File metadata and controls
117 lines (101 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Copyright (c) 2023. LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, List, TYPE_CHECKING
import pyarrow as pa
from collections import defaultdict
from .base import Reranker
if TYPE_CHECKING:
from ..table import LanceVectorQueryBuilder
class RRFReranker(Reranker):
    """
    Reranks the results using the Reciprocal Rank Fusion (RRF) algorithm.

    Every input result list contributes ``1 / (K + rank)`` to the score of
    each row it contains, where ``rank`` is the 1-based position of the row
    in that list. Only positions are used; the distances/scores reported by
    the individual searches are not read.

    Parameters
    ----------
    K : int, default 60
        A constant used in the RRF formula (default is 60). Experiments
        indicate that k = 60 was near-optimal, but that the choice is
        not critical. See paper:
        https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf
    return_score : str, default "relevance"
        Options are "relevance" or "all".
        The type of score to return. If "relevance", will return only the
        relevance score. If "all", will return all scores from the vector and
        FTS search along with the relevance score.

    Raises
    ------
    ValueError
        If ``K`` is not greater than 0.
    """

    def __init__(self, K: int = 60, return_score="relevance"):
        # Validate before touching the base class so a bad K never yields a
        # half-initialised reranker.
        if K <= 0:
            raise ValueError("K must be greater than 0")
        super().__init__(return_score)
        self.K = K

    def _rrf_scores(self, ranked_id_lists) -> defaultdict:
        """
        Accumulate the RRF score of every row id over several ranked lists.

        Parameters
        ----------
        ranked_id_lists : iterable of list
            One list of row ids per ranker, each ordered best-first.

        Returns
        -------
        collections.defaultdict
            Maps row id to the sum of ``1 / (rank + K)`` over all lists;
            ids that were never seen read as ``0.0``.
        """
        scores = defaultdict(float)
        for ranked_ids in ranked_id_lists:
            # Ranks restart at 1 for every list: each ranker votes on its own.
            for rank, row_id in enumerate(ranked_ids, 1):
                scores[row_id] += 1 / (rank + self.K)
        return scores

    def _score_and_sort(self, combined_results: pa.Table, rrf_score_map) -> pa.Table:
        """
        Attach ``_relevance_score`` to ``combined_results`` and sort by it.

        Parameters
        ----------
        combined_results : pa.Table
            Table holding a ``_rowid`` column.
        rrf_score_map : mapping
            Row id -> RRF score, as produced by ``_rrf_scores``.

        Returns
        -------
        pa.Table
            The table sorted by descending ``_relevance_score``; when
            ``self.score == "relevance"`` it is additionally passed through
            ``self._keep_relevance_score``.
        """
        row_ids = combined_results["_rowid"].to_pylist()
        relevance_scores = [rrf_score_map[row_id] for row_id in row_ids]
        combined_results = combined_results.append_column(
            "_relevance_score", pa.array(relevance_scores, type=pa.float32())
        )
        combined_results = combined_results.sort_by(
            [("_relevance_score", "descending")]
        )
        if self.score == "relevance":
            combined_results = self._keep_relevance_score(combined_results)
        return combined_results

    def rerank_hybrid(
        self,
        query: str,  # noqa: F821
        vector_results: pa.Table,
        fts_results: pa.Table,
    ):
        """
        Fuse vector and FTS results with RRF.

        Parameters
        ----------
        query : str
            Not read by this method; present for interface compatibility.
        vector_results : pa.Table
            Vector search results, ordered best-first, with a ``_rowid``
            column. A falsy (e.g. zero-row) table contributes no ranks.
        fts_results : pa.Table
            Full-text search results, same requirements as above.

        Returns
        -------
        pa.Table
            The output of ``self.merge_results`` with a ``_relevance_score``
            column added, sorted by descending relevance.
        """
        # A falsy table may not even have a `_rowid` column (see the
        # placeholder built in `rerank_multivector`), so guard the lookup.
        vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
        fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []

        # Calculate RRF score of each result
        rrf_score_map = self._rrf_scores([vector_ids, fts_ids])

        # Sort the results based on RRF score
        combined_results = self.merge_results(vector_results, fts_results)
        return self._score_and_sort(combined_results, rrf_score_map)

    def rerank_multivector(
        self,
        vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
        query: str = None,
        deduplicate: bool = True,  # noqa: F821 # TODO: automatically deduplicates
    ):
        """
        Overridden method to rerank the results from multiple vector searches.
        This leverages the RRF algorithm to combine the results from multiple
        vector searches as it doesn't support reranking vector results
        individually.

        Parameters
        ----------
        vector_results : list of pa.Table or list of LanceVectorQueryBuilder
            One entry per vector search, each ordered best-first. Query
            builders are materialised with ``to_arrow()``. Every table must
            have a ``_rowid`` column.
        query : str, optional
            Not read by this method.
        deduplicate : bool, default True
            Not read by this method (see the TODO on the parameter).

        Returns
        -------
        pa.Table
            All results combined, with a ``_relevance_score`` column, sorted
            by descending relevance.

        Raises
        ------
        ValueError
            If ``vector_results`` is empty, mixes element types, holds
            unsupported types, or any table lacks ``_rowid``.
        """
        # `vector_results[0]` is inspected below; fail with a clear message
        # instead of an IndexError when nothing was passed.
        if not vector_results:
            raise ValueError("vector_results should contain at least one result")

        # Make sure all elements are of the same type
        first_type = type(vector_results[0])
        if not all(isinstance(v, first_type) for v in vector_results):
            raise ValueError(
                "All elements in vector_results should be of the same type"
            )

        # avoid circular import: compare by class name instead of importing
        # LanceVectorQueryBuilder at runtime
        if first_type.__name__ == "LanceVectorQueryBuilder":
            vector_results = [result.to_arrow() for result in vector_results]
        elif not isinstance(vector_results[0], pa.Table):
            raise ValueError(
                "vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
            )

        # _rowid is required for RRF reranking
        if not all("_rowid" in result.column_names for result in vector_results):
            # Implicit concatenation keeps source indentation out of the message.
            raise ValueError(
                "'_rowid' is required for deduplication. "
                "add _rowid to search results like this: "
                "`search().with_row_id(True)`"
            )

        # Rank every search on its own *before* concatenating. Ranking the
        # concatenated table would give the best hit of the second search
        # rank len(first) + 1 and penalise it purely for its list position.
        rrf_score_map = self._rrf_scores(
            result["_rowid"].to_pylist() for result in vector_results
        )

        combined = pa.concat_tables(vector_results, **self._concat_tables_args)
        # Placeholder for the (absent) FTS side expected by `merge_results`.
        empty_table = pa.Table.from_arrays([], names=[])
        merged = self.merge_results(combined, empty_table)
        return self._score_and_sort(merged, rrf_score_map)