Skip to content

Commit dc45bbd

Browse files
authored
Merge pull request #173 from cul-it/bug/ARXIVNG-557
ARXIVNG-549 suppress stopwords in author search
2 parents 06e07b1 + 7e88855 commit dc45bbd

File tree

3 files changed

+67
-18
lines changed

3 files changed

+67
-18
lines changed

search/services/index/authors.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,32 @@
11
"""Query-builders and helpers for searching by author name."""
22

33
from typing import Tuple, Optional, List
4+
import re
5+
from string import punctuation
46
from elasticsearch_dsl import Search, Q, SF
57
from .util import wildcardEscape, is_literal_query, Q_, escape
68

79

10+
# Lower-case stopwords that ``_remove_stopwords`` strips from author query
# terms before they are used in the cross-field author match.
STOP = ["and", "or", "the", "of", "a", "for", "an"]
11+
12+
13+
# TODO: remove this when we address the author name bug in
14+
# search.process.transform.
15+
def _strip_punctuation(s: str) -> str:
16+
return ''.join([c for c in s if c not in punctuation])
17+
18+
19+
# TODO: revisit author name indexing in document mappings.
20+
# Ideally stopwords would be removed at index time, but authors are indexed
21+
# as keywords which makes that difficult.
22+
def _remove_stopwords(term: str) -> str:
23+
"""Remove common stopwords that will match on institutions."""
24+
_term = str(term)
25+
for stopword in STOP:
26+
_term =re.sub(f"(^|\s+){stopword}(\s+|$)", " ", _term)
27+
return _term
28+
29+
830
def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
931
"""Parse a name string into its (likely) constituent parts."""
1032
# We interpret the comma as separating the surname from the forename.
@@ -39,17 +61,25 @@ def construct_author_query(term: str) -> Q:
3961
au_name, has_wildcard = wildcardEscape(au_name)
4062
au_safe = au_name.replace('*', '').replace('?', '').replace('"', '')
4163
surname_safe, forename_safe = _parseName(au_safe)
64+
4265
if forename_safe is not None:
66+
# TODO: remove this when the author name bug is fixed in
67+
# search.process.transform. Since we are erroneously removing
68+
# punctuation from author names prior to indexing, it's important
69+
# to do the same here so that results are returned.
70+
forename_safe = _strip_punctuation(forename_safe)
71+
4372
fullname_safe = f'{forename_safe} {surname_safe}'
4473
else:
4574
fullname_safe = surname_safe
4675
_q = (
4776
# Matching on keyword field is effectively an exact match.
4877
Q('match', **{
4978
'authors__full_name__exact': {
50-
'query': fullname_safe, 'boost': 10
51-
}
79+
'query': fullname_safe, 'boost': 30
80+
},
5281
})
82+
5383
# The next best case is that the query is a substring of
5484
# the full name.
5585
| Q('match_phrase', **{
@@ -58,9 +88,14 @@ def construct_author_query(term: str) -> Q:
5888
)
5989
if not is_literal_query(term):
6090
# Search across all authors, and prefer documents for which a
61-
# greater number of authors respond.
62-
_q |= Q('multi_match', fields=['authors*'], query=term, boost=20,
63-
type="cross_fields")
91+
# greater number of authors respond. For this part of the search
92+
# we want to avoid artificially high scores when only initials
93+
# match, so we drop solo characters from the query.
94+
term_sans_inits = ' '.join(part for part in
95+
_remove_stopwords(term).split()
96+
if len(part) > 1)
97+
_q |= Q('multi_match', fields=['authors.full_name'],
98+
query=term_sans_inits, boost=8, type="cross_fields")
6499
# We support wildcards (?*) within each author name. Since
65100
# ES will treat the non-wildcard part of the term as a literal,
66101
# we need to apply each word in the name separately.

search/services/index/prepare.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,18 @@ def _field_term_to_q(field: str, term: str) -> Q:
8383
# prefer them to partial matches within TeXisms.
8484
if is_tex_query(term):
8585
return Q("match", **{f'{field}.tex': {'query': term, 'boost': 2}})
86+
87+
# "english" fields are analyzed with the English stoplist, so they're
88+
# safe for all kinds of searches.
89+
_fields = [f'{field}.english', f'{field}_utf8__english']
90+
91+
# If this is a literal query, however, we should search against the
92+
# base field, too.
93+
if is_literal_query(term_sans_tex):
94+
_fields += [field, f'{field}_utf8']
95+
8696
q = (
87-
Q("query_string", fields=[
88-
field,
89-
f'{field}_utf8',
90-
f'{field}__english',
91-
f'{field}_utf8__english'
92-
],
97+
Q("query_string", fields=_fields,
9398
default_operator='AND',
9499
analyze_wildcard=True,
95100
allow_leading_wildcard=False,
@@ -193,8 +198,8 @@ def _fielded_terms_to_q(query: AdvancedQuery) -> Match:
193198
# authors respond and also titles (and, to a lesser extent,
194199
# abstracts) respond.
195200
q |= Q("multi_match",
196-
fields=["title*^30", "abstract*^10", "authors*"],
197-
query=escape(term.term), boost=4, type="cross_fields")
201+
fields=["title.english^30", "abstract.english*^10"],
202+
query=term.term, boost=4, type="cross_fields")
198203
else:
199204
q = _field_term_to_q(term.field, term.term)
200205

@@ -212,17 +217,17 @@ def simple(search: Search, query: SimpleQuery) -> Search:
212217
q_ar = [_field_term_to_q(field, query.value)
213218
for field in use]
214219
q = reduce(ior, q_ar)
220+
215221
if not is_literal_query(query.value):
216222
# When searching in "all fields", users will include terms from
217223
# various different fields. This additional multi-match treats
218-
# title, abstract, and authors as one big field, and boosts
224+
# title and abstract as one big field, and boosts
219225
# matching results. Since authors get boosted strongly elsewhere,
220226
# this effectively surfaces results for which authors respond and
221227
# also titles (and, to a lesser extent, abstracts) respond.
222228
q |= Q("multi_match",
223-
fields=["title*^30", "abstract*^10", "authors*"],
229+
fields=["title.english^30", "abstract.english*^10"],
224230
query=query.value, boost=4, type="cross_fields")
225-
pass
226231
else:
227232
q = _field_term_to_q(query.search_field, query.value)
228233
search = search.query(q)
@@ -261,6 +266,7 @@ def highlight(search: Search) -> Search:
261266
post_tags=[HIGHLIGHT_TAG_CLOSE]
262267
)
263268
search = search.highlight('title', type='plain', number_of_fragments=0)
269+
search = search.highlight('title.english', type='plain', number_of_fragments=0)
264270
search = search.highlight('title.tex', type='plain', number_of_fragments=0)
265271
search = search.highlight('title_utf8', type='plain',
266272
number_of_fragments=0)
@@ -279,4 +285,6 @@ def highlight(search: Search) -> Search:
279285
search = search.highlight('abstract', type='plain', number_of_fragments=0)
280286
search = search.highlight('abstract.tex', type='plain',
281287
number_of_fragments=0)
288+
search = search.highlight('abstract.english', type='plain',
289+
number_of_fragments=0)
282290
return search

search/services/index/results.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ def _add_highlighting(result: dict, raw: Response) -> dict:
148148
# To guard against this while preserving highlighting, we move
149149
# any highlighting tags from within TeXisms to encapsulate the
150150
# entire TeXism.
151-
if field in ['title', 'title_utf8', 'abstract']:
151+
if field in ['title', 'title_utf8', 'title.english', 'abstract',
152+
'abstract.english']:
152153
value = _highlight_whole_texism(value)
153154

154155
# A hit on authors may originate in several different fields, most
@@ -168,13 +169,18 @@ def _add_highlighting(result: dict, raw: Response) -> dict:
168169
result['highlight'][field] = \
169170
result['highlight'].pop(f'{field}.tex')
170171

171-
for field in ['abstract.tex', 'abstract_utf8', 'abstract']:
172+
for field in ['abstract.tex', 'abstract.english', 'abstract_utf8',
173+
'abstract']:
172174
if field in result['highlight']:
173175
value = result['highlight'][field]
174176
abstract_snippet = _preview(value)
175177
result['preview']['abstract'] = abstract_snippet
176178
result['highlight']['abstract'] = value
177179
break
180+
for field in ['title.english', 'title_utf8', 'title']:
181+
if field in result['highlight']:
182+
result['highlight']['title'] = result['highlight'][field]
183+
break
178184
return result
179185

180186

0 commit comments

Comments
 (0)