Merge pull request #173 from cul-it/bug/ARXIVNG-557

erickpeirson · web-flow · commit dc45bbd1abb3 · 2018-04-26T10:25:29.000-04:00
ARXIVNG-549 suppress stopwords in author search
diff --git a/search/services/index/authors.py b/search/services/index/authors.py
@@ -1,10 +1,32 @@
 """Query-builders and helpers for searching by author name."""
 
 from typing import Tuple, Optional, List
+import re
+from string import punctuation
 from elasticsearch_dsl import Search, Q, SF
 from .util import wildcardEscape, is_literal_query, Q_, escape
 
 
+STOP = ["and", "or", "the", "of", "a", "for", "an"]
+
+
+# TODO: remove this when we address the author name bug in
+# search.process.transform.
+def _strip_punctuation(s: str) -> str:
+    return s.translate(str.maketrans('', '', punctuation))  # ASCII set only
+
+
+# TODO: revisit author name indexing in document mappings.
+# Ideally stopwords would be removed at index time, but authors are indexed
+# as keywords which makes that difficult.
+def _remove_stopwords(term: str) -> str:
+    """Remove common stopwords that will match on institutions."""
+    _term = str(term)
+    for stopword in STOP:  # Each whole-word hit collapses to one space.
+        _term = re.sub(rf"(^|\s+){stopword}(\s+|$)", " ", _term)
+    return _term
+
+
 def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
     """Parse a name string into its (likely) constituent parts."""
     # We interpret the comma as separating the surname from the forename.
@@ -39,17 +61,25 @@ def construct_author_query(term: str) -> Q:
         au_name, has_wildcard = wildcardEscape(au_name)
         au_safe = au_name.replace('*', '').replace('?', '').replace('"', '')
         surname_safe, forename_safe = _parseName(au_safe)
+
         if forename_safe is not None:
+            # TODO: remove this when the author name bug is fixed in
+            # search.process.transform. Since we are erroneously removing
+            # punctuation from author names prior to indexing, it's important
+            # to do the same here so that results are returned.
+            forename_safe = _strip_punctuation(forename_safe)
+
             fullname_safe = f'{forename_safe} {surname_safe}'
         else:
             fullname_safe = surname_safe
         _q = (
             # Matching on keyword field is effectively an exact match.
             Q('match', **{
                 'authors__full_name__exact': {
-                    'query': fullname_safe, 'boost': 10
-                }
+                    'query': fullname_safe, 'boost': 30
+                },
             })
+
             # The next best case is that the query is a substring of
             #  the full name.
             | Q('match_phrase', **{
@@ -58,9 +88,14 @@ def construct_author_query(term: str) -> Q:
         )
         if not is_literal_query(term):
             # Search across all authors, and prefer documents for which a
-            # greater number of authors respond.
-            _q |= Q('multi_match', fields=['authors*'], query=term, boost=20,
-                    type="cross_fields")
+            # greater number of authors respond. For this part of the search
+            # we want to avoid artificially high scores when only initials
+            # match, so we drop solo characters from the query.
+            term_sans_inits = ' '.join(part for part in
+                                       _remove_stopwords(term).split()
+                                       if len(part) > 1)
+            _q |= Q('multi_match', fields=['authors.full_name'],
+                    query=term_sans_inits, boost=8, type="cross_fields")
             # We support wildcards (?*) within each author name. Since
             # ES will treat the non-wildcard part of the term as a literal,
             # we need to apply each word in the name separately.
diff --git a/search/services/index/prepare.py b/search/services/index/prepare.py
@@ -83,13 +83,18 @@ def _field_term_to_q(field: str, term: str) -> Q:
         # prefer them to partial matches within TeXisms.
         if is_tex_query(term):
             return Q("match", **{f'{field}.tex': {'query': term, 'boost': 2}})
+
+        # "english" fields are analyzed with the English stoplist, so they're
+        # safe for all kinds of searches.
+        _fields = [f'{field}.english', f'{field}_utf8__english']
+
+        # If this is a literal query, however, we should search against the
+        # base field, too.
+        if is_literal_query(term_sans_tex):
+            _fields += [field, f'{field}_utf8']
+
         q = (
-            Q("query_string", fields=[
-                field,
-                f'{field}_utf8',
-                f'{field}__english',
-                f'{field}_utf8__english'
-              ],
+            Q("query_string", fields=_fields,
               default_operator='AND',
               analyze_wildcard=True,
               allow_leading_wildcard=False,
@@ -193,8 +198,8 @@ def _fielded_terms_to_q(query: AdvancedQuery) -> Match:
                 # authors respond and also titles (and, to a lesser extent,
                 # abstracts) respond.
                 q |= Q("multi_match",
-                       fields=["title*^30", "abstract*^10", "authors*"],
-                       query=escape(term.term), boost=4, type="cross_fields")
+                       fields=["title.english^30", "abstract.english*^10"],
+                       query=term.term, boost=4, type="cross_fields")
         else:
             q = _field_term_to_q(term.field, term.term)
 
@@ -212,17 +217,17 @@ def simple(search: Search, query: SimpleQuery) -> Search:
         q_ar = [_field_term_to_q(field, query.value)
                 for field in use]
         q = reduce(ior, q_ar)
+
         if not is_literal_query(query.value):
             # When searching in "all fields", users will include terms from
             # various different fields. This additional multi-match treats
-            # title, abstract, and authors as one big field, and boosts
+            # title and abstract as one big field, and boosts
             # matching results. Since authors get boosted strongly elsewhere,
             # this effectively surfaces results for which authors respond and
             # also titles (and, to a lesser extent, abstracts) respond.
             q |= Q("multi_match",
-                   fields=["title*^30", "abstract*^10", "authors*"],
+                   fields=["title.english^30", "abstract.english*^10"],
                    query=query.value, boost=4, type="cross_fields")
-            pass
     else:
         q = _field_term_to_q(query.search_field, query.value)
     search = search.query(q)
@@ -261,6 +266,7 @@ def highlight(search: Search) -> Search:
         post_tags=[HIGHLIGHT_TAG_CLOSE]
     )
     search = search.highlight('title', type='plain', number_of_fragments=0)
+    search = search.highlight('title.english', type='plain', number_of_fragments=0)
     search = search.highlight('title.tex', type='plain', number_of_fragments=0)
     search = search.highlight('title_utf8', type='plain',
                               number_of_fragments=0)
@@ -279,4 +285,6 @@ def highlight(search: Search) -> Search:
     search = search.highlight('abstract', type='plain', number_of_fragments=0)
     search = search.highlight('abstract.tex', type='plain',
                               number_of_fragments=0)
+    search = search.highlight('abstract.english', type='plain',
+                               number_of_fragments=0)
     return search
diff --git a/search/services/index/results.py b/search/services/index/results.py
@@ -148,7 +148,8 @@ def _add_highlighting(result: dict, raw: Response) -> dict:
         # To guard against this while preserving highlighting, we move
         # any highlighting tags from within TeXisms to encapsulate the
         # entire TeXism.
-        if field in ['title', 'title_utf8', 'abstract']:
+        if field in ['title', 'title_utf8', 'title.english', 'abstract',
+                     'abstract.english']:
             value = _highlight_whole_texism(value)
 
         # A hit on authors may originate in several different fields, most
@@ -168,13 +169,18 @@ def _add_highlighting(result: dict, raw: Response) -> dict:
             result['highlight'][field] = \
                 result['highlight'].pop(f'{field}.tex')
 
-    for field in ['abstract.tex', 'abstract_utf8', 'abstract']:
+    for field in ['abstract.tex', 'abstract.english', 'abstract_utf8',
+                  'abstract']:
         if field in result['highlight']:
             value = result['highlight'][field]
             abstract_snippet = _preview(value)
             result['preview']['abstract'] = abstract_snippet
             result['highlight']['abstract'] = value
             break
+    for field in ['title.english', 'title_utf8', 'title']:
+        if field in result['highlight']:
+            result['highlight']['title'] = result['highlight'][field]
+            break
     return result