arXiv
diff --git a/‎search/config.py‎
Lines changed: 2 additions & 2 deletions b/‎search/config.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎search/services/index/authors.py‎
Lines changed: 1 addition & 0 deletions b/‎search/services/index/authors.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎search/services/index/highlighting.py‎
Lines changed: 10 additions & 1 deletion b/‎search/services/index/highlighting.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎search/services/index/prepare.py‎
Lines changed: 54 additions & 14 deletions b/‎search/services/index/prepare.py‎
Lines changed: 54 additions & 14 deletions
diff --git a/‎search/services/index/tests/test_util.py‎
Lines changed: 42 additions & 0 deletions b/‎search/services/index/tests/test_util.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎search/services/index/util.py‎
Lines changed: 37 additions & 0 deletions b/‎search/services/index/util.py‎
Lines changed: 37 additions & 0 deletions
@@ -224,8 +224,8 @@
 FLASKS3_ACTIVE = os.environ.get('FLASKS3_ACTIVE', 0)
 
 # Settings for display of release information
-RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/fjs2FQ'
-RELEASE_NOTES_TEXT = 'Search v0.2 released 2018-05-04'
+RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/mBtOFQ'
+RELEASE_NOTES_TEXT = 'Search v0.3 released 2018-05-14'
 
 
 # TODO: one place to set the version, update release notes text, JIRA issue
 
@@ -221,6 +221,7 @@ def author_query(term: str, operator: str = 'AND') -> Q:
 
 def author_id_query(term: str, operator: str = 'and') -> Q:
     """Generate a query part for Author ID using the ES DSL."""
+    term = term.lower()     # Just in case.
     if operator == 'or':
         return (
             Q("nested", path="owners",
 
@@ -50,6 +50,8 @@ def highlight(search: Search) -> Search:
     search = search.highlight('comments', number_of_fragments=0)
     # Highlight any field the name of which begins with "author".
     search = search.highlight('author*')
+    search = search.highlight('owner*')
+    search = search.highlight('submitter*')
     search = search.highlight('journal_ref', type='plain')
     search = search.highlight('acm_class', number_of_fragments=0)
     search = search.highlight('msc_class', number_of_fragments=0)
@@ -63,6 +65,9 @@ def highlight(search: Search) -> Search:
                               number_of_fragments=0)
     search = search.highlight('abstract.english', type='plain',
                               number_of_fragments=0)
+
+    search = search.highlight('primary_classification*', type='plain',
+                              number_of_fragments=0)
     return search
 
 
@@ -164,6 +169,9 @@ def add_highlighting(result: dict, raw: Response) -> dict:
         if hasattr(value, '__iter__'):
             value = '&hellip;'.join(value)
 
+        if 'primary_classification' in field:
+            field = 'primary_classification'
+
         # Non-TeX searches may hit inside of TeXisms. Highlighting those
         # fragments (i.e. inserting HTML) will break MathJax rendering.
         # To guard against this while preserving highlighting, we move
@@ -178,7 +186,8 @@ def add_highlighting(result: dict, raw: Response) -> dict:
         # truncated. So instead of highlighting author names themselves, we
         # set a 'flag' that can get picked up in the template and highlight
         # the entire author field.
-        if field.startswith('author'):
+        if field.startswith('author') or field.startswith('owner') \
+                or field.startswith('submitter'):
             field = 'author'
             value = True
         result['highlight'][field] = value
 
@@ -7,7 +7,7 @@
 See :func:`._query_all_fields` for information on how results are scored.
 """
 
-from typing import Any, List, Tuple, Callable, Dict
+from typing import Any, List, Tuple, Callable, Dict, Optional
 from functools import reduce, wraps
 from operator import ior, iand
 import re
@@ -19,7 +19,7 @@
 
 from search.domain import SimpleQuery, Query, AdvancedQuery, Classification
 from .util import strip_tex, Q_, is_tex_query, is_literal_query, escape, \
-    wildcardEscape, remove_single_characters, has_wildcard
+    wildcardEscape, remove_single_characters, has_wildcard, match_date_partial
 from .highlighting import HIGHLIGHT_TAG_OPEN, HIGHLIGHT_TAG_CLOSE
 from .authors import author_query, author_id_query, orcid_query
 
@@ -85,6 +85,19 @@ def _query_doi(term: str, operator: str = 'and') -> Q:
 
 
 def _query_primary(term: str, operator: str = 'and') -> Q:
+    # In the 'or' case, we're basically just looking for hit highlighting
+    # after a match on the combined field. Since primary classification fields
+    # are keyword fields, they won't match the same way as the combined field
+    # (text). So we have to be a bit fuzzy here to get the highlight.
+    # TODO: in a future version, we should consider changes to the mappings
+    # to make this more straightforward.
+    if operator == 'or':
+        return reduce(ior, [(
+            Q("match", **{"primary_classification__category__id": {"query": part, "operator": operator}})
+            | Q("wildcard", **{"primary_classification.category.name": f"*{part}*"})
+            | Q("match", **{"primary_classification__archive__id": {"query": part, "operator": operator}})
+            | Q("wildcard", **{"primary_classification.archive.name": f"*{part}*"})
+        ) for part in term.split()])
     return (
         Q("match", **{"primary_classification__category__id": {"query": term, "operator": operator}})
         | Q("match", **{"primary_classification__category__name": {"query": term, "operator": operator}})
@@ -94,8 +107,21 @@ def _query_primary(term: str, operator: str = 'and') -> Q:
 
 
 def _query_paper_id(term: str, operator: str = 'and') -> Q:
+    """
+    Generate a query part that matches ``term`` against paper identifiers.
+
+    Parameters
+    ----------
+    term : str
+        Search term; it is passed through :func:`.escape` before use.
+    operator : str
+        Match operator (``'and'`` or ``'or'``); it is lower-cased here, so
+        callers may pass it in any case.
+
+    Returns
+    -------
+    :class:`.Q`
+        A ``match`` on ``paper_id`` OR'd with a ``match`` on ``paper_id_v``.
+
+    """
-    return (Q_('match', 'paper_id', escape(term), operator=operator)
-            | Q_('match', 'paper_id_v', escape(term), operator=operator))
+    # Callers pass the operator in either case (e.g. 'or', 'AND'), so
+    # normalize it before building the query.
+    operator = operator.lower()
+    logger.debug(f'query paper ID with: {term}')
+    # NOTE(review): ``paper_id_v`` is presumably the versioned identifier;
+    # confirm against the index mapping.
+    q = (Q_('match', 'paper_id', escape(term), operator=operator)
+         | Q_('match', 'paper_id_v', escape(term), operator=operator))
+    return q
+
+
+def _query_combined(term: str) -> Q:
+    """
+    Generate a ``query_string`` query for ``term`` on the ``combined`` field.
+
+    The term is lower-cased, every part of it must match
+    (``default_operator='AND'``), and leading wildcards are not allowed.
+
+    Parameters
+    ----------
+    term : str
+        Search term.
+
+    Returns
+    -------
+    :class:`.Q`
+
+    """
+    # Only wildcards in literals should be escaped.
+    # NOTE: the local ``has_wildcard`` flag shadows the helper of the same
+    # name imported from ``.util``; inside this function it is the bool
+    # returned by ``wildcardEscape``.
+    wildcard_escaped, has_wildcard = wildcardEscape(term)
+    query_term = (wildcard_escaped if has_wildcard else escape(term)).lower()
+
+    # All terms must match in the combined field.
+    return Q("query_string", fields=['combined'], default_operator='AND',
+             allow_leading_wildcard=False, query=query_term)
 
 
 def _query_all_fields(term: str) -> Q:
@@ -144,20 +170,26 @@ def _query_all_fields(term: str) -> Q:
     if is_tex_query(term):
         return _tex_query('title', term) | _tex_query('abstract', term)
 
-    # Only wildcards in literals should be escaped.
-    wildcard_escaped, has_wildcard = wildcardEscape(term)
-    query_term = wildcard_escaped if has_wildcard else escape(term)
-
-    # All terms must match in the combined field.
-    _query = query_term.lower()
-    match_all_fields = Q("query_string", fields=['combined'],
-                         default_operator='AND',
-                         allow_leading_wildcard=False,
-                         query=escape(_query))
+    date_partial: Optional[str] = None
+    remainder: Optional[str] = None
+    try:
+        date_partial, remainder = match_date_partial(term)
+        logger.debug(f'found date partial: {date_partial}')
+    except ValueError:
+        pass
+    logger.debug(f'partial: {date_partial}; rem: {remainder}')
+
+    match_all_fields = _query_combined(term)
+    if date_partial:
+        _q = Q("term", announced_date_first=date_partial)
+        if remainder:
+            _q &= _query_combined(remainder)
+        match_all_fields |= _q
 
     # We include matches of any term in any field, so that we can highlight
     # and score appropriately.
     queries = [
+        _query_paper_id(term, operator='or'),
         author_query(term, operator='OR'),
         _query_title(term, default_operator='or'),
         _query_abstract(term, default_operator='or'),
@@ -171,7 +203,14 @@ def _query_all_fields(term: str) -> Q:
         _query_msc_class(term, operator='or'),
         _query_primary(term, operator='or')
     ]
+
+    if date_partial:
+        queries.insert(0, Q("term", announced_date_first=date_partial))
+
+    # If the whole query matches on a specific field, we should consider that
+    # responsive even if the query on the combined field does not respond.
     conj_queries = [
+        _query_paper_id(term, operator='AND'),
         author_query(term, operator='AND'),
         _query_title(term, default_operator='and'),
         _query_abstract(term, default_operator='and'),
@@ -185,6 +224,7 @@ def _query_all_fields(term: str) -> Q:
         _query_msc_class(term, operator='and'),
         _query_primary(term, operator='and')
     ]
+
     query = (match_all_fields | reduce(ior, conj_queries))
     query &= Q("bool", should=queries)  # Partial matches across fields.
     scores = [SF({'weight': i + 1, 'filter': q})
 
@@ -0,0 +1,42 @@
+"""Tests for :mod:`search.services.index.util`."""
+
+from unittest import TestCase
+
+from search.services.index import util
+
+
+class TestMatchDatePartial(TestCase):
+    """Tests for :func:`.index.util.match_date_partial`."""
+
+    def test_date_partial_only(self):
+        """Term includes only a four-digit date partial."""
+        term = '1902'
+        ym, rmd = util.match_date_partial(term)
+        # Two-digit years below 91 are read as 20xx.
+        self.assertEqual(ym, '2019-02')
+        self.assertEqual(rmd, '', "Should have no remainder")
+
+    def test_in_word(self):
+        """A false positive in a word."""
+        # The digits are not delimited by whitespace or a string boundary,
+        # so they must not be treated as a date partial.
+        term = 'notasearch1902foradatepartial'
+        with self.assertRaises(ValueError):
+            util.match_date_partial(term)
+
+    def test_near_words(self):
+        """Term includes date partial plus other terms."""
+        term = 'foo 1902 bar'
+        ym, rmd = util.match_date_partial(term)
+        self.assertEqual(ym, '2019-02')
+        # The partial is cut out and the remaining words are re-joined with
+        # a single space.
+        self.assertEqual(rmd, "foo bar", "Should have remainder")
+
+    def test_out_of_range(self):
+        """Term looks like a date partial, but is not a valid date."""
+        # '99' is not a month in the range 01-12.
+        term = '0699'
+        with self.assertRaises(ValueError):
+            util.match_date_partial(term)
+
+    def test_last_millenium(self):
+        """Term is for a pre-2000 paper."""
+        term = 'old paper 9505'
+        ym, rmd = util.match_date_partial(term)
+        # Two-digit years of 91 and above are read as 19xx.
+        self.assertEqual(ym, '1995-05')
+        self.assertEqual(rmd, 'old paper', 'Should have a remainder')
@@ -24,6 +24,9 @@
                       '}', '[', ']', '^', '~', ':', '\\', '/', '-']
 DEFAULT_SORT = ['-announced_date_first', '_doc']
 
+DATE_PARTIAL = r"(?:^|[\s])(\d{2})((?:0[1-9]{1})|(?:1[0-2]{1}))(?:$|[\s])"
+"""Used to match parts of paper IDs that encode the announcement date."""
+
 
 def wildcardEscape(querystring: str) -> Tuple[str, bool]:
     """
@@ -122,3 +125,37 @@ def sort(query: Query, search: Search) -> Search:
     if sort_params is not None:
         search = search.sort(*sort_params)
     return search
+
+
+def match_date_partial(term: str) -> Tuple[str, str]:
+    """
+    Attempt to find a four-digit ID date partial (year + month).
+
+    This can be used to search for papers by announcement date.
+
+    The partial is two digits of year followed by a month in the range
+    ``01``-``12``, and must be delimited on both sides by whitespace or by
+    the start/end of `term`. If `term` contains several candidates, only the
+    first one is used. Years ``91``-``99`` are interpreted as 19xx; all other
+    years as 20xx.
+
+    Parameters
+    ----------
+    term : str
+        Search term.
+
+    Returns
+    -------
+    tuple
+        First element is date (str) in `yyyy-MM` format, second element is the
+        remainder of `term` (without the partial), with runs of whitespace
+        collapsed to a single space and leading/trailing whitespace removed.
+
+    Raises
+    ------
+    ValueError
+        Raised if no date partial is found in `term`.
+
+    """
+    match = re.search(DATE_PARTIAL, term)
+    if match:
+        year, month = match.groups()
+        # This should be fine until 2091.
+        century = 19 if int(year) >= 91 else 20
+        date_partial = f"{century}{year}-{month}"   # year_month format in ES.
+        # The match span includes the delimiting whitespace, so put a space
+        # back between the two halves to keep neighbouring words apart.
+        remainder = term[:match.start()] + " " + term[match.end():]
+        return date_partial, re.sub(r"\s+", " ", remainder).strip()
+    raise ValueError('Does not include an ID date partial')