Skip to content

Commit 693c0ff

Browse files
authored
Merge pull request #190 from cul-it/bug/ARXIVNG-811
ARXIVNG-811 ARXIVNG-819 announcement date queries, author IDs
2 parents 9717326 + 6f82198 commit 693c0ff

File tree

12 files changed

+297
-23
lines changed

12 files changed

+297
-23
lines changed

search/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,8 @@
224224
FLASKS3_ACTIVE = os.environ.get('FLASKS3_ACTIVE', 0)
225225

226226
# Settings for display of release information
227-
RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/fjs2FQ'
228-
RELEASE_NOTES_TEXT = 'Search v0.2 released 2018-05-04'
227+
RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/mBtOFQ'
228+
RELEASE_NOTES_TEXT = 'Search v0.3 released 2018-05-14'
229229

230230

231231
# TODO: one place to set the version, update release notes text, JIRA issue

search/services/index/authors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def author_query(term: str, operator: str = 'AND') -> Q:
221221

222222
def author_id_query(term: str, operator: str = 'and') -> Q:
223223
"""Generate a query part for Author ID using the ES DSL."""
224+
term = term.lower() # Just in case.
224225
if operator == 'or':
225226
return (
226227
Q("nested", path="owners",

search/services/index/highlighting.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def highlight(search: Search) -> Search:
5050
search = search.highlight('comments', number_of_fragments=0)
5151
# Highlight any field the name of which begins with "author", "owner", or "submitter".
5252
search = search.highlight('author*')
53+
search = search.highlight('owner*')
54+
search = search.highlight('submitter*')
5355
search = search.highlight('journal_ref', type='plain')
5456
search = search.highlight('acm_class', number_of_fragments=0)
5557
search = search.highlight('msc_class', number_of_fragments=0)
@@ -63,6 +65,9 @@ def highlight(search: Search) -> Search:
6365
number_of_fragments=0)
6466
search = search.highlight('abstract.english', type='plain',
6567
number_of_fragments=0)
68+
69+
search = search.highlight('primary_classification*', type='plain',
70+
number_of_fragments=0)
6671
return search
6772

6873

@@ -164,6 +169,9 @@ def add_highlighting(result: dict, raw: Response) -> dict:
164169
if hasattr(value, '__iter__'):
165170
value = '…'.join(value)
166171

172+
if 'primary_classification' in field:
173+
field = 'primary_classification'
174+
167175
# Non-TeX searches may hit inside of TeXisms. Highlighting those
168176
# fragments (i.e. inserting HTML) will break MathJax rendering.
169177
# To guard against this while preserving highlighting, we move
@@ -178,7 +186,8 @@ def add_highlighting(result: dict, raw: Response) -> dict:
178186
# truncated. So instead of highlighting author names themselves, we
179187
# set a 'flag' that can get picked up in the template and highlight
180188
# the entire author field.
181-
if field.startswith('author'):
189+
if field.startswith('author') or field.startswith('owner') \
190+
or field.startswith('submitter'):
182191
field = 'author'
183192
value = True
184193
result['highlight'][field] = value

search/services/index/prepare.py

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
See :func:`._query_all_fields` for information on how results are scored.
88
"""
99

10-
from typing import Any, List, Tuple, Callable, Dict
10+
from typing import Any, List, Tuple, Callable, Dict, Optional
1111
from functools import reduce, wraps
1212
from operator import ior, iand
1313
import re
@@ -19,7 +19,7 @@
1919

2020
from search.domain import SimpleQuery, Query, AdvancedQuery, Classification
2121
from .util import strip_tex, Q_, is_tex_query, is_literal_query, escape, \
22-
wildcardEscape, remove_single_characters, has_wildcard
22+
wildcardEscape, remove_single_characters, has_wildcard, match_date_partial
2323
from .highlighting import HIGHLIGHT_TAG_OPEN, HIGHLIGHT_TAG_CLOSE
2424
from .authors import author_query, author_id_query, orcid_query
2525

@@ -85,6 +85,19 @@ def _query_doi(term: str, operator: str = 'and') -> Q:
8585

8686

8787
def _query_primary(term: str, operator: str = 'and') -> Q:
88+
# In the 'or' case, we're basically just looking for hit highlighting
89+
# after a match on the combined field. Since primary classification fields
90+
# are keyword fields, they won't match the same way as the combined field
91+
# (text). So we have to be a bit fuzzy here to get the highlight.
92+
# TODO: in a future version, we should consider changes to the mappings
93+
# to make this more straightforward.
94+
if operator == 'or':
95+
return reduce(ior, [(
96+
Q("match", **{"primary_classification__category__id": {"query": part, "operator": operator}})
97+
| Q("wildcard", **{"primary_classification.category.name": f"*{part}*"})
98+
| Q("match", **{"primary_classification__archive__id": {"query": part, "operator": operator}})
99+
| Q("wildcard", **{"primary_classification.archive.name": f"*{part}*"})
100+
) for part in term.split()])
88101
return (
89102
Q("match", **{"primary_classification__category__id": {"query": term, "operator": operator}})
90103
| Q("match", **{"primary_classification__category__name": {"query": term, "operator": operator}})
@@ -94,8 +107,21 @@ def _query_primary(term: str, operator: str = 'and') -> Q:
94107

95108

96109
def _query_paper_id(term: str, operator: str = 'and') -> Q:
    """
    Build a query part that matches ``term`` against paper identifiers.

    Parameters
    ----------
    term : str
        Search term; it is escaped before being placed in each match query.
    operator : str
        Match operator. It is lower-cased before use, so ``'AND'`` and
        ``'and'`` are treated alike.

    Returns
    -------
    Q
        Disjunction of a match on ``paper_id`` and a match on ``paper_id_v``.

    """
    match_operator = operator.lower()
    logger.debug(f'query paper ID with: {term}')
    on_paper_id = Q_('match', 'paper_id', escape(term),
                     operator=match_operator)
    on_paper_id_v = Q_('match', 'paper_id_v', escape(term),
                       operator=match_operator)
    return on_paper_id | on_paper_id_v
115+
116+
117+
def _query_combined(term: str) -> Q:
    """
    Build a ``query_string`` query for ``term`` on the ``combined`` field.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    Q
        Query in which every term must match (``default_operator='AND'``)
        and leading wildcards are not allowed.

    """
    # Only wildcards in literals should be escaped. The local names are kept
    # distinct from the ``has_wildcard`` helper imported at module level.
    escaped_wildcards, found_wildcard = wildcardEscape(term)
    if found_wildcard:
        prepared = escaped_wildcards
    else:
        prepared = escape(term)

    # All terms must match in the combined field.
    return Q(
        "query_string",
        fields=['combined'],
        default_operator='AND',
        allow_leading_wildcard=False,
        query=prepared.lower(),
    )
99125

100126

101127
def _query_all_fields(term: str) -> Q:
@@ -144,20 +170,26 @@ def _query_all_fields(term: str) -> Q:
144170
if is_tex_query(term):
145171
return _tex_query('title', term) | _tex_query('abstract', term)
146172

147-
# Only wildcards in literals should be escaped.
148-
wildcard_escaped, has_wildcard = wildcardEscape(term)
149-
query_term = wildcard_escaped if has_wildcard else escape(term)
150-
151-
# All terms must match in the combined field.
152-
_query = query_term.lower()
153-
match_all_fields = Q("query_string", fields=['combined'],
154-
default_operator='AND',
155-
allow_leading_wildcard=False,
156-
query=escape(_query))
173+
date_partial: Optional[str] = None
174+
remainder: Optional[str] = None
175+
try:
176+
date_partial, remainder = match_date_partial(term)
177+
logger.debug(f'found date partial: {date_partial}')
178+
except ValueError:
179+
pass
180+
logger.debug(f'partial: {date_partial}; rem: {remainder}')
181+
182+
match_all_fields = _query_combined(term)
183+
if date_partial:
184+
_q = Q("term", announced_date_first=date_partial)
185+
if remainder:
186+
_q &= _query_combined(remainder)
187+
match_all_fields |= _q
157188

158189
# We include matches of any term in any field, so that we can highlight
159190
# and score appropriately.
160191
queries = [
192+
_query_paper_id(term, operator='or'),
161193
author_query(term, operator='OR'),
162194
_query_title(term, default_operator='or'),
163195
_query_abstract(term, default_operator='or'),
@@ -171,7 +203,14 @@ def _query_all_fields(term: str) -> Q:
171203
_query_msc_class(term, operator='or'),
172204
_query_primary(term, operator='or')
173205
]
206+
207+
if date_partial:
208+
queries.insert(0, Q("term", announced_date_first=date_partial))
209+
210+
# If the whole query matches on a specific field, we should consider that
211+
# responsive even if the query on the combined field does not respond.
174212
conj_queries = [
213+
_query_paper_id(term, operator='AND'),
175214
author_query(term, operator='AND'),
176215
_query_title(term, default_operator='and'),
177216
_query_abstract(term, default_operator='and'),
@@ -185,6 +224,7 @@ def _query_all_fields(term: str) -> Q:
185224
_query_msc_class(term, operator='and'),
186225
_query_primary(term, operator='and')
187226
]
227+
188228
query = (match_all_fields | reduce(ior, conj_queries))
189229
query &= Q("bool", should=queries) # Partial matches across fields.
190230
scores = [SF({'weight': i + 1, 'filter': q})
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""Tests for :mod:`search.services.index.util`."""
2+
3+
from unittest import TestCase
4+
5+
from search.services.index import util
6+
7+
8+
class TestMatchDatePartial(TestCase):
    """Tests for :func:`.index.util.match_date_partial`."""

    def test_date_partial_only(self):
        """The whole term is a four-digit date partial."""
        date_partial, remainder = util.match_date_partial('1902')
        self.assertEqual(date_partial, '2019-02')
        self.assertEqual(remainder, '', "Should have no remainder")

    def test_in_word(self):
        """Digits embedded inside a longer word are a false positive."""
        self.assertRaises(ValueError, util.match_date_partial,
                          'notasearch1902foradatepartial')

    def test_near_words(self):
        """The date partial is surrounded by other terms."""
        date_partial, remainder = util.match_date_partial('foo 1902 bar')
        self.assertEqual(date_partial, '2019-02')
        self.assertEqual(remainder, "foo bar", "Should have remainder")

    def test_out_of_range(self):
        """The term looks like a date partial, but is not a valid date."""
        self.assertRaises(ValueError, util.match_date_partial, '0699')

    def test_last_millenium(self):
        """The date partial belongs to a pre-2000 paper."""
        date_partial, remainder = util.match_date_partial('old paper 9505')
        self.assertEqual(date_partial, '1995-05')
        self.assertEqual(remainder, 'old paper', 'Should have a remainder')

search/services/index/util.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
'}', '[', ']', '^', '~', ':', '\\', '/', '-']
2525
DEFAULT_SORT = ['-announced_date_first', '_doc']
2626

27+
DATE_PARTIAL = r"(?:^|[\s])(\d{2})((?:0[1-9]{1})|(?:1[0-2]{1}))(?:$|[\s])"
28+
"""Used to match the part of a paper ID that encodes the announcement date."""
29+
2730

2831
def wildcardEscape(querystring: str) -> Tuple[str, bool]:
2932
"""
@@ -122,3 +125,37 @@ def sort(query: Query, search: Search) -> Search:
122125
if sort_params is not None:
123126
search = search.sort(*sort_params)
124127
return search
128+
129+
130+
def match_date_partial(term: str) -> Tuple[str, str]:
    """
    Look for a four-digit ID date partial (year + month) in a search term.

    This can be used to search for papers by announcement date.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    tuple
        First element is the date (str) in `yyyy-MM` format, second element
        is what is left of `term` once the partial has been removed, with
        runs of whitespace collapsed and the ends stripped.

    Raises
    ------
    ValueError
        Raised if no date partial is found in `term`.

    """
    found = re.search(DATE_PARTIAL, term)
    if found is None:
        raise ValueError('Does not include an ID date partial')

    year, month = found.groups()
    # Two-digit years from 91 upward are read as 19xx, everything else as
    # 20xx. This should be fine until 2091.
    century = 19 if int(year) >= 91 else 20
    date_partial = f"{century}{year}-{month}"  # year_month format in ES.

    # Cut the partial out of the term; the single space keeps the words on
    # either side of it apart before whitespace is normalised.
    start, end = found.span()
    leftover = " ".join((term[:start], term[end:]))
    return date_partial, re.sub(r"\s+", " ", leftover).strip()

0 commit comments

Comments
 (0)