Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for range-based ID searching #819

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
40bc413
Add new query_builder functions
ItIsJordan Jul 17, 2024
c5ef12d
Add range-based publication search functionality
ItIsJordan Jul 17, 2024
07c48c2
Add tests
ItIsJordan Jul 17, 2024
96625e1
Merge branch 'main' into search-range
ItIsJordan Jul 18, 2024
2340aba
Fix data search bug (add removed line)
ItIsJordan Jul 24, 2024
878394f
Add search term shorthand publication_recid->recid
ItIsJordan Jul 24, 2024
590deb3
Improve range query parser
ItIsJordan Jul 24, 2024
562a610
Add descending order to range query search
ItIsJordan Jul 24, 2024
e9137cf
Update tests to use correct range search syntax
ItIsJordan Jul 24, 2024
219b8ae
Add query parser test case for publication_recid
ItIsJordan Aug 6, 2024
bf4a1f2
Update search help for range-based searching
ItIsJordan Aug 7, 2024
369b4b9
Merge branch 'main' into search-range
ItIsJordan Aug 7, 2024
f856eec
Inspire ID range-searching (AND mapping change)
ItIsJordan Aug 7, 2024
44c1ce3
Update search_help for inspire ID range searching
ItIsJordan Aug 7, 2024
13e9e20
Make range query more whitespace tolerant
ItIsJordan Aug 7, 2024
b5eecf1
Add test_query_parser_is_range_query cases for whitespace tolerance
ItIsJordan Aug 7, 2024
07e4c28
Comment
ItIsJordan Aug 7, 2024
e99062b
Add test for range-searching
ItIsJordan Aug 13, 2024
43d1edb
Merge branch 'main' into search-range
ItIsJordan Aug 13, 2024
fba64f1
Update range search query test
ItIsJordan Aug 13, 2024
c06e753
Merge branch 'main' into search-range
ItIsJordan Sep 16, 2024
15208cb
Improve/fix range query functionality
ItIsJordan Sep 19, 2024
227de4d
Add tests for range queries
ItIsJordan Sep 19, 2024
19f5863
Update comments
ItIsJordan Sep 20, 2024
380bc56
Improve get_range_query
ItIsJordan Sep 20, 2024
0ea3385
Improve get_range_queries test
ItIsJordan Sep 20, 2024
2b37bb0
Merge branch 'main' into search-range
ItIsJordan Oct 3, 2024
88274d2
Improve range-search logic
ItIsJordan Oct 4, 2024
b88d1ac
Update search testing
ItIsJordan Oct 4, 2024
303058e
Add inspire_id and publication_recid as possible sorting fields
ItIsJordan Oct 4, 2024
b62f27b
Remove display of recid and inspire id search options
ItIsJordan Oct 10, 2024
c1d48fb
Fix publication_recid default sort order
ItIsJordan Oct 10, 2024
27b8fad
Add new recid and inspire_id keywords to search function docstring
ItIsJordan Oct 10, 2024
10caeef
Update verify_range_query_term docstring
ItIsJordan Oct 10, 2024
acdd8b0
Update testing to account for correct range keyword
ItIsJordan Oct 10, 2024
bdd837d
Update range query result tests
ItIsJordan Oct 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hepdata/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def _(x):
CFG_DATA_TYPE = 'datatable'
CFG_SUBMISSIONS_TYPE = 'submission'
CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
CFG_SEARCH_RANGE_TERMS = ["recid", "inspire_id"]  # Search terms that can be used in OpenSearch API range searches

CFG_CONVERTER_URL = 'https://converter.hepdata.net'
CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
Expand Down
52 changes: 34 additions & 18 deletions hepdata/ext/opensearch/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ def search(query,
('collaboration', collaboration_name), ('date', date)
:param size: [int] max number of hits that should be returned
:param offset: [int] offset for the results (used for pagination)
:param sort_by: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance"
:param sort_field: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance",
"recid", "inspire_id"
:param sort_order: [string] order of the sorting either original
(for a particular field) or reversed. Supported:
'' or 'rev'
Expand All @@ -108,23 +109,35 @@ def search(query,
if query == '' and not sort_field:
sort_field = 'date'

query = HEPDataQueryParser.parse_query(query)
# Create search with preference param to ensure consistency of results across shards
search = RecordsSearch(using=os, index=index).with_preference_param()

query = HEPDataQueryParser.parse_query(query)

if query:
fuzzy_query = QueryString(query=query, fuzziness='AUTO')
search.query = fuzzy_query | \
Q('has_child', type="child_datatable", query=fuzzy_query)

# Add filter to search for only "publication" objects
search = search.filter("term", doc_type=CFG_PUB_TYPE)
search = QueryBuilder.add_filters(search, filters)

# Determine if the query is range-based, and get it, or the default search order
range_term = HEPDataQueryParser.verify_range_query_term(query)

if range_term and not sort_field and not sort_order:
# Set default search keyword, and set default sort to desc
sort_field = range_term
sort_order = 'desc'

try:
mapped_sort_field = sort_fields_mapping(sort_field)
except ValueError as ve:
return {'error': str(ve)}

search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})

search = add_default_aggregations(search, filters)

if post_filter:
Expand All @@ -135,23 +148,26 @@ def search(query,

try:
pub_result = search.execute().to_dict()

parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
data_result = None
# We don't want data tables if we're searching by publication range.
if not range_term:
parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
}
}
}

data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)
if query:
data_search = data_search.query(QueryString(query=query))
data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)

if query:
data_search = data_search.query(QueryString(query=query))

data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()
data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()

merged_results = merge_results(pub_result, data_result)
return map_result(merged_results, filters)
Expand All @@ -165,7 +181,7 @@ def search(query,
else:
log.error(f'An unexpected error occurred when searching: {e}')
reason = f'An unexpected error occurred: {e.error}'
return { 'error': reason }
return {'error': reason}


@author_index
Expand Down
4 changes: 4 additions & 0 deletions hepdata/ext/opensearch/config/os_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def sort_fields_mapping(sort_by):
return 'creation_date'
elif sort_by == 'latest':
return 'last_updated'
elif sort_by == 'recid':
return 'recid' # No change required
elif sort_by == 'inspire_id':
return 'inspire_id' # No change required
elif not sort_by or sort_by == 'relevance':
return '_score'
else:
Expand Down
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/config/record_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@
}
},
"inspire_id": {
"type": "text"
"type": "integer"
},
"keywords": {
"properties": {
Expand Down
22 changes: 19 additions & 3 deletions hepdata/ext/opensearch/process_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,26 @@
from hepdata.utils.miscellaneous import splitter


def merge_results(pub_result, data_result=None):
    """
    Merge the result dictionaries of the publication and data table
    searches into a single dictionary.

    A data table result is not produced for publication-only searches,
    so ``data_result`` defaults to None; in that case only the
    publication hits are returned.

    :param pub_result: Publication search response, read at
        ``['hits']['hits']``, ``['hits']['total']['value']`` and
        (optionally) ``['aggregations']``.
    :param data_result: Data table search response, read at
        ``['hits']['hits']``, or None/empty when there is no data search.
    :return: Dictionary with the keys ``hits`` (publication hits followed
        by any data table hits), ``total`` (publication hit count) and
        ``aggregations`` (publication aggregations, ``{}`` if absent).
    """
    # Copy so that appending data hits never mutates the caller's list.
    hits = list(pub_result['hits']['hits'])

    # Nothing to merge when the data table search was skipped.
    if data_result:
        hits += data_result['hits']['hits']

    return {
        'hits': hits,
        # The total only ever counts publications, not data tables.
        'total': pub_result['hits']['total']['value'],
        'aggregations': pub_result.get('aggregations', {}),
    }
Expand Down
27 changes: 26 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import re
from opensearch_dsl import Q

from hepdata.config import CFG_SEARCH_RANGE_TERMS


class QueryBuilder:

Expand Down Expand Up @@ -52,7 +54,8 @@ def parse_query(query_string):
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
"resources": "resources.description", # Add shorthand for resource description
"publication_recid": "recid" # Shorthand for HEPData record ID
}
}

Expand Down Expand Up @@ -81,3 +84,25 @@ def _quote_phrase(phrase):
if '"' not in phrase and pattern.fullmatch(phrase):
return f'"{phrase}"'
return phrase

@staticmethod
def verify_range_query_term(query):
"""
Verifies whether a parsed query string contains a range-based query.
If it does, return either that search keyword,
or the "default" keyword for default search ordering.

Examples: publication_recid:[321 TO 321] inspire_id:[123 TO 123]

:param query: The full query string
:return: Either the range search term: inspire_id/publication_recid, or false
"""
# Pattern matching docstring example with placeholder
pattern = rf"%s:\s*\[\d+\s+TO\s+\d+]"
# For all terms that can be range searched
for term in CFG_SEARCH_RANGE_TERMS:
result = re.findall(pattern % term, query)
if result:
return term
# If no matches were ever found then we return False
return False
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,32 @@ <h4>Searching via Inspire</h4>
</ul>
</div>

<div class="well well-small">
<h4>Range-based Searching</h4>
<p>
We support searching for a range of records using their HEPData record ID or Inspire ID.
</p>
<ul>
<li>Range searching by HEPData record ID:
<ul>
<li>
<a href='/search?q=publication_recid:[1 TO 10]'
target="_new">publication_recid:[1 TO 10]</a>
</li>
</ul>
</li>
<br/>
<li>Range searching by Inspire ID:
<ul>
<li>
<a href='/search?q=inspire_id:[1 TO 10000]'
target="_new">inspire_id:[1 TO 10000]</a>
</li>
</ul>
</li>
</ul>
</div>

</div>
</div>
</div>
Expand Down
Loading