diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index b39e6a9..4318b76 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -1,5 +1,4 @@ """Implements search methods using async interfaces""" - import logging from elasticsearch import AsyncElasticsearch from elasticsearch.helpers import async_scan @@ -111,107 +110,109 @@ async def agg_data_type(self): return data_type_list @staticmethod - def _build_concepts_query(query, fuzziness=1, prefix_length=3): + def _get_concepts_query(query, fuzziness=1, prefix_length=3): "Static data structure populator, pulled for easier testing" query_object = { - "bool": { - "filter": { - "bool": { - "must": [ - {"wildcard": {"description": "?*"}}, - {"wildcard": {"name": "?*"}} - ] - } - }, - "should": [ - { - "match_phrase": { - "name": { - "query": query, - "boost": 10 - } + "query" : { + "bool": { + "filter": { + "bool": { + "must": [ + {"wildcard": {"description": "?*"}}, + {"wildcard": {"name": "?*"}} + ] } }, - { - "match_phrase": { - "description": { - "query": query, - "boost": 6 + "should": [ + { + "match_phrase": { + "name": { + "query": query, + "boost": 10 + } } - } - }, - { - "match_phrase": { - "search_terms": { - "query": query, - "boost": 8 + }, + { + "match_phrase": { + "description": { + "query": query, + "boost": 6 + } } - } - }, - { - "match": { - "name": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 4 + }, + { + "match_phrase": { + "search_terms": { + "query": query, + "boost": 8 + } } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 5 + }, + { + "match": { + "name": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 4 + } } - } - }, - { - "match": { - "description": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 3 + }, + { + "match": { + "search_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 5 + } } - } - }, - { - "match": { - "description": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 2 + }, + { + "match": { + "description": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 3 + } } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 1 + }, + { + "match": { + "description": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "boost": 2 + } } - } - }, - { - "match": { - "optional_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length + }, + { + "match": { + "search_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "boost": 1 + } + } + }, + { + "match": { + "optional_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length + } } } - } - ], - "minimum_should_match": 1, + ], + "minimum_should_match": 1, + } } } return query_object @@ -221,9 +222,11 @@ async def search_concepts(self, query, offset=0, size=None, types=None, """ Changed to a long boolean match query to optimize search results """ - query_dict = self._build_concepts_query(query, **kwargs) + if "*" in query or "\"" in query or "+" in query or "-" in query: + search_body = self.get_simple_search_query(query) + else: + search_body = self._get_concepts_query(query, **kwargs) # Get aggregated counts of biolink types - search_body = {"query": query_dict} search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}} if isinstance(types, list): search_body['post_filter'] = { @@ -283,120 +286,7 @@ async def search_variables(self, concept="", query="", size=None, If a data_type is passed in, the result will be filtered to only contain the passed-in data type. """ - query = { - 'bool': { - 'should': { - "match": { - "identifiers": concept - } - }, - 'should': [ - { - "match_phrase": { - "element_name": { - "query": query, - "boost": 10 - } - } - }, - { - "match_phrase": { - "element_desc": { - "query": query, - "boost": 6 - } - } - }, - { - "match_phrase": { - "search_terms": { - "query": query, - "boost": 8 - } - } - }, - { - "match": { - "element_name": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 4 - } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 5 - } - } - }, - { - "match": { - "element_desc": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 3 - } - } - }, - { - "match": { - "element_desc": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 2 - } - } - }, - { - "match": { - "element_name": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 2 - } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 1 - } - } - }, - { - "match": { - "optional_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length - } - } - } - ] - } - } - - if concept: - query['bool']['must'] = { - "match": { - "identifiers": concept - } - } + query = self._get_var_query(concept, fuzziness, prefix_length, query) if index is None: index = "variables_index" body = {'query': query} @@ -410,58 +300,7 @@ async def search_variables(self, concept="", query="", size=None, size=size ) - # Reformat Results - new_results = {} - if not search_results: - # we don't want to error on a search not found - new_results.update({'total_items': total_items['count']}) - return new_results - - for elem in search_results['hits']['hits']: - elem_s = elem['_source'] - elem_type = elem_s['data_type'] - if elem_type not in new_results: - new_results[elem_type] = {} - - elem_id = elem_s['element_id'] - coll_id = elem_s['collection_id'] - elem_info = { - "description": elem_s['element_desc'], - "e_link": elem_s['element_action'], - "id": elem_id, - "name": elem_s['element_name'], - "score": round(elem['_score'], 6) - } - - # Case: collection not in dictionary for given data_type - if coll_id not in new_results[elem_type]: - # initialize document - doc = { - 'c_id': coll_id, - 'c_link': elem_s['collection_action'], - 'c_name': elem_s['collection_name'], - 'elements': [elem_info] - } - # save document - new_results[elem_type][coll_id] = doc - - # Case: collection already in dictionary for given - # element_type; append elem_info. Assumes no duplicate - # elements - else: - new_results[elem_type][coll_id]['elements'].append(elem_info) - - # Flatten dicts to list - for i in new_results: - new_results[i] = list(new_results[i].values()) - - # Return results - if bool(data_type): - if data_type in new_results: - new_results = new_results[data_type] - else: - new_results = {} - return new_results + return self._make_result(data_type, search_results['hits']['hits'], total_items, True) async def search_vars_unscored(self, concept="", query="", size=None, data_type=None, @@ -478,134 +317,23 @@ async def search_vars_unscored(self, concept="", query="", If a data_type is passed in, the result will be filtered to only contain the passed-in data type. """ - query = { - 'bool': { - 'should': { - "match": { - "identifiers": concept - } - }, - 'should': [ - { - "match_phrase": { - "element_name": { - "query": query, - "boost": 10 - } - } - }, - { - "match_phrase": { - "element_desc": { - "query": query, - "boost": 6 - } - } - }, - { - "match_phrase": { - "search_terms": { - "query": query, - "boost": 8 - } - } - }, - { - "match": { - "element_name": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 4 - } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 5 - } - } - }, - { - "match": { - "element_desc": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "operator": "and", - "boost": 3 - } - } - }, - { - "match": { - "element_desc": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 2 - } - } - }, - { - "match": { - "element_name": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 2 - } - } - }, - { - "match": { - "search_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length, - "boost": 1 - } - } - }, - { - "match": { - "optional_terms": { - "query": query, - "fuzziness": fuzziness, - "prefix_length": prefix_length - } - } - } - ] - } - } - - if concept: - query['bool']['must'] = { - "match": { - "identifiers": concept - } - } + query = self._get_var_query(concept, fuzziness, prefix_length, query) body = {'query': query} total_items = await self.es.count(body=body, index="variables_index") search_results = [] - async for r in async_scan(self.es, - query=body): + async for r in async_scan(self.es, query=body): search_results.append(r) + + return self._make_result(data_type, search_results, total_items, False) + + def _make_result(self, data_type, search_results, total_items, scored: bool): # Reformat Results new_results = {} if not search_results: # we don't want to error on a search not found new_results.update({'total_items': total_items['count']}) return new_results - for elem in search_results: elem_s = elem['_source'] elem_type = elem_s['data_type'] @@ -621,17 +349,18 @@ async def search_vars_unscored(self, concept="", query="", "name": elem_s['element_name'] } + if scored: + elem_info["score"]: round(elem['_score'], 6) + # Case: collection not in dictionary for given data_type if coll_id not in new_results[elem_type]: # initialize document - doc = {} - - # add information - doc['c_id'] = coll_id - doc['c_link'] = elem_s['collection_action'] - doc['c_name'] = elem_s['collection_name'] - doc['elements'] = [elem_info] - + doc = { + 'c_id': coll_id, + 'c_link': elem_s['collection_action'], + 'c_name': elem_s['collection_name'], + 'elements': [elem_info] + } # save document new_results[elem_type][coll_id] = doc @@ -640,17 +369,16 @@ async def search_vars_unscored(self, concept="", query="", # elements else: new_results[elem_type][coll_id]['elements'].append(elem_info) - # Flatten dicts to list for i in new_results: new_results[i] = list(new_results[i].values()) - # Return results if bool(data_type): if data_type in new_results: new_results = new_results[data_type] else: new_results = {} + new_results.update({'total_items': total_items['count']}) return new_results @@ -690,3 +418,137 @@ async def search_kg(self, unique_id, query, offset=0, size=None, ) search_results.update({'total_items': total_items['count']}) return search_results + + def _get_var_query(self, concept, fuzziness, prefix_length, query): + """Returns ES query for variable search""" + query = { + "query": { + 'bool': { + 'should': { + "match": { + "identifiers": concept + } + }, + 'should': [ + { + "match_phrase": { + "element_name": { + "query": query, + "boost": 10 + } + } + }, + { + "match_phrase": { + "element_desc": { + "query": query, + "boost": 6 + } + } + }, + { + "match_phrase": { + "search_terms": { + "query": query, + "boost": 8 + } + } + }, + { + "match": { + "element_name": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 4 + } + } + }, + { + "match": { + "search_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 5 + } + } + }, + { + "match": { + "element_desc": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "operator": "and", + "boost": 3 + } + } + }, + { + "match": { + "element_desc": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "boost": 2 + } + } + }, + { + "match": { + "element_name": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "boost": 2 + } + } + }, + { + "match": { + "search_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length, + "boost": 1 + } + } + }, + { + "match": { + "optional_terms": { + "query": query, + "fuzziness": fuzziness, + "prefix_length": prefix_length + } + } + } + ] + } + } + } + if concept: + query['bool']['must'] = { + "match": { + "identifiers": concept + } + } + return query + + def get_simple_search_query(self, query): + """Returns ES query that allows to use basic operators like AND, OR, NOT... + More info here https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html.""" + search_query = { + "query": { + "simple_query_string": { + "query": query, + "fields": ["name", "description", "search_terms"], + "default_operator": "and", + "flags": "OR|AND|NOT|PHRASE|PREFIX" + } + } + } + return search_query diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py index bc8eef5..4dc5649 100644 --- a/src/dug/core/concept_expander.py +++ b/src/dug/core/concept_expander.py @@ -31,7 +31,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ with open(kg_filename, 'r') as stream: response = json.load(stream) else: - query = query_factory.get_query(identifier) + query = query_factory._get_var_query(identifier) logger.debug(query) response = requests.post( url=self.url, diff --git a/tests/unit/test_async_search.py b/tests/unit/test_async_search.py index c121ce7..b044a2c 100644 --- a/tests/unit/test_async_search.py +++ b/tests/unit/test_async_search.py @@ -28,7 +28,7 @@ def setUp(self): "Build mock elasticsearch responses" search_result = _brain_search_result() self.search = async_search.Search(Config.from_env()) - self.query_body = self.search._build_concepts_query("brain") + self.query_body = self.search._get_concepts_query("brain") self.search.es = es_mock def test_concepts_search(self):