From da12d4971e83f4c01ee8c5d66194914327600165 Mon Sep 17 00:00:00 2001 From: Griffin Roupe Date: Thu, 10 Aug 2023 11:20:27 -0400 Subject: [PATCH 1/3] Fix pagination on concept search with type filter active --- src/dug/core/async_search.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 1f3d5872..378c2200 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -101,7 +101,7 @@ async def agg_data_type(self): return data_type_list @staticmethod - def _build_concepts_query(query, fuzziness=1, prefix_length=3): + def _build_concepts_query(query, types, fuzziness=1, prefix_length=3): "Static data structure populator, pulled for easier testing" query_object = { "bool": { @@ -111,7 +111,15 @@ def _build_concepts_query(query, fuzziness=1, prefix_length=3): {"wildcard": {"description": "?*"}}, {"wildcard": {"name": "?*"}} ] - } + }, + **({ + "bool": { + "should": [ + {'term': {'type': {'value': t}}} for t in types + ], + "minimum_should_match": 1 + } + } if isinstance(types, list) else {}) }, "should": [ { @@ -211,24 +219,10 @@ async def search_concepts(self, query, offset=0, size=None, types=None, """ Changed to a long boolean match query to optimize search results """ - query_dict = self._build_concepts_query(query, **kwargs) - total_items = await self.es.count( - body={"query": query_dict}, - index="concepts_index") + query_dict = self._build_concepts_query(query, types, **kwargs) # Get aggregated counts of biolink types search_body = {"query": query_dict} search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}} - # Add post_filter on types - if types: - assert isinstance(types, list) - search_body['post_filter'] = { - "bool": { - "should": [ - {'term': {'type': {'value': t}}} for t in types - ], - "minimum_should_match": 1 - } - } search_results = await self.es.search( index="concepts_index", body=search_body, @@ -239,6 +233,11 @@ async def search_concepts(self, query, offset=0, size=None, types=None, size=size, explain=True ) + del search_body["aggs"] + total_items = await self.es.count( + body=search_body, + index="concepts_index" + ) # Simplify the data structure we get from aggregations to put into the # return value. This should be a count of documents hit for every type From 6b6c2a61eb20c1782062955390b69a068886e584 Mon Sep 17 00:00:00 2001 From: Griffin Roupe Date: Thu, 10 Aug 2023 11:30:55 -0400 Subject: [PATCH 2/3] Fix concept search test --- src/dug/core/async_search.py | 2 +- tests/unit/test_async_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 378c2200..43c08e19 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -101,7 +101,7 @@ async def agg_data_type(self): return data_type_list @staticmethod - def _build_concepts_query(query, types, fuzziness=1, prefix_length=3): + def _build_concepts_query(query, types=None, fuzziness=1, prefix_length=3): "Static data structure populator, pulled for easier testing" query_object = { "bool": { diff --git a/tests/unit/test_async_search.py b/tests/unit/test_async_search.py index c121ce7c..40a39e85 100644 --- a/tests/unit/test_async_search.py +++ b/tests/unit/test_async_search.py @@ -28,7 +28,7 @@ def setUp(self): "Build mock elasticsearch responses" search_result = _brain_search_result() self.search = async_search.Search(Config.from_env()) - self.query_body = self.search._build_concepts_query("brain") + self.query_body = self.search._build_concepts_query("brain", types=None) self.search.es = es_mock def test_concepts_search(self): From 063af0e112c5e50597d250a4c75d53073ab6318a Mon Sep 17 00:00:00 2001 From: Griffin Roupe Date: Thu, 10 Aug 2023 11:57:39 -0400 Subject: [PATCH 3/3] Fix type filter pagination without messing up type count aggregation --- src/dug/core/async_search.py | 30 +++++++++++++++++++----------- tests/unit/test_async_search.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 43c08e19..6946fb74 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -101,7 +101,7 @@ async def agg_data_type(self): return data_type_list @staticmethod - def _build_concepts_query(query, types=None, fuzziness=1, prefix_length=3): + def _build_concepts_query(query, fuzziness=1, prefix_length=3): "Static data structure populator, pulled for easier testing" query_object = { "bool": { @@ -111,15 +111,7 @@ def _build_concepts_query(query, types=None, fuzziness=1, prefix_length=3): {"wildcard": {"description": "?*"}}, {"wildcard": {"name": "?*"}} ] - }, - **({ - "bool": { - "should": [ - {'term': {'type': {'value': t}}} for t in types - ], - "minimum_should_match": 1 - } - } if isinstance(types, list) else {}) + } }, "should": [ { @@ -219,10 +211,19 @@ async def search_concepts(self, query, offset=0, size=None, types=None, """ Changed to a long boolean match query to optimize search results """ - query_dict = self._build_concepts_query(query, types, **kwargs) + query_dict = self._build_concepts_query(query, **kwargs) # Get aggregated counts of biolink types search_body = {"query": query_dict} search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}} + if isinstance(types, list): + search_body['post_filter'] = { + "bool": { + "should": [ + {'term': {'type': {'value': t}}} for t in types + ], + "minimum_should_match": 1 + } + } search_results = await self.es.search( index="concepts_index", body=search_body, @@ -233,7 +234,14 @@ async def search_concepts(self, query, offset=0, size=None, types=None, size=size, explain=True ) + # Aggs/post_filter aren't supported by count del search_body["aggs"] + if "post_filter" in search_body: + # We'll move the post_filter into the actual filter + search_body["query"]["bool"]["filter"]["bool"].update( + search_body["post_filter"]["bool"] + ) + del search_body["post_filter"] total_items = await self.es.count( body=search_body, index="concepts_index" diff --git a/tests/unit/test_async_search.py b/tests/unit/test_async_search.py index 40a39e85..c121ce7c 100644 --- a/tests/unit/test_async_search.py +++ b/tests/unit/test_async_search.py @@ -28,7 +28,7 @@ def setUp(self): "Build mock elasticsearch responses" search_result = _brain_search_result() self.search = async_search.Search(Config.from_env()) - self.query_body = self.search._build_concepts_query("brain", types=None) + self.query_body = self.search._build_concepts_query("brain") self.search.es = es_mock def test_concepts_search(self):