From 39b40d1d8dab0dabdf80028d36d15e6ef54efc7b Mon Sep 17 00:00:00 2001 From: sengineer0 Date: Fri, 29 Jul 2022 13:29:02 +0700 Subject: [PATCH 1/4] Support: 'Fetch >1000 documents with a POST query' using with_total option --- biothings/web/query/formatter.py | 39 ++++++++++++++++++++++++++++--- biothings/web/settings/default.py | 1 + 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/biothings/web/query/formatter.py b/biothings/web/query/formatter.py index 7b1f85536..bd61d4a37 100644 --- a/biothings/web/query/formatter.py +++ b/biothings/web/query/formatter.py @@ -4,7 +4,7 @@ Transform the raw query result into consumption-friendly structures by possibly removing from, adding to, and/or flattening the raw response from the database engine for - one or more individual queries. + one or more individual queries. """ @@ -162,10 +162,21 @@ def transform(self, response, **options): native: bool, if the returned result is in python primitive types. version: bool, if _version field is kept. score: bool, if _score field is kept. - + with_total: bool, if True, the response will include max_total documents, + and a message to tell how many query terms return greater than the max_size of hits. + The default is False. + An example when with_total is True: + { + 'max_total': 100, + 'msg': '12 query terms return > 1000 hits, using from=1000 to retrieve the remaining hits', + 'hits': [...] 
+ } """ options = dotdict(options) if isinstance(response, list): + max_total = 0 + count_by_queries = {} + responses_ = [] options.pop('one', None) # ignore template = options.pop('template', {}) @@ -174,6 +185,12 @@ def transform(self, response, **options): template_miss = options.pop('template_miss', dict(found=False)) responses = [self.transform(res, **options) for res in response] for tpl, res in zip(templates, responses): + total = res.get('total', {}).get('value') or 0 + max_total += total + if tpl['query'] not in count_by_queries: + count_by_queries[tpl['query']] = 0 + count_by_queries[tpl['query']] += total + for _res in res if isinstance(res, list) else [res]: assert isinstance(_res, dict) if _res and 'hits' not in _res: @@ -191,7 +208,23 @@ def transform(self, response, **options): hit_.update(template_hit) hit_.update(hit) responses_.append(hit_) - return list(filter(None, responses_)) + response_ = list(filter(None, responses_)) + if options.with_total: + response_ = { + 'max_total': max_total, + 'hits': response_, + } + max_size = options.size or 1000 + count_query_exceed_max_size = len([ + query for query, count in count_by_queries.items() if count >= max_size + ]) + if count_query_exceed_max_size > 0: + _from = (options['from'] or 0) + max_size + response_['msg'] = ( + f'{count_query_exceed_max_size} query terms return > {max_size} hits, ' + f'using from={_from} to retrieve the remaining hits' + ) + return response_ if isinstance(response, dict): response = self._Hits(response) diff --git a/biothings/web/settings/default.py b/biothings/web/settings/default.py index 67f9bbb5f..a21283a52 100644 --- a/biothings/web/settings/default.py +++ b/biothings/web/settings/default.py @@ -128,6 +128,7 @@ 'scopes': {'type': list, 'default': ['_id'], 'max': 1000}, 'from': {'type': int, 'max': 10000, 'alias': 'skip'}, 'sort': {'type': list, 'max': 10}, + 'with_total': {'type': bool}, } } From 5638d35fe7a7e382961bc614d0947a5fc612ecee Mon Sep 17 00:00:00 2001 From: 
sengineer0 Date: Mon, 1 Aug 2022 23:05:06 +0700 Subject: [PATCH 2/4] Change max_total to the maximum of an array of total values --- biothings/web/query/formatter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/biothings/web/query/formatter.py b/biothings/web/query/formatter.py index bd61d4a37..f0758c117 100644 --- a/biothings/web/query/formatter.py +++ b/biothings/web/query/formatter.py @@ -174,7 +174,6 @@ def transform(self, response, **options): """ options = dotdict(options) if isinstance(response, list): - max_total = 0 count_by_queries = {} responses_ = [] @@ -186,7 +185,6 @@ def transform(self, response, **options): responses = [self.transform(res, **options) for res in response] for tpl, res in zip(templates, responses): total = res.get('total', {}).get('value') or 0 - max_total += total if tpl['query'] not in count_by_queries: count_by_queries[tpl['query']] = 0 count_by_queries[tpl['query']] += total @@ -210,6 +208,7 @@ def transform(self, response, **options): responses_.append(hit_) response_ = list(filter(None, responses_)) if options.with_total: + max_total = max(count_by_queries.values()) response_ = { 'max_total': max_total, 'hits': response_, } From e59ab6331e89b10cd5dbbf86cfcd0190d70a2bb1 Mon Sep 17 00:00:00 2001 From: sengineer0 Date: Tue, 2 Aug 2022 10:36:13 +0700 Subject: [PATCH 3/4] refactor logic to calculate max_total for query with with_total=True --- biothings/web/query/formatter.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/biothings/web/query/formatter.py b/biothings/web/query/formatter.py index f0758c117..cdc8fd9b4 100644 --- a/biothings/web/query/formatter.py +++ b/biothings/web/query/formatter.py @@ -174,7 +174,9 @@ def transform(self, response, **options): """ options = dotdict(options) if isinstance(response, list): - count_by_queries = {} + max_total = 0 + count_query_exceed_max_size = 0 + max_size = options.size or 1000 responses_ = [] options.pop('one', None) # 
ignore @@ -184,10 +186,12 @@ def transform(self, response, **options): template_miss = options.pop('template_miss', dict(found=False)) responses = [self.transform(res, **options) for res in response] for tpl, res in zip(templates, responses): - total = res.get('total', {}).get('value') or 0 - if tpl['query'] not in count_by_queries: - count_by_queries[tpl['query']] = 0 - count_by_queries[tpl['query']] += total + if options.with_total: + total = res.get('total', {}).get('value') or 0 + if total > max_total: + max_total = total + if total > max_size: + count_query_exceed_max_size += 1 for _res in res if isinstance(res, list) else [res]: assert isinstance(_res, dict) @@ -207,22 +211,19 @@ def transform(self, response, **options): hit_.update(hit) responses_.append(hit_) response_ = list(filter(None, responses_)) + if options.with_total: - max_total = max(count_by_queries.values()) response_ = { 'max_total': max_total, 'hits': response_, } - max_size = options.size or 1000 - count_query_exceed_max_size = len([ - query for query, count in count_by_queries.items() if count >= max_size - ]) if count_query_exceed_max_size > 0: - _from = (options['from'] or 0) + max_size + _from = (options.get('from') or 0) + max_size response_['msg'] = ( f'{count_query_exceed_max_size} query terms return > {max_size} hits, ' f'using from={_from} to retrieve the remaining hits' ) + return response_ if isinstance(response, dict): From bf16da096b0f7fd70517a1c862deb2220945e438 Mon Sep 17 00:00:00 2001 From: sengineer0 Date: Tue, 2 Aug 2022 11:17:35 +0700 Subject: [PATCH 4/4] Change max_size to 10 as it is the default size used by ES to return document --- biothings/web/query/formatter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/biothings/web/query/formatter.py b/biothings/web/query/formatter.py index cdc8fd9b4..5e82a1580 100644 --- a/biothings/web/query/formatter.py +++ b/biothings/web/query/formatter.py @@ -176,7 +176,9 @@ def transform(self, response, **options): 
if isinstance(response, list): max_total = 0 count_query_exceed_max_size = 0 - max_size = options.size or 1000 + # If options.size isn't set, the default number of hits returned by ES is 10. + # Ref: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-multi-search.html#search-multi-search-api-request-body # noqa: F501 + max_size = options.size or 10 responses_ = [] options.pop('one', None) # ignore