From 577dca4f7ce5bc6335fa1ea788326514eaeff240 Mon Sep 17 00:00:00 2001 From: remoteeng00 Date: Fri, 17 Feb 2023 03:20:54 +0700 Subject: [PATCH 1/2] Copy from v1 --- .../v2/genomics/mutations_by_lineage.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 web/handlers/v2/genomics/mutations_by_lineage.py diff --git a/web/handlers/v2/genomics/mutations_by_lineage.py b/web/handlers/v2/genomics/mutations_by_lineage.py new file mode 100644 index 00000000..2b23fa48 --- /dev/null +++ b/web/handlers/v2/genomics/mutations_by_lineage.py @@ -0,0 +1,65 @@ + +class MutationsByLineage(BaseHandler): + @gen.coroutine + def _get(self): + query_location = self.get_argument("location_id", None) + query_mutations = self.get_argument("mutations", None) + query_pangolin_lineage = self.get_argument("pangolin_lineage", None) + query_mutations = [muts.split(",") for muts in query_mutations.split(" AND ")] if query_mutations is not None else [] + query_frequency_threshold = self.get_argument("frequency", None) + query_frequency_threshold = float(query_frequency_threshold) if query_frequency_threshold is not None else 0 + results = {} + for muts in query_mutations: # For multiple sets of mutations, create multiple ES queries. Since AND queries are possible doing one ES query with aggregations is cumbersome. Must look for better solution here. + query = { + "size": 0, + "aggs": { + "lineage": { + "terms": {"field": "pangolin_lineage", "size": self.size}, + "aggs": { + "mutations": { + "filter": {} + } + } + } + } + } + if query_location is not None: + query["query"] = parse_location_id_to_query(query_location) + if query_pangolin_lineage is not None: + if "query" in query: # Only query added will be bool for location + query["query"]["bool"]["must"].append({ + "term": { + "pangolin_lineage": query_pangolin_lineage + } + }) + else: + query["query"] = { + "term": { + "pangolin_lineage": query_pangolin_lineage + } + } + query["aggs"]["lineage"]["aggs"]["mutations"]["filter"] = create_nested_mutation_query(mutations = muts) + resp = yield self.asynchronous_fetch(query) + path_to_results = ["aggregations", "lineage", "buckets"] + buckets = resp + for i in path_to_results: + buckets = buckets[i] + flattened_response = [] + for i in buckets: + if not i["mutations"]["doc_count"] > 0 or i["key"] == "none": + continue + flattened_response.append({ + "pangolin_lineage": i["key"], + "lineage_count": i["doc_count"], + "mutation_count": i["mutations"]["doc_count"] + }) + df_response = pd.DataFrame(flattened_response) + if df_response.shape[0] > 0: + prop = calculate_proportion(df_response["mutation_count"], df_response["lineage_count"]) + df_response.loc[:, "proportion"] = prop[0] + df_response.loc[:, "proportion_ci_lower"] = prop[1] + df_response.loc[:, "proportion_ci_upper"] = prop[2] + df_response = df_response[df_response["proportion"] >= query_frequency_threshold] + results[",".join(muts)] = df_response.to_dict(orient="records") + resp = {"success": True, "results": results} + return resp From 66105a11eca6597105d1d24ce44c284cc01e86bc Mon Sep 17 00:00:00 2001 From: remoteeng00 Date: Fri, 17 Feb 2023 03:28:58 +0700 Subject: [PATCH 2/2] Add validation rules to MutationsByLineage --- config_web/genomics.py | 4 + web/handlers/v2/genomics/__init__.py | 1 + .../v2/genomics/mutations_by_lineage.py | 90 +++++++++++-------- 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/config_web/genomics.py b/config_web/genomics.py index adf0eef1..7e5f293f 100644 --- a/config_web/genomics.py +++ b/config_web/genomics.py @@ -23,6 +23,10 @@ r"/{pre}/{ver}/prevalence-by-position", "web.handlers.v2.genomics.PrevalenceByAAPositionHandler", ), + ( + r"/{pre}/{ver}/mutations-by-lineage", + "web.handlers.v2.genomics.MutationsByLineage", + ), ] APP_LIST_SWITCHED_TO_V2 = [ diff --git a/web/handlers/v2/genomics/__init__.py b/web/handlers/v2/genomics/__init__.py index 41807705..a5fa59a3 100644 --- a/web/handlers/v2/genomics/__init__.py +++ b/web/handlers/v2/genomics/__init__.py @@ -4,4 +4,5 @@ from .location import LocationHandler from .prevalence_all_lineages_by_location import PrevalenceAllLineagesByLocationHandler from .prevalence_by_aa_position import PrevalenceByAAPositionHandler +from .mutations_by_lineage import MutationsByLineage from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler diff --git a/web/handlers/v2/genomics/mutations_by_lineage.py b/web/handlers/v2/genomics/mutations_by_lineage.py index 2b23fa48..1154994f 100644 --- a/web/handlers/v2/genomics/mutations_by_lineage.py +++ b/web/handlers/v2/genomics/mutations_by_lineage.py @@ -1,45 +1,61 @@ +import pandas as pd + +from web.handlers.genomics.base import BaseHandler +from web.handlers.genomics.util import ( + calculate_proportion, + create_nested_mutation_query, + parse_location_id_to_query, +) + class MutationsByLineage(BaseHandler): - @gen.coroutine - def _get(self): - query_location = self.get_argument("location_id", None) - query_mutations = self.get_argument("mutations", None) - query_pangolin_lineage = self.get_argument("pangolin_lineage", None) - query_mutations = [muts.split(",") for muts in query_mutations.split(" AND ")] if query_mutations is not None else [] - query_frequency_threshold = self.get_argument("frequency", None) - query_frequency_threshold = float(query_frequency_threshold) if query_frequency_threshold is not None else 0 + name = "mutations-by-lineage" + kwargs = dict(BaseHandler.kwargs) + kwargs["GET"] = { + "location_id": {"type": str, "default": None}, + "mutations": {"type": str, "default": None}, + "pangolin_lineage": {"type": str, "default": None}, + "frequency": {"type": float, "default": 0, "min": 0, "max": 1}, + } + + async def _get(self): + query_location = self.args.location_id + query_mutations = self.args.mutations + query_pangolin_lineage = self.args.pangolin_lineage + query_mutations = ( + [muts.split(",") for muts in query_mutations.split(" AND ")] + if query_mutations is not None + else [] + ) + query_frequency_threshold = self.args.frequency results = {} - for muts in query_mutations: # For multiple sets of mutations, create multiple ES queries. Since AND queries are possible doing one ES query with aggregations is cumbersome. Must look for better solution here. + for ( + muts + ) in ( + query_mutations + ): # For multiple sets of mutations, create multiple ES queries. Since AND queries are possible doing one ES query with aggregations is cumbersome. Must look for better solution here. query = { "size": 0, "aggs": { - "lineage": { + "lineage": { "terms": {"field": "pangolin_lineage", "size": self.size}, - "aggs": { - "mutations": { - "filter": {} - } - } + "aggs": {"mutations": {"filter": {}}}, } - } + }, } if query_location is not None: query["query"] = parse_location_id_to_query(query_location) if query_pangolin_lineage is not None: - if "query" in query: # Only query added will be bool for location - query["query"]["bool"]["must"].append({ - "term": { - "pangolin_lineage": query_pangolin_lineage - } - }) + if "query" in query: # Only query added will be bool for location + query["query"]["bool"]["must"].append( + {"term": {"pangolin_lineage": query_pangolin_lineage}} + ) else: - query["query"] = { - "term": { - "pangolin_lineage": query_pangolin_lineage - } - } - query["aggs"]["lineage"]["aggs"]["mutations"]["filter"] = create_nested_mutation_query(mutations = muts) - resp = yield self.asynchronous_fetch(query) + query["query"] = {"term": {"pangolin_lineage": query_pangolin_lineage}} + query["aggs"]["lineage"]["aggs"]["mutations"]["filter"] = create_nested_mutation_query( + mutations=muts + ) + resp = await self.asynchronous_fetch(query) path_to_results = ["aggregations", "lineage", "buckets"] buckets = resp for i in path_to_results: @@ -48,14 +64,18 @@ def _get(self): for i in buckets: if not i["mutations"]["doc_count"] > 0 or i["key"] == "none": continue - flattened_response.append({ - "pangolin_lineage": i["key"], - "lineage_count": i["doc_count"], - "mutation_count": i["mutations"]["doc_count"] - }) + flattened_response.append( + { + "pangolin_lineage": i["key"], + "lineage_count": i["doc_count"], + "mutation_count": i["mutations"]["doc_count"], + } + ) df_response = pd.DataFrame(flattened_response) if df_response.shape[0] > 0: - prop = calculate_proportion(df_response["mutation_count"], df_response["lineage_count"]) + prop = calculate_proportion( + df_response["mutation_count"], df_response["lineage_count"] + ) df_response.loc[:, "proportion"] = prop[0] df_response.loc[:, "proportion_ci_lower"] = prop[1] df_response.loc[:, "proportion_ci_upper"] = prop[2]