Add validation rules to MutationsByLineage

newgene · Feb 16, 2023 · cebaa30 · cebaa30
1 parent 50333d6
commit cebaa30
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 36 deletions.
diff --git a/config_web/genomics.py b/config_web/genomics.py
@@ -19,6 +19,10 @@
         r"/{pre}/{ver}/prevalence-by-location-all-lineages",
         "web.handlers.v2.genomics.PrevalenceAllLineagesByLocationHandler",
     ),
+    (
+        r"/{pre}/{ver}/mutations-by-lineage",
+        "web.handlers.v2.genomics.MutationsByLineage",
+    ),
 ]
 
 APP_LIST = [

diff --git a/web/handlers/v2/genomics/__init__.py b/web/handlers/v2/genomics/__init__.py
@@ -2,5 +2,6 @@
 from .lineage import LineageHandler
 from .lineage_mutations import LineageMutationsHandler
 from .location import LocationHandler
-from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler
+from .mutations_by_lineage import MutationsByLineage
 from .prevalence_all_lineages_by_location import PrevalenceAllLineagesByLocationHandler
+from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler
diff --git a/web/handlers/v2/genomics/mutations_by_lineage.py b/web/handlers/v2/genomics/mutations_by_lineage.py
@@ -1,45 +1,61 @@
+import pandas as pd
+
+from web.handlers.genomics.base import BaseHandler
+from web.handlers.genomics.util import (
+    calculate_proportion,
+    create_nested_mutation_query,
+    parse_location_id_to_query,
+)
+
 
 class MutationsByLineage(BaseHandler):
-    @gen.coroutine
-    def _get(self):
-        query_location = self.get_argument("location_id", None)
-        query_mutations = self.get_argument("mutations", None)
-        query_pangolin_lineage = self.get_argument("pangolin_lineage", None)
-        query_mutations = [muts.split(",") for muts in query_mutations.split(" AND ")] if query_mutations is not None else []
-        query_frequency_threshold = self.get_argument("frequency", None)
-        query_frequency_threshold = float(query_frequency_threshold) if query_frequency_threshold is not None else 0
+    name = "mutations-by-lineage"
+    kwargs = dict(BaseHandler.kwargs)
+    kwargs["GET"] = {
+        "location_id": {"type": str, "default": None},
+        "mutations": {"type": str, "default": None},
+        "pangolin_lineage": {"type": str, "default": None},
+        "frequency": {"type": float, "default": 0, "min": 0, "max": 1},
+    }
+
+    async def _get(self):
+        query_location = self.args.location_id
+        query_mutations = self.args.mutations
+        query_pangolin_lineage = self.args.pangolin_lineage
+        query_mutations = (
+            [muts.split(",") for muts in query_mutations.split(" AND ")]
+            if query_mutations is not None
+            else []
+        )
+        query_frequency_threshold = self.args.frequency
         results = {}
-        for muts in query_mutations: # For multiple sets of mutations, create multiple ES queries. Since AND queries are possible doing one ES query with aggregations is cumbersome. Must look for better solution here.
+        for (
+            muts
+        ) in (
+            query_mutations
+        ):  # Each set of mutations gets its own ES query: because AND queries are possible, doing this in a single ES query with aggregations is cumbersome. TODO: look for a better solution here.
             query = {
                 "size": 0,
                 "aggs": {
-	            "lineage": {
+                    "lineage": {
                         "terms": {"field": "pangolin_lineage", "size": self.size},
-                        "aggs": {
-                            "mutations": {
-                                "filter": {}
-			    }
-                        }
+                        "aggs": {"mutations": {"filter": {}}},
                     }
-                }
+                },
             }
             if query_location is not None:
                 query["query"] = parse_location_id_to_query(query_location)
             if query_pangolin_lineage is not None:
-                if "query" in query: # Only query added will be bool for location
-                    query["query"]["bool"]["must"].append({
-                        "term": {
-                            "pangolin_lineage": query_pangolin_lineage
-                        }
-                    })
+                if "query" in query:  # Only query added will be bool for location
+                    query["query"]["bool"]["must"].append(
+                        {"term": {"pangolin_lineage": query_pangolin_lineage}}
+                    )
                 else:
-                    query["query"] = {
-                        "term": {
-                            "pangolin_lineage": query_pangolin_lineage
-                        }
-                    }
-            query["aggs"]["lineage"]["aggs"]["mutations"]["filter"] = create_nested_mutation_query(mutations = muts)
-            resp = yield self.asynchronous_fetch(query)
+                    query["query"] = {"term": {"pangolin_lineage": query_pangolin_lineage}}
+            query["aggs"]["lineage"]["aggs"]["mutations"]["filter"] = create_nested_mutation_query(
+                mutations=muts
+            )
+            resp = await self.asynchronous_fetch(query)
             path_to_results = ["aggregations", "lineage", "buckets"]
             buckets = resp
             for i in path_to_results:
@@ -48,14 +64,18 @@ def _get(self):
             for i in buckets:
                 if not i["mutations"]["doc_count"] > 0 or i["key"] == "none":
                     continue
-                flattened_response.append({
-                    "pangolin_lineage": i["key"],
-                    "lineage_count": i["doc_count"],
-                    "mutation_count": i["mutations"]["doc_count"]
-                })
+                flattened_response.append(
+                    {
+                        "pangolin_lineage": i["key"],
+                        "lineage_count": i["doc_count"],
+                        "mutation_count": i["mutations"]["doc_count"],
+                    }
+                )
             df_response = pd.DataFrame(flattened_response)
             if df_response.shape[0] > 0:
-                prop = calculate_proportion(df_response["mutation_count"], df_response["lineage_count"])
+                prop = calculate_proportion(
+                    df_response["mutation_count"], df_response["lineage_count"]
+                )
                 df_response.loc[:, "proportion"] = prop[0]
                 df_response.loc[:, "proportion_ci_lower"] = prop[1]
                 df_response.loc[:, "proportion_ci_upper"] = prop[2]