Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validation rules to MutationsByLineage #11

Merged
merged 2 commits into from
Feb 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config_web/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
r"/{pre}/{ver}/prevalence-by-position",
"web.handlers.v2.genomics.PrevalenceByAAPositionHandler",
),
(
r"/{pre}/{ver}/mutations-by-lineage",
"web.handlers.v2.genomics.MutationsByLineage",
),
]

APP_LIST_SWITCHED_TO_V2 = [
Expand Down
1 change: 1 addition & 0 deletions web/handlers/v2/genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .location import LocationHandler
from .prevalence_all_lineages_by_location import PrevalenceAllLineagesByLocationHandler
from .prevalence_by_aa_position import PrevalenceByAAPositionHandler
from .mutations_by_lineage import MutationsByLineage
from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler
85 changes: 85 additions & 0 deletions web/handlers/v2/genomics/mutations_by_lineage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pandas as pd

from web.handlers.genomics.base import BaseHandler
from web.handlers.genomics.util import (
calculate_proportion,
create_nested_mutation_query,
parse_location_id_to_query,
)


class MutationsByLineage(BaseHandler):
    """Handler reporting, per lineage, how many records carry a given set of mutations.

    GET parameters (declared in ``kwargs["GET"]``):
        location_id: optional location filter, converted to a query by
            ``parse_location_id_to_query``.
        mutations: optional string of mutation groups separated by ``" AND "``;
            each group is a comma-separated list of mutations.
        pangolin_lineage: optional lineage used as a ``term`` restriction.
        frequency: minimum proportion (0..1, default 0) a lineage must reach
            to be kept in the output.
    """

    name = "mutations-by-lineage"
    kwargs = dict(BaseHandler.kwargs)
    kwargs["GET"] = {
        "location_id": {"type": str, "default": None},
        "mutations": {"type": str, "default": None},
        "pangolin_lineage": {"type": str, "default": None},
        "frequency": {"type": float, "default": 0, "min": 0, "max": 1},
    }

    async def _get(self):
        """Run one aggregation query per mutation group and collect the results.

        Returns:
            ``{"success": True, "results": {...}}`` where ``results`` maps each
            comma-joined mutation group to a list of records with the keys
            ``pangolin_lineage``, ``lineage_count``, ``mutation_count`` and,
            when at least one lineage matched, ``proportion``,
            ``proportion_ci_lower`` and ``proportion_ci_upper``.
            ``results`` is empty when no ``mutations`` argument was given.
        """
        location_id = self.args.location_id
        raw_mutations = self.args.mutations
        lineage = self.args.pangolin_lineage

        mutation_groups = []
        if raw_mutations is not None:
            mutation_groups = [
                group.split(",") for group in raw_mutations.split(" AND ")
            ]
        min_frequency = self.args.frequency

        proportion_columns = (
            "proportion",
            "proportion_ci_lower",
            "proportion_ci_upper",
        )
        results = {}

        # One ES query is issued per mutation group. Because AND queries are
        # possible, a single query with aggregations would be cumbersome.
        # TODO: look for a better solution here.
        for group in mutation_groups:
            lineage_agg = {
                "terms": {"field": "pangolin_lineage", "size": self.size},
                "aggs": {"mutations": {"filter": {}}},
            }
            es_query = {"size": 0, "aggs": {"lineage": lineage_agg}}

            # The location query is rebuilt for every group on purpose: it may
            # be mutated below when the lineage restriction is appended.
            if location_id is not None:
                es_query["query"] = parse_location_id_to_query(location_id)

            if lineage is not None:
                lineage_term = {"term": {"pangolin_lineage": lineage}}
                if "query" not in es_query:
                    es_query["query"] = lineage_term
                else:
                    # A pre-existing query can only be the location one, which
                    # is expected to be a bool query with a "must" list.
                    es_query["query"]["bool"]["must"].append(lineage_term)

            lineage_agg["aggs"]["mutations"]["filter"] = create_nested_mutation_query(
                mutations=group
            )

            response = await self.asynchronous_fetch(es_query)
            buckets = response["aggregations"]["lineage"]["buckets"]

            # Keep lineages with at least one matching record, excluding the
            # placeholder lineage keyed "none".
            rows = [
                {
                    "pangolin_lineage": bucket["key"],
                    "lineage_count": bucket["doc_count"],
                    "mutation_count": bucket["mutations"]["doc_count"],
                }
                for bucket in buckets
                if bucket["mutations"]["doc_count"] > 0 and bucket["key"] != "none"
            ]

            frame = pd.DataFrame(rows)
            if len(frame) > 0:
                prop = calculate_proportion(
                    frame["mutation_count"], frame["lineage_count"]
                )
                # prop[0..2] hold the estimate and its lower/upper CI bounds,
                # judging by the column names they are stored under.
                for position, column in enumerate(proportion_columns):
                    frame.loc[:, column] = prop[position]
                above_threshold = frame["proportion"] >= min_frequency
                frame = frame[above_threshold]

            results[",".join(group)] = frame.to_dict(orient="records")

        return {"success": True, "results": results}