Skip to content

Commit

Permalink
Merge pull request #6 from newgene/prevalence-by-position
Browse files Browse the repository at this point in the history
Add validation rules to PrevalenceByAAPositionHandler
  • Loading branch information
newgene authored Feb 22, 2023
2 parents d68f9e3 + 16e87b0 commit d1156bf
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 1 deletion.
4 changes: 4 additions & 0 deletions config_web/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
r"/{pre}/{ver}/prevalence-by-location-all-lineages",
"web.handlers.v2.genomics.PrevalenceAllLineagesByLocationHandler",
),
(
r"/{pre}/{ver}/prevalence-by-position",
"web.handlers.v2.genomics.PrevalenceByAAPositionHandler",
),
]

APP_LIST = [
Expand Down
3 changes: 2 additions & 1 deletion web/handlers/v2/genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
from .lineage import LineageHandler
from .lineage_mutations import LineageMutationsHandler
from .location import LocationHandler
from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler
from .prevalence_all_lineages_by_location import PrevalenceAllLineagesByLocationHandler
from .prevalence_by_aa_position import PrevalenceByAAPositionHandler
from .prevalence_by_location_and_time import PrevalenceByLocationAndTimeHandler
145 changes: 145 additions & 0 deletions web/handlers/v2/genomics/prevalence_by_aa_position.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import pandas as pd

from web.handlers.genomics.base import BaseHandler
from web.handlers.genomics.util import compute_rolling_mean, parse_location_id_to_query


class PrevalenceByAAPositionHandler(BaseHandler):
    """Tornado handler for the ``prevalence-by-position`` endpoint.

    For a given gene and amino-acid (codon) position, returns — per
    collection date — the count and prevalence of every amino acid
    observed at that position, plus a rolling-mean prevalence computed
    per amino acid. Results can optionally be filtered by location
    (``location_id``) and/or Pango lineage (``pangolin_lineage``).
    """

    # Endpoint name used for routing (registered in config_web/genomics.py
    # as r"/{pre}/{ver}/prevalence-by-position").
    name = "prevalence-by-position"
    # Copy the base handler's kwargs so we extend, not mutate, the shared dict,
    # then declare GET-parameter validation rules.
    kwargs = dict(BaseHandler.kwargs)
    kwargs["GET"] = {
        "pangolin_lineage": {"type": str, "default": None},
        "location_id": {"type": str, "default": None},
        "name": {"type": str, "required": True},  # legacy "GENE:POSITION" combined string; should be deprecated
        "gene": {
            "type": str,
            "required": False,
        },  # replacement of name parameter for validator applying
        "position": {
            "type": int,
            "required": False,
        },  # replacement of name parameter for validator applying
    }

    async def _get(self):
        """Handle GET: compute per-date amino-acid prevalence at one codon position.

        Returns a dict ``{"success": True, "results": [...]}`` where each
        result record has ``date``, ``total_count``, ``aa``, ``aa_count``,
        ``prevalence`` and ``prevalence_rolling`` fields. ``results`` is
        empty when no mutation record exists at the requested position.
        """
        query_str = self.args.name  # legacy combined "GENE:POSITION" value
        query_location = self.args.location_id
        query_lineage = self.args.pangolin_lineage
        query_gene = self.args.gene
        if not query_gene:
            # Fall back to parsing the gene out of the legacy "name" parameter.
            query_gene = query_str.split(":")[0]
        query_aa_position = self.args.position
        if query_aa_position is None:
            # NOTE(review): assumes "name" contains a ":" separator — a value
            # without one raises IndexError here; confirm upstream validation.
            query_aa_position = int(query_str.split(":")[1])
        # Get ref codon: fetch one matching nested mutation document to learn
        # the reference amino acid ("ref_aa") at this gene/position.
        query = {
            "size": 0,
            "aggs": {
                "by_mutations": {
                    "nested": {"path": "mutations"},
                    "aggs": {
                        "inner": {
                            "filter": {
                                "bool": {
                                    "must": [
                                        {"match": {"mutations.codon_num": query_aa_position}},
                                        {"match": {"mutations.gene": query_gene}},
                                    ]
                                }
                            },
                            # top_hits size 1: any single matching doc suffices
                            # to read the reference amino acid.
                            "aggs": {"by_nested": {"top_hits": {"size": 1}}},
                        }
                    },
                }
            },
        }
        resp = await self.asynchronous_fetch(query)
        tmp_ref = resp["aggregations"]["by_mutations"]["inner"]["by_nested"]["hits"]["hits"]
        dict_response = []
        if len(tmp_ref) > 0:
            ref_aa = tmp_ref[0]["_source"]["ref_aa"]
            # Second query: bucket by collection date, then count sequences per
            # alternate amino acid at the requested gene/position.
            query = {
                "aggs": {
                    "by_date": {
                        "terms": {"field": "date_collected", "size": self.size},
                        "aggs": {
                            "by_mutations": {
                                "nested": {"path": "mutations"},
                                "aggs": {
                                    "inner": {
                                        "filter": {
                                            "bool": {
                                                "must": [
                                                    {
                                                        "match": {
                                                            "mutations.codon_num": query_aa_position
                                                        }
                                                    },
                                                    {"match": {"mutations.gene": query_gene}},
                                                ]
                                            }
                                        },
                                        "aggs": {
                                            "by_name": {"terms": {"field": "mutations.alt_aa"}}
                                        },
                                    }
                                },
                            }
                        },
                    }
                }
            }
            if query_location is not None:
                # NOTE(review): query["aggs"] defines only "by_date" — there is
                # no "prevalence" key, so this lookup raises KeyError whenever
                # location_id is supplied. Likely copied from another handler;
                # confirm the intended second argument to
                # parse_location_id_to_query and fix accordingly.
                query["query"] = parse_location_id_to_query(
                    query_location, query["aggs"]["prevalence"]["filter"]
                )
            if query_lineage is not None:
                if "query" in query:
                    # NOTE(review): assumes any pre-existing query clause has a
                    # bool/must shape — verify parse_location_id_to_query
                    # returns that structure.
                    query["query"]["bool"]["must"].append(
                        {"term": {"pangolin_lineage": query_lineage}}
                    )
                else:
                    query["query"] = {"term": {"pangolin_lineage": query_lineage}}
            resp = await self.asynchronous_fetch(query)
            # Walk down to the per-date buckets of the aggregation response.
            buckets = resp
            path_to_results = ["aggregations", "by_date", "buckets"]
            for i in path_to_results:
                buckets = buckets[i]
            # Flatten ES buckets into one record per (date, amino acid).
            flattened_response = []
            for d in buckets:
                alt_count = 0
                for m in d["by_mutations"]["inner"]["by_name"]["buckets"]:
                    # Skip the literal string "None" bucket; those sequences
                    # end up attributed to the reference amino acid below.
                    if m["key"] == "None":
                        continue
                    flattened_response.append(
                        {
                            "date": d["key"],
                            "total_count": d["doc_count"],
                            "aa": m["key"],
                            "aa_count": m["doc_count"],
                        }
                    )
                    alt_count += m["doc_count"]
                # Remaining sequences (date total minus all alternates) carry
                # the reference amino acid at this position.
                flattened_response.append(
                    {
                        "date": d["key"],
                        "total_count": d["doc_count"],
                        "aa": ref_aa,
                        "aa_count": d["doc_count"] - alt_count,
                    }
                )
            # Build the result frame: parse dates, compute per-record
            # prevalence, and sort chronologically before smoothing.
            df_response = (
                pd.DataFrame(flattened_response)
                .assign(
                    date=lambda x: pd.to_datetime(x["date"], format="%Y-%m-%d"),
                    prevalence=lambda x: x["aa_count"] / x["total_count"],
                )
                .sort_values("date")
            )
            # Rolling-mean prevalence is computed independently per amino acid.
            df_response = df_response.groupby("aa").apply(
                compute_rolling_mean, "date", "prevalence", "prevalence_rolling"
            )
            # Serialize dates back to ISO "YYYY-MM-DD" strings for the payload.
            df_response.loc[:, "date"] = df_response["date"].apply(lambda x: x.strftime("%Y-%m-%d"))
            dict_response = df_response.to_dict(orient="records")
        resp = {"success": True, "results": dict_response}
        return resp

0 comments on commit d1156bf

Please sign in to comment.