diff --git a/requirements_web.txt b/requirements_web.txt index 356804a2..ff6dc0ce 100644 --- a/requirements_web.txt +++ b/requirements_web.txt @@ -2,7 +2,7 @@ # git+https://github.com/biothings/biothings.api@0.12.x#egg=biothings[web_extra] biothings[web_extra]==0.12.2 -pandas==1.4.3 +pandas==1.5.3 pyjwt[crypto]==2.4.0 scipy==1.9.0 Jinja2==3.1.2 diff --git a/web/handlers/genomics/prevalence.py b/web/handlers/genomics/prevalence.py index 520d7b30..77e390b9 100644 --- a/web/handlers/genomics/prevalence.py +++ b/web/handlers/genomics/prevalence.py @@ -264,7 +264,7 @@ def _get(self): ) if query_window is not None: df_response = df_response[df_response["date"] >= (dt.now() - timedelta(days = query_window))] - df_response = get_major_lineage_prevalence(df_response, "date", query_other_exclude, query_other_threshold, query_nday_threshold, query_ndays) + df_response = get_major_lineage_prevalence(df_response, "date", None, None, query_other_exclude, query_other_threshold, query_nday_threshold, query_ndays) if not query_cumulative: df_response = df_response.groupby("lineage").apply(compute_rolling_mean_all_lineages, "date", "lineage_count", "lineage_count_rolling", "lineage").reset_index() df_response = df_response.groupby("date").apply(compute_total_count, "lineage_count_rolling", "total_count_rolling") @@ -274,7 +274,7 @@ def _get(self): df_response = df_response.fillna("None") df_response = df_response[["date", "total_count", "lineage_count", "lineage", "prevalence", "prevalence_rolling"]] else: - df_response = df_response.groupby("lineage").apply(expand_dates, df_response["date"].min(), df_response["date"].max(), "date", "lineage").reset_index() + df_response = df_response.groupby("lineage", group_keys=False).apply(expand_dates, df_response["date"].min(), df_response["date"].max(), "date", "lineage").reset_index() df_response = df_response.groupby("date").apply(compute_total_count, "lineage_count", "total_count").reset_index() df_response = df_response.groupby("lineage").agg({"total_count": "sum", "lineage_count": "sum"}).reset_index() df_response.loc[:,"prevalence"] = df_response["lineage_count"]/df_response["total_count"] diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 2eaeb36d..8c2be2d3 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -19,7 +19,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col): df .set_index(index_col) .reindex(idx, fill_value = 0) - .drop(grp_col, axis = 1) .reset_index() .rename( columns = { @@ -205,9 +204,14 @@ def classify_other_category(grp, keep_lineages): }) return grp -def get_major_lineage_prevalence(df, index_col, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): +def get_major_lineage_prevalence(df, index_col, min_date = None, max_date = None, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): date_limit = dt.today() - timedelta(days = ndays) - lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df["date"] >= date_limit)]["lineage"].value_counts() + lineages_to_retain = df[ + (df["prevalence"] >= prevalence_threshold) + & (df["date"] >= date_limit) + & ((min_date is None) | (df["date"] >= min_date)) + & ((max_date is None) | (df["date"] <= max_date)) + ]["lineage"].value_counts() num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] if num_unique_dates < nday_threshold: nday_threshold = round((nday_threshold/ndays) * num_unique_dates) diff --git a/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py b/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py index 3c4318fe..ea5c31cc 100644 --- a/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py +++ b/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py @@ -41,6 +41,8 @@ async def _get(self): query_other_exclude.split(",") if query_other_exclude is not None else [] ) query_cumulative = self.args.cumulative + min_date = self.args.min_date + max_date = self.args.max_date query = { "size": 0, "aggs": { @@ -54,7 +56,7 @@ async def _get(self): } query_obj = parse_location_id_to_query(query_location) date_range_filter = create_date_range_filter( - "date_collected", self.args.min_date, self.args.max_date + "date_collected", min_date, max_date ) query_obj = parse_time_window_to_query(date_range_filter, query_obj=query_obj) if query_obj: @@ -98,6 +100,8 @@ async def _get(self): df_response = get_major_lineage_prevalence( df_response, "date", + min_date, + max_date, query_other_exclude, query_other_threshold, query_nday_threshold, @@ -138,7 +142,7 @@ async def _get(self): ] else: df_response = ( - df_response.groupby("lineage") + df_response.groupby("lineage", group_keys=False) .apply( expand_dates, df_response["date"].min(),