outbreak-info · mindoftea · Mar 24, 2023 · Mar 27, 2023 · Jun 28, 2023 · Jul 26, 2023
diff --git a/requirements_web.txt b/requirements_web.txt
@@ -2,7 +2,7 @@
 # git+https://github.com/biothings/[email protected]#egg=biothings[web_extra]
 
 biothings[web_extra]==0.12.2
-pandas==1.4.3
+pandas==1.5.3
 pyjwt[crypto]==2.4.0
 scipy==1.9.0
 Jinja2==3.1.2

diff --git a/web/handlers/genomics/prevalence.py b/web/handlers/genomics/prevalence.py
@@ -264,7 +264,7 @@ def _get(self):
         )
         if query_window is not None:
             df_response = df_response[df_response["date"] >= (dt.now() - timedelta(days = query_window))]
-        df_response = get_major_lineage_prevalence(df_response, "date", query_other_exclude, query_other_threshold, query_nday_threshold, query_ndays)
+        df_response = get_major_lineage_prevalence(df_response, "date", None, None, query_other_exclude, query_other_threshold, query_nday_threshold, query_ndays)
         if not query_cumulative:
             df_response = df_response.groupby("lineage").apply(compute_rolling_mean_all_lineages, "date", "lineage_count", "lineage_count_rolling", "lineage").reset_index()
             df_response = df_response.groupby("date").apply(compute_total_count, "lineage_count_rolling", "total_count_rolling")
@@ -274,7 +274,7 @@ def _get(self):
             df_response = df_response.fillna("None")
             df_response = df_response[["date", "total_count", "lineage_count", "lineage", "prevalence", "prevalence_rolling"]]
         else:
-            df_response = df_response.groupby("lineage").apply(expand_dates, df_response["date"].min(), df_response["date"].max(), "date", "lineage").reset_index()
+            df_response = df_response.groupby("lineage", group_keys=False).apply(expand_dates, df_response["date"].min(), df_response["date"].max(), "date", "lineage").reset_index()
             df_response = df_response.groupby("date").apply(compute_total_count, "lineage_count", "total_count").reset_index()
             df_response = df_response.groupby("lineage").agg({"total_count": "sum", "lineage_count": "sum"}).reset_index()
             df_response.loc[:,"prevalence"] = df_response["lineage_count"]/df_response["total_count"]

diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py
@@ -19,7 +19,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col):
         df
         .set_index(index_col)
         .reindex(idx, fill_value = 0)
-        .drop(grp_col, axis = 1)
         .reset_index()
         .rename(
             columns = {
@@ -205,9 +204,14 @@ def classify_other_category(grp, keep_lineages):
     })
     return grp
 
-def get_major_lineage_prevalence(df, index_col, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180):
+def get_major_lineage_prevalence(df, index_col, min_date = None, max_date = None, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180):
     date_limit = dt.today() - timedelta(days = ndays)
-    lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df["date"] >= date_limit)]["lineage"].value_counts()
+    lineages_to_retain = df[
+        (df["prevalence"] >= prevalence_threshold)
+        & (df["date"] >= date_limit)
+        & ((min_date is None) | (df["date"] >= min_date))
+        & ((max_date is None) | (df["date"] <= max_date))
+    ]["lineage"].value_counts()
     num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0]
     if num_unique_dates < nday_threshold:
         nday_threshold = round((nday_threshold/ndays) * num_unique_dates)

diff --git a/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py b/web/handlers/v2/genomics/prevalence_all_lineages_by_location.py
@@ -41,6 +41,8 @@ async def _get(self):
             query_other_exclude.split(",") if query_other_exclude is not None else []
         )
         query_cumulative = self.args.cumulative
+        min_date = self.args.min_date
+        max_date = self.args.max_date
         query = {
             "size": 0,
             "aggs": {
@@ -54,7 +56,7 @@ async def _get(self):
         }
         query_obj = parse_location_id_to_query(query_location)
         date_range_filter = create_date_range_filter(
-            "date_collected", self.args.min_date, self.args.max_date
+            "date_collected", min_date, max_date
         )
         query_obj = parse_time_window_to_query(date_range_filter, query_obj=query_obj)
         if query_obj:
@@ -98,6 +100,8 @@ async def _get(self):
         df_response = get_major_lineage_prevalence(
             df_response,
             "date",
+            min_date,
+            max_date,
             query_other_exclude,
             query_other_threshold,
             query_nday_threshold,
@@ -138,7 +142,7 @@ async def _get(self):
             ]
         else:
             df_response = (
-                df_response.groupby("lineage")
+                df_response.groupby("lineage", group_keys=False)
                 .apply(
                     expand_dates,
                     df_response["date"].min(),