From 6a8155ec0b227da9907fb709c909afdb35571375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ann-Kathrin=20Br=C3=BCggemann?= <90249112+AKBrueggemann@users.noreply.github.com> Date: Tue, 4 Jan 2022 08:43:04 +0100 Subject: [PATCH] fix: relative aggregation sample plot (#412) * Changed plots occurrence threshold to relative * fmt * Name changed: number of occurrences in plot * fmt * Changed threshold values for variant/lineage plots * Changed way of thresholding lineages/variants * fmt * Changed name for other occ. in plot Co-authored-by: Thomas Battenfeld <46334240+thomasbtf@users.noreply.github.com> --- workflow/scripts/plot-lineages-over-time.py | 15 +++++++++++---- workflow/scripts/plot-variants-over-time.py | 14 ++++++++++---- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/plot-lineages-over-time.py b/workflow/scripts/plot-lineages-over-time.py index bea202b12..17fd019f3 100644 --- a/workflow/scripts/plot-lineages-over-time.py +++ b/workflow/scripts/plot-lineages-over-time.py @@ -34,10 +34,17 @@ def plot_lineages_over_time(sm_input, sm_output, dates, sm_output_table): pangolin_calls["lineage_count"] = pd.Series() # mask low occurrences - threshold = len(pangolin_calls) / 10 - pangolin_calls.loc[pangolin_calls["lineage_count"] < threshold, "lineage"] = ( - "other (<" + str(threshold) + " occ.)" - ) + print(pangolin_calls["lineage"].value_counts()) + df = pd.DataFrame(pangolin_calls["lineage"].value_counts()) + df.sort_values(by=["lineage"]) + if len(df.index) > 10: + pangolin_calls.loc[ + ~df.head(10).isin(pangolin_calls["lineage"]), "lineage" + ] = "other occ." + else: + pangolin_calls.loc[ + pangolin_calls["lineage_count"] < 0, "lineage" + ] = "other occ." 
pangolin_calls.rename(columns={"lineage": "Lineage", "date": "Date"}, inplace=True) diff --git a/workflow/scripts/plot-variants-over-time.py b/workflow/scripts/plot-variants-over-time.py index 4be2b776a..7520402cc 100644 --- a/workflow/scripts/plot-variants-over-time.py +++ b/workflow/scripts/plot-variants-over-time.py @@ -84,10 +84,16 @@ def plot_variants_over_time(sm_output, sm_output_table): ].transform(lambda s: s.count()) # mask low occurrences - threshold = len(calls) / 10 - calls.loc[calls["total occurrence"] < threshold, "alteration"] = ( - "other (<" + str(threshold) + " occ.)" - ) + print(calls["alteration"].value_counts()) + df = pd.DataFrame(calls["alteration"].value_counts()) + df.sort_values(by=["alteration"]) + if len(df.index) > 10: + # print(calls.loc[calls["alteration"].isin(df.head(10).index)]) + calls.loc[ + ~calls["alteration"].isin(df.head(10).index), "alteration" + ] = "other occ." + else: + calls.loc[calls["total occurrence"] < 0, "alteration"] = "other occ." calls.rename(columns={"alteration": "Alteration", "date": "Date"}, inplace=True)