From 82405ee8e1d0eddcf40315888ef3a32eda7b3954 Mon Sep 17 00:00:00 2001 From: Alexander Thomas <77535027+alethomas@users.noreply.github.com> Date: Thu, 3 Mar 2022 21:44:40 +0100 Subject: [PATCH] fix: VOC mutation table similarity calculation (#484) * refactor voc table generation * fmt * top 10 and minor fix * add formatting for read depth column * add read depth, fix prob calc * fmt * fix formatting, add exp notation * Update workflow/scripts/generate-lineage-variant-table.py Co-authored-by: Thomas Battenfeld <46334240+thomasbtf@users.noreply.github.com> * fmt Co-authored-by: Thomas Battenfeld <46334240+thomasbtf@users.noreply.github.com> --- resources/lineage-variant-table-formatter.js | 64 +++++- .../scripts/generate-lineage-variant-table.py | 191 ++++++++++-------- 2 files changed, 167 insertions(+), 88 deletions(-) diff --git a/resources/lineage-variant-table-formatter.js b/resources/lineage-variant-table-formatter.js index 706af1a6a..c980d5682 100644 --- a/resources/lineage-variant-table-formatter.js +++ b/resources/lineage-variant-table-formatter.js @@ -101,12 +101,44 @@ } }, Frequency: function (vaf) { - var lighting = (0.9 - parseFloat(vaf) * 0.4) * 100; - return `${vaf}`; + vaf = parseFloat(vaf) + if (!isNaN(vaf)) { + var lighting = (0.9 - vaf * 0.4) * 100; + if (vaf < 0.1) { + vaf = vaf.toExponential(2) + } else { + vaf = vaf.toFixed(2) + } + return `${vaf}`; + } else { + return " " + } }, Probability: function (prob) { - var lighting = (0.9 - parseFloat(prob) * 0.4) * 100; - return `${prob}`; + prob = parseFloat(prob) + if (!isNaN(prob)) { + var lighting = (0.9 - prob * 0.4) * 100; + if (prob < 0.1) { + prob = prob.toExponential(2) + } else { + prob = prob.toFixed(2) + } + return `${prob}`; + } else { + return " " + } + }, + ReadDepth: function (depth) { + depth = parseInt(depth) + if (!isNaN(depth)) { + if (depth < 10) { + return `${depth}`; + } else { + return `${depth}`; + } + } else { + return " " + } }, "lineage helper": function (value) { if (isNaN(value)) { @@ -140,7 +172,7 @@ if (value == "x") { return `${"\u2713"}`; } else { - const match = /^(?.{3})\s(?.+)\s?.*$/i.exec(value); + const match = /^(?.{1,3})\s(?.+)\s?.*$/i.exec(value); var parent = match.groups?.parent; var version = match.groups?.version; @@ -162,7 +194,7 @@ } } }, - "Highest similarity": function (value) { + "1st": function (value) { let result = this["lineage helper"](value); return result; }, @@ -182,4 +214,24 @@ let result = this["lineage helper"](value); return result; }, + "6th": function (value) { + let result = this["lineage helper"](value); + return result; + }, + "7th": function (value) { + let result = this["lineage helper"](value); + return result; + }, + "8th": function (value) { + let result = this["lineage helper"](value); + return result; + }, + "9th": function (value) { + let result = this["lineage helper"](value); + return result; + }, + "10th": function (value) { + let result = this["lineage helper"](value); + return result; + }, }; diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index b8e8876e8..6784c48a0 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -16,14 +16,39 @@ def phred_to_prob(phred): if phred is None: - return 0 + return pd.NA return 10 ** (-phred / 10) +# np.prod returns 1's as values for a pd series with NaN's. A list would return NaN's +def prod_prob_not_present(probs): + if pd.isna(probs).any(): + return pd.NA + else: + return np.prod(probs) + + def has_numbers(inputString): return any(char.isdigit() for char in inputString) +def add_number_suffix(number): + number = str(number) + + if number.endswith("1") and number != "11": + return f"{number}st" + elif number.endswith("2") and number != "12": + return f"{number}nd" + elif number.endswith("3") and number != "13": + return f"{number}rd" + else: + return f"{number}th" + + +def rename_enumeration(list_length): + return [add_number_suffix(x) for x in range(1, list_length + 1)] + + variants_df = pd.DataFrame() lineage_df = pd.DataFrame() @@ -33,9 +58,12 @@ def has_numbers(inputString): if "SIGNATURES" in record.info: signatures = record.info.get("SIGNATURES", ("#ERROR0",)) vaf = record.samples[0]["AF"][0] + dp = record.samples[0]["DP"] prob_not_present = phred_to_prob( record.info["PROB_ABSENT"][0] ) + phred_to_prob(record.info["PROB_ARTIFACT"][0]) + if pd.isna(prob_not_present): + vaf = pd.NA lineages = record.info["LINEAGES"] for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation @@ -43,6 +71,7 @@ def has_numbers(inputString): { "Mutations": signature, "Frequency": vaf, + "ReadDepth": dp, "Prob_not_present": prob_not_present, }, ignore_index=True, @@ -56,27 +85,15 @@ def has_numbers(inputString): ignore_index=True, ) -# count occurences of signatures (x) in lineage columns and get sorted list -lineage_dict = dict(lineage_df.count()) -lineage_dict = dict( - sorted(lineage_dict.items(), key=lambda item: item[1], reverse=True) -) -top5_lineages = list(lineage_dict.keys()) - -# only include variant names (index=0) + top 5 variants (index=1-6) and reorder -lineage_df.drop(labels=top5_lineages[7:], axis=1, inplace=True) -lineage_df = lineage_df[top5_lineages[:7]] - # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present -variants_df = ( - variants_df.groupby(["Mutations"]) - .agg( - func={"Frequency": lambda x: min(sum(x), 1.0), "Prob_not_present": np.prod}, - axis=1, - ) - .reset_index() +variants_df = variants_df.groupby(["Mutations"]).agg( + func={ + "Frequency": lambda x: min(sum(x), 1.0), + "Prob_not_present": prod_prob_not_present, + "ReadDepth": np.min, + }, + axis=1, ) -pd.set_option("display.max_rows", None) # new column for 1-prob_not_present = prob_present variants_df["Probability"] = 1.0 - variants_df["Prob_not_present"] @@ -88,35 +105,40 @@ def has_numbers(inputString): lineage_df = lineage_df.replace({"x": 1}) lineage_df = ( lineage_df.groupby(["Mutations"]) - .agg(func={column: np.max for column in top5_lineages[:7]}) + .agg(func={column: np.max for column in lineage_df.columns}) .reset_index(drop=True) ) lineage_df = lineage_df.replace({1: "x", 0: ""}) -# calculate Jaccard coefficient for top 5 lineages and save row as df to append after sorting +# calculate Jaccard coefficient for each lineage +# iterate over lineages in columns (mutations as index) +lineage_df.set_index("Mutations", inplace=True) jaccard_coefficient = {} -for lineage in range(1, len(top5_lineages[:6])): - jaccard_coefficient[top5_lineages[lineage]] = round( - variants_df[ - variants_df["Mutations"].isin( - lineage_df[lineage_df[top5_lineages[lineage]] == "x"]["Mutations"] - ) - ]["Prob X VAF"].sum() - / variants_df["Prob X VAF"].sum(), +for lineage in lineage_df.columns: + lineage_defining_variants = variants_df.index.isin( + lineage_df.index[lineage_df[lineage] == "x"] + ) + lineage_defining_non_variants = ~lineage_defining_variants + jaccard_coefficient[lineage] = round( + ( + variants_df[lineage_defining_variants]["Prob X VAF"].sum() + + variants_df[lineage_defining_non_variants]["Prob_not_present"].sum() + ) + / len(variants_df), 3, ) + jaccard_row = pd.DataFrame( {"Mutations": "Similarity", **jaccard_coefficient}, index=[0] ) - # merge variants dataframe and lineage dataframe -variants_df = variants_df.merge(lineage_df, left_on="Mutations", right_on="Mutations") +variants_df = variants_df.merge(lineage_df, left_index=True, right_index=True) # add feature column for sorting -variants_df["Features"] = variants_df["Mutations"].str.extract(r"(.+)[:].+|\*") +variants_df["Features"] = variants_df.index.to_series().str.extract(r"(.+)[:].+|\*") # position of variant for sorting and change type -variants_df["Position"] = variants_df["Mutations"].str.extract( +variants_df["Position"] = variants_df.index.to_series().str.extract( r"(.*:?[A-Z]+|\*$|-)([0-9]+)([A-Z]+$|\*$|-)$" )[1] variants_df = variants_df.astype({"Position": "int64"}) @@ -128,70 +150,75 @@ def has_numbers(inputString): sorterIndex = dict(zip(sorter, range(len(sorter)))) variants_df["Features_Rank"] = variants_df["Features"].map(sorterIndex) -# define categories for sorting -variants_df.loc[ - (variants_df[top5_lineages[1]] == "x") & (variants_df["Probability"] >= 0.95), - "Order", -] = 0 -variants_df.loc[ - (variants_df[top5_lineages[1]] == "x") & (variants_df["Probability"] <= 0.05), - "Order", -] = 1 -variants_df.loc[ - (variants_df[top5_lineages[1]] != "x") & (variants_df["Probability"] >= 0.95), - "Order", -] = 2 -variants_df.loc[ - (variants_df[top5_lineages[1]] == "x") - & ((variants_df["Probability"] > 0.05) & (variants_df["Probability"] < 0.95)), - "Order", -] = 3 -variants_df.loc[ - (variants_df[top5_lineages[1]] != "x") & (variants_df["Probability"] <= 0.05), - "Order", -] = 4 -variants_df.loc[ - (variants_df[top5_lineages[1]] != "x") - & ((variants_df["Probability"] > 0.05) & (variants_df["Probability"] < 0.95)), - "Order", -] = 5 - -top5_lineages_row_df = pd.DataFrame( - {"Mutations": "Lineage", **{x: x for x in top5_lineages[1:6]}}, index=[0] -) - -# sort final DF -variants_df["Prob X VAF"].replace([0, 0.0], np.NaN, inplace=True) -variants_df.sort_values( - by=["Order", "Features_Rank", "Position"], - ascending=[True, True, True], - na_position="last", - inplace=True, +# row for lineage name after renaming columns (column names can't be formatted) +lineages_row_df = pd.DataFrame( + { + "Mutations": "Lineage", + **{x: x for x in list(lineage_df.columns) if x != "Mutations"}, + }, + index=[0], ) # concat row with Jaccard coefficient, drop unneccesary columns, sort with Jaccard coefficient, round -variants_df = pd.concat([jaccard_row, variants_df]).reset_index(drop=True) -variants_df = pd.concat([top5_lineages_row_df, variants_df]).reset_index(drop=True) -variants_df = variants_df[ - ["Mutations", "Probability", "Frequency", *top5_lineages[1:6]] -] -variants_df = variants_df.round({"Probability": 5, "Frequency": 5}) +variants_df.reset_index(inplace=True) +variants_df = pd.concat([jaccard_row, variants_df]) +variants_df = pd.concat([lineages_row_df, variants_df]) + + +variants_df = variants_df.round({"Probability": 2, "Frequency": 2}) variants_df.set_index("Mutations", inplace=True) variants_df.sort_values( by="Similarity", axis=1, na_position="first", ascending=False, inplace=True ) -# rename top 5 hits +# rename hits ascending variants_df.rename( columns={ x: y for x, y in zip( - list(variants_df.columns)[2:], - ["Highest similarity", "2nd", "3rd", "4th", "5th"], + list(variants_df.columns)[8:], + rename_enumeration(len(list(variants_df.columns)[8:])), ) }, errors="raise", inplace=True, ) +# sort final DF +variants_df.loc[ + variants_df["1st"] == "x", + "Order", +] = 1 +variants_df.loc[ + variants_df["1st"] != "x", + "Order", +] = 2 +variants_df.at["Similarity", "Order"] = 0 +variants_df.at["Lineage", "Order"] = 0 +variants_df["Prob X VAF"].replace([0, 0.0], np.NaN, inplace=True) +variants_df.sort_values( + by=["Order", "Features_Rank", "Position"], + ascending=[True, True, True], + na_position="last", + inplace=True, +) +# drop unwanted columns +variants_df.drop( + columns=[ + "Prob_not_present", + "Prob X VAF", + "Features", + "Position", + "Features_Rank", + "Order", + ], + inplace=True, +) +all_columns = variants_df.columns +first_columns = ["Probability", "Frequency", "ReadDepth"] +rest_columns = [item for item in all_columns if item not in first_columns] +variants_df = variants_df[[*first_columns, *rest_columns]] + +# drop other lineages, top 10 only +variants_df.drop(variants_df.columns[13:], axis=1, inplace=True) # output variant_df variants_df.to_csv(snakemake.output.variant_table, index=True, sep=",")