From 82405ee8e1d0eddcf40315888ef3a32eda7b3954 Mon Sep 17 00:00:00 2001
From: Alexander Thomas <77535027+alethomas@users.noreply.github.com>
Date: Thu, 3 Mar 2022 21:44:40 +0100
Subject: [PATCH] fix: VOC mutation table similarity calculation (#484)
* refactor voc table generation
* fmt
* top 10 and minor fix
* add formatting for read depth column
* add read depth, fix prob calc
* fmt
* fix formatting, add exp notation
* Update workflow/scripts/generate-lineage-variant-table.py
Co-authored-by: Thomas Battenfeld <46334240+thomasbtf@users.noreply.github.com>
* fmt
Co-authored-by: Thomas Battenfeld <46334240+thomasbtf@users.noreply.github.com>
---
resources/lineage-variant-table-formatter.js | 64 +++++-
.../scripts/generate-lineage-variant-table.py | 191 ++++++++++--------
2 files changed, 167 insertions(+), 88 deletions(-)
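Note on the probability handling changed below: PROB_ABSENT and PROB_ARTIFACT in the VCF are PHRED-scaled values, and the script converts them back to plain probabilities before combining them. The following is a minimal sketch of that conversion, assuming pandas is available; the example score of 20 is invented for illustration.

import pandas as pd

def phred_to_prob(phred):
    # A PHRED-scaled value Q encodes a probability p via Q = -10 * log10(p),
    # so the inverse transform is p = 10 ** (-Q / 10).
    if phred is None:
        # The patch returns pd.NA here (instead of 0) so that a missing score
        # propagates through the later aggregation as "unknown" rather than
        # silently counting as certainty.
        return pd.NA
    return 10 ** (-phred / 10)

print(phred_to_prob(20))    # 0.01
print(phred_to_prob(None))  # <NA>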
diff --git a/resources/lineage-variant-table-formatter.js b/resources/lineage-variant-table-formatter.js
index 706af1a6a..c980d5682 100644
--- a/resources/lineage-variant-table-formatter.js
+++ b/resources/lineage-variant-table-formatter.js
@@ -101,12 +101,44 @@
}
},
Frequency: function (vaf) {
- var lighting = (0.9 - parseFloat(vaf) * 0.4) * 100;
- return `${vaf}`;
+ vaf = parseFloat(vaf)
+ if (!isNaN(vaf)) {
+ var lighting = (0.9 - vaf * 0.4) * 100;
+ if (vaf < 0.1) {
+ vaf = vaf.toExponential(2)
+ } else {
+ vaf = vaf.toFixed(2)
+ }
+ return `${vaf}`;
+ } else {
+ return " "
+ }
},
Probability: function (prob) {
- var lighting = (0.9 - parseFloat(prob) * 0.4) * 100;
- return `${prob}`;
+ prob = parseFloat(prob)
+ if (!isNaN(prob)) {
+ var lighting = (0.9 - prob * 0.4) * 100;
+ if (prob < 0.1) {
+ prob = prob.toExponential(2)
+ } else {
+ prob = prob.toFixed(2)
+ }
+ return `${prob}`;
+ } else {
+ return " "
+ }
+ },
+ ReadDepth: function (depth) {
+ depth = parseInt(depth)
+ if (!isNaN(depth)) {
+ if (depth < 10) {
+ return `${depth}`;
+ } else {
+ return `${depth}`;
+ }
+ } else {
+ return " "
+ }
},
"lineage helper": function (value) {
if (isNaN(value)) {
@@ -140,7 +172,7 @@
if (value == "x") {
return `${"\u2713"}`;
} else {
- const match = /^(?<parent>.{3})\s(?<version>.+)\s?.*$/i.exec(value);
+ const match = /^(?<parent>.{1,3})\s(?<version>.+)\s?.*$/i.exec(value);
var parent = match.groups?.parent;
var version = match.groups?.version;
@@ -162,7 +194,7 @@
}
}
},
- "Highest similarity": function (value) {
+ "1st": function (value) {
let result = this["lineage helper"](value);
return result;
},
@@ -182,4 +214,24 @@
let result = this["lineage helper"](value);
return result;
},
+ "6th": function (value) {
+ let result = this["lineage helper"](value);
+ return result;
+ },
+ "7th": function (value) {
+ let result = this["lineage helper"](value);
+ return result;
+ },
+ "8th": function (value) {
+ let result = this["lineage helper"](value);
+ return result;
+ },
+ "9th": function (value) {
+ let result = this["lineage helper"](value);
+ return result;
+ },
+ "10th": function (value) {
+ let result = this["lineage helper"](value);
+ return result;
+ },
};
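The formatter above renders Frequency and Probability values in exponential notation when they fall below 0.1, with two decimals otherwise, and leaves the cell blank for non-numeric input. A rough Python analogue of that thresholding, purely for illustration (the function name format_fraction and the exact output strings are my own, not part of the patch):

def format_fraction(value):
    # Rough analogue of the JS Frequency/Probability formatters:
    # exponential notation below 0.1, two decimals otherwise,
    # and a blank cell for anything that does not parse as a number.
    try:
        value = float(value)
    except (TypeError, ValueError):
        return " "
    if value != value:  # NaN never equals itself
        return " "
    return f"{value:.2e}" if value < 0.1 else f"{value:.2f}"

print(format_fraction(0.03567))  # 3.57e-02
print(format_fraction(0.85))     # 0.85
print(format_fraction("n/a"))    # returns a blank cell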
diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py
index b8e8876e8..6784c48a0 100644
--- a/workflow/scripts/generate-lineage-variant-table.py
+++ b/workflow/scripts/generate-lineage-variant-table.py
@@ -16,14 +16,39 @@
def phred_to_prob(phred):
if phred is None:
- return 0
+ return pd.NA
return 10 ** (-phred / 10)
+# np.prod skips NaNs in a pandas Series (an all-NaN Series yields 1.0); applied to a plain list it would return NaN instead
+def prod_prob_not_present(probs):
+ if pd.isna(probs).any():
+ return pd.NA
+ else:
+ return np.prod(probs)
+
+
def has_numbers(inputString):
return any(char.isdigit() for char in inputString)
+def add_number_suffix(number):
+ number = str(number)
+
+ if number.endswith("1") and number != "11":
+ return f"{number}st"
+ elif number.endswith("2") and number != "12":
+ return f"{number}nd"
+ elif number.endswith("3") and number != "13":
+ return f"{number}rd"
+ else:
+ return f"{number}th"
+
+
+def rename_enumeration(list_length):
+ return [add_number_suffix(x) for x in range(1, list_length + 1)]
+
+
variants_df = pd.DataFrame()
lineage_df = pd.DataFrame()
@@ -33,9 +58,12 @@ def has_numbers(inputString):
if "SIGNATURES" in record.info:
signatures = record.info.get("SIGNATURES", ("#ERROR0",))
vaf = record.samples[0]["AF"][0]
+ dp = record.samples[0]["DP"]
prob_not_present = phred_to_prob(
record.info["PROB_ABSENT"][0]
) + phred_to_prob(record.info["PROB_ARTIFACT"][0])
+ if pd.isna(prob_not_present):
+ vaf = pd.NA
lineages = record.info["LINEAGES"]
for signature in signatures:
# generate df with all signatures + VAF and Prob_not_present from calculation
@@ -43,6 +71,7 @@ def has_numbers(inputString):
{
"Mutations": signature,
"Frequency": vaf,
+ "ReadDepth": dp,
"Prob_not_present": prob_not_present,
},
ignore_index=True,
@@ -56,27 +85,15 @@ def has_numbers(inputString):
ignore_index=True,
)
-# count occurences of signatures (x) in lineage columns and get sorted list
-lineage_dict = dict(lineage_df.count())
-lineage_dict = dict(
- sorted(lineage_dict.items(), key=lambda item: item[1], reverse=True)
-)
-top5_lineages = list(lineage_dict.keys())
-
-# only include variant names (index=0) + top 5 variants (index=1-6) and reorder
-lineage_df.drop(labels=top5_lineages[7:], axis=1, inplace=True)
-lineage_df = lineage_df[top5_lineages[:7]]
-
# aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present
-variants_df = (
- variants_df.groupby(["Mutations"])
- .agg(
- func={"Frequency": lambda x: min(sum(x), 1.0), "Prob_not_present": np.prod},
- axis=1,
- )
- .reset_index()
+variants_df = variants_df.groupby(["Mutations"]).agg(
+ func={
+ "Frequency": lambda x: min(sum(x), 1.0),
+ "Prob_not_present": prod_prob_not_present,
+ "ReadDepth": np.min,
+ },
+ axis=1,
)
-pd.set_option("display.max_rows", None)
# new column for 1-prob_not_present = prob_present
variants_df["Probability"] = 1.0 - variants_df["Prob_not_present"]
@@ -88,35 +105,40 @@ def has_numbers(inputString):
lineage_df = lineage_df.replace({"x": 1})
lineage_df = (
lineage_df.groupby(["Mutations"])
- .agg(func={column: np.max for column in top5_lineages[:7]})
+ .agg(func={column: np.max for column in lineage_df.columns})
.reset_index(drop=True)
)
lineage_df = lineage_df.replace({1: "x", 0: ""})
-# calculate Jaccard coefficient for top 5 lineages and save row as df to append after sorting
+# calculate Jaccard coefficient for each lineage
+# iterate over lineages in columns (mutations as index)
+lineage_df.set_index("Mutations", inplace=True)
jaccard_coefficient = {}
-for lineage in range(1, len(top5_lineages[:6])):
- jaccard_coefficient[top5_lineages[lineage]] = round(
- variants_df[
- variants_df["Mutations"].isin(
- lineage_df[lineage_df[top5_lineages[lineage]] == "x"]["Mutations"]
- )
- ]["Prob X VAF"].sum()
- / variants_df["Prob X VAF"].sum(),
+for lineage in lineage_df.columns:
+ lineage_defining_variants = variants_df.index.isin(
+ lineage_df.index[lineage_df[lineage] == "x"]
+ )
+ lineage_defining_non_variants = ~lineage_defining_variants
+ jaccard_coefficient[lineage] = round(
+ (
+ variants_df[lineage_defining_variants]["Prob X VAF"].sum()
+ + variants_df[lineage_defining_non_variants]["Prob_not_present"].sum()
+ )
+ / len(variants_df),
3,
)
+
jaccard_row = pd.DataFrame(
{"Mutations": "Similarity", **jaccard_coefficient}, index=[0]
)
-
# merge variants dataframe and lineage dataframe
-variants_df = variants_df.merge(lineage_df, left_on="Mutations", right_on="Mutations")
+variants_df = variants_df.merge(lineage_df, left_index=True, right_index=True)
# add feature column for sorting
-variants_df["Features"] = variants_df["Mutations"].str.extract(r"(.+)[:].+|\*")
+variants_df["Features"] = variants_df.index.to_series().str.extract(r"(.+)[:].+|\*")
# position of variant for sorting and change type
-variants_df["Position"] = variants_df["Mutations"].str.extract(
+variants_df["Position"] = variants_df.index.to_series().str.extract(
r"(.*:?[A-Z]+|\*$|-)([0-9]+)([A-Z]+$|\*$|-)$"
)[1]
variants_df = variants_df.astype({"Position": "int64"})
@@ -128,70 +150,75 @@ def has_numbers(inputString):
sorterIndex = dict(zip(sorter, range(len(sorter))))
variants_df["Features_Rank"] = variants_df["Features"].map(sorterIndex)
-# define categories for sorting
-variants_df.loc[
- (variants_df[top5_lineages[1]] == "x") & (variants_df["Probability"] >= 0.95),
- "Order",
-] = 0
-variants_df.loc[
- (variants_df[top5_lineages[1]] == "x") & (variants_df["Probability"] <= 0.05),
- "Order",
-] = 1
-variants_df.loc[
- (variants_df[top5_lineages[1]] != "x") & (variants_df["Probability"] >= 0.95),
- "Order",
-] = 2
-variants_df.loc[
- (variants_df[top5_lineages[1]] == "x")
- & ((variants_df["Probability"] > 0.05) & (variants_df["Probability"] < 0.95)),
- "Order",
-] = 3
-variants_df.loc[
- (variants_df[top5_lineages[1]] != "x") & (variants_df["Probability"] <= 0.05),
- "Order",
-] = 4
-variants_df.loc[
- (variants_df[top5_lineages[1]] != "x")
- & ((variants_df["Probability"] > 0.05) & (variants_df["Probability"] < 0.95)),
- "Order",
-] = 5
-
-top5_lineages_row_df = pd.DataFrame(
- {"Mutations": "Lineage", **{x: x for x in top5_lineages[1:6]}}, index=[0]
-)
-
-# sort final DF
-variants_df["Prob X VAF"].replace([0, 0.0], np.NaN, inplace=True)
-variants_df.sort_values(
- by=["Order", "Features_Rank", "Position"],
- ascending=[True, True, True],
- na_position="last",
- inplace=True,
+# row for lineage name after renaming columns (column names can't be formatted)
+lineages_row_df = pd.DataFrame(
+ {
+ "Mutations": "Lineage",
+ **{x: x for x in list(lineage_df.columns) if x != "Mutations"},
+ },
+ index=[0],
)
# concat row with Jaccard coefficient, drop unnecessary columns, sort with Jaccard coefficient, round
-variants_df = pd.concat([jaccard_row, variants_df]).reset_index(drop=True)
-variants_df = pd.concat([top5_lineages_row_df, variants_df]).reset_index(drop=True)
-variants_df = variants_df[
- ["Mutations", "Probability", "Frequency", *top5_lineages[1:6]]
-]
-variants_df = variants_df.round({"Probability": 5, "Frequency": 5})
+variants_df.reset_index(inplace=True)
+variants_df = pd.concat([jaccard_row, variants_df])
+variants_df = pd.concat([lineages_row_df, variants_df])
+
+
+variants_df = variants_df.round({"Probability": 2, "Frequency": 2})
variants_df.set_index("Mutations", inplace=True)
variants_df.sort_values(
by="Similarity", axis=1, na_position="first", ascending=False, inplace=True
)
-# rename top 5 hits
+# rename hits ascending
variants_df.rename(
columns={
x: y
for x, y in zip(
- list(variants_df.columns)[2:],
- ["Highest similarity", "2nd", "3rd", "4th", "5th"],
+ list(variants_df.columns)[8:],
+ rename_enumeration(len(list(variants_df.columns)[8:])),
)
},
errors="raise",
inplace=True,
)
+# sort final DF
+variants_df.loc[
+ variants_df["1st"] == "x",
+ "Order",
+] = 1
+variants_df.loc[
+ variants_df["1st"] != "x",
+ "Order",
+] = 2
+variants_df.at["Similarity", "Order"] = 0
+variants_df.at["Lineage", "Order"] = 0
+variants_df["Prob X VAF"].replace([0, 0.0], np.NaN, inplace=True)
+variants_df.sort_values(
+ by=["Order", "Features_Rank", "Position"],
+ ascending=[True, True, True],
+ na_position="last",
+ inplace=True,
+)
+# drop unwanted columns
+variants_df.drop(
+ columns=[
+ "Prob_not_present",
+ "Prob X VAF",
+ "Features",
+ "Position",
+ "Features_Rank",
+ "Order",
+ ],
+ inplace=True,
+)
+all_columns = variants_df.columns
+first_columns = ["Probability", "Frequency", "ReadDepth"]
+rest_columns = [item for item in all_columns if item not in first_columns]
+variants_df = variants_df[[*first_columns, *rest_columns]]
+
+# drop other lineages, top 10 only
+variants_df.drop(variants_df.columns[13:], axis=1, inplace=True)
# output variant_df
variants_df.to_csv(snakemake.output.variant_table, index=True, sep=",")
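The reworked per-lineage similarity (still labelled Jaccard coefficient in the script) averages, over all observed mutations, the probability mass that agrees with the lineage definition: "Prob X VAF" for lineage-defining mutations and "Prob_not_present" for the remaining ones. Below is a minimal standalone sketch of that calculation using the same column names as the patch; the mutation names and numbers are invented.

import pandas as pd

# Toy variants table with the columns the script computes; values are made up.
variants_df = pd.DataFrame(
    {
        "Prob X VAF": [0.9, 0.8, 0.05],        # probability present * allele frequency
        "Prob_not_present": [0.05, 0.1, 0.9],  # PROB_ABSENT + PROB_ARTIFACT as probabilities
    },
    index=["S:N501Y", "S:E484K", "ORF1a:T265I"],
)

# Boolean mask: which of the observed mutations define the lineage being scored.
lineage_defining = pd.Series([True, True, False], index=variants_df.index)

similarity = round(
    (
        variants_df.loc[lineage_defining, "Prob X VAF"].sum()
        + variants_df.loc[~lineage_defining, "Prob_not_present"].sum()
    )
    / len(variants_df),
    3,
)
print(similarity)  # 0.867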