From 1f46215ae7054c4da088c638ad82e41dd0dc7227 Mon Sep 17 00:00:00 2001
From: akikuno <akikuno@users.noreply.github.com>
Date: Fri, 12 Jul 2024 17:07:38 +0900
Subject: [PATCH] Update `consensus.call_consensus`: For mutations determined
 to be sequence errors, we previously replaced them with unknown (`N`), but
 this `N` had low interpretability. Therefore, mutations that DAJIN2
 determines to be sequence errors will now be assigned the same base as the
 reference genome.

---
 src/DAJIN2/core/consensus/consensus.py | 37 +++++++++-----------------
 src/DAJIN2/core/core.py                |  2 +-
 tests/src/consensus/test_consensus.py  | 19 +++----------
 3 files changed, 17 insertions(+), 41 deletions(-)

diff --git a/src/DAJIN2/core/consensus/consensus.py b/src/DAJIN2/core/consensus/consensus.py
index 7ff65e7..cea2fca 100644
--- a/src/DAJIN2/core/consensus/consensus.py
+++ b/src/DAJIN2/core/consensus/consensus.py
@@ -13,7 +13,9 @@
 ###########################################################
 
 
-def convert_to_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) -> list[dict[str, float]]:
+def convert_to_percentage(
+    cssplits: list[list[str]], mutation_loci: list[set[str]], sequence: str
+) -> list[dict[str, float]]:
     """
     Convert sequences and mutations into percentages, annotating sequence errors as "SEQERROR".
     """
@@ -22,13 +24,13 @@ def convert_to_percentage(cssplits: list[list[str]], mutation_loci: list[set[str
     coverage = len(cssplits_transposed[0])
 
     cons_percentage = []
-    for cs_transposed, mut_loci in zip(cssplits_transposed, mutation_loci):
+    for cs_transposed, mut_loci, nucleotide in zip(cssplits_transposed, mutation_loci, sequence):
         count_cs = defaultdict(float)
         for cs in cs_transposed:
             operator = cs[0]
             if operator in {"+", "-", "*"}:
                 if operator not in mut_loci:
-                    cs = "SEQERROR"
+                    cs = f"={nucleotide.upper()}"
             count_cs[cs] += 1 / coverage * 100
         cons_percentage.append(dict(count_cs))
 
@@ -44,23 +46,6 @@ def remove_all_n(cons_percentage: list[dict[str, float]]) -> list[dict[str, floa
     return cons_percentage
 
 
-def replace_sequence_error(cons_percentage: list[dict[str, float]]) -> list[dict[str, float]]:
-    """
-    Replace sequence error as distributing according to proportion of cs tags
-    If a dictionary contains only "SEQERROR", it is replaced with {"N": 100}. Otherwise, "SEQERROR" is removed.
-    """
-    cons_percentage_replaced = []
-    for cons_per in cons_percentage:
-        # Replace a dictionary containing only "SEQERROR" with {"N": 100}
-        if len(cons_per) == 1 and "SEQERROR" in cons_per:
-            cons_percentage_replaced.append({"N": 100})
-            continue
-        cons_per.pop("SEQERROR", None)
-        cons_percentage_replaced.append(cons_per)
-
-    return cons_percentage_replaced
-
-
 def adjust_to_100_percent(cons_percentage: list[dict[str, float]]) -> list[dict[str, float]]:
     adjusted_percentages = []
 
@@ -74,13 +59,12 @@ def adjust_to_100_percent(cons_percentage: list[dict[str, float]]) -> list[dict[
     return adjusted_percentages
 
 
-def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) -> list[dict[str, float]]:
+def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]], sequence: str) -> list[dict[str, float]]:
     """call position weight matrix in defferent loci.
     - non defferent loci are annotated to "Match" or "Unknown(N)"
     """
-    cons_percentage = convert_to_percentage(cssplits, mutation_loci)
+    cons_percentage = convert_to_percentage(cssplits, mutation_loci, sequence)
     cons_percentage = remove_all_n(cons_percentage)
-    cons_percentage = replace_sequence_error(cons_percentage)
     return adjust_to_100_percent(cons_percentage)
 
 
@@ -96,7 +80,9 @@ class ConsensusKey:
     percent: float
 
 
-def call_consensus(tempdir: Path, sample_name: str, clust_sample: list[dict]) -> tuple[dict[list], dict[str]]:
+def call_consensus(
+    tempdir: Path, sample_name: str, fasta_alleles: dict[str, str], clust_sample: list[dict]
+) -> tuple[dict[list], dict[str]]:
     clust_sample.sort(key=lambda x: [x["ALLELE"], x["LABEL"]])
 
     cons_percentages = {}
@@ -104,12 +90,13 @@ def call_consensus(tempdir: Path, sample_name: str, clust_sample: list[dict]) ->
 
     for (allele, label), group in groupby(clust_sample, key=lambda x: [x["ALLELE"], x["LABEL"]]):
         clust = list(group)
+        sequence = fasta_alleles[allele]
 
         path_consensus = Path(tempdir, sample_name, "consensus", allele, str(label))
         cons_mutation_loci = io.load_pickle(Path(path_consensus, "mutation_loci.pickle"))
 
         cssplits = [cs["CSSPLIT"].split(",") for cs in clust]
-        cons_percentage = call_percentage(cssplits, cons_mutation_loci)
+        cons_percentage = call_percentage(cssplits, cons_mutation_loci, sequence)
 
         key = ConsensusKey(allele, label, clust[0]["PERCENT"])
         cons_percentages[key] = cons_percentage
diff --git a/src/DAJIN2/core/core.py b/src/DAJIN2/core/core.py
index 892f2f9..93f210d 100644
--- a/src/DAJIN2/core/core.py
+++ b/src/DAJIN2/core/core.py
@@ -189,7 +189,7 @@ def execute_sample(arguments: dict):
 
     consensus.cache_mutation_loci(ARGS, clust_downsampled)
 
-    cons_percentage, cons_sequence = consensus.call_consensus(ARGS.tempdir, ARGS.sample_name, clust_downsampled)
+    cons_percentage, cons_sequence = consensus.call_consensus(ARGS.tempdir, ARGS.sample_name, ARGS.fasta_alleles, clust_downsampled)
 
     allele_names = consensus.call_allele_name(cons_sequence, cons_percentage, ARGS.fasta_alleles)
     cons_percentage = consensus.update_key_by_allele_name(cons_percentage, allele_names)
diff --git a/tests/src/consensus/test_consensus.py b/tests/src/consensus/test_consensus.py
index 9f49a8e..354cac4 100644
--- a/tests/src/consensus/test_consensus.py
+++ b/tests/src/consensus/test_consensus.py
@@ -1,20 +1,8 @@
 from src.DAJIN2.core.consensus.consensus import (
     adjust_to_100_percent,
     call_percentage,
-    replace_sequence_error,
 )
 
-###########################################################
-# replace_sequence
-###########################################################
-
-
-def test_replace_sequence_error():
-    cons_percentage = [{"A": 25, "C": 25, "SEQERROR": 50}, {"SEQERROR": 100}]
-    expected_output = [{"A": 25, "C": 25}, {"N": 100}]
-    assert replace_sequence_error(cons_percentage) == expected_output
-
-
 ###########################################################
 # adjust_to_100_percent
 ###########################################################
@@ -38,13 +26,14 @@ def test_adjust_to_100_percent_float():
 
 
 def test_call_percentage():
-    cssplits = [["+A|=C", "-T", "=C", "=A", "=T"], ["-C", "=T", "=C", "*AT", "*AT"]]
+    cssplits = [["+A|=C", "-T", "=C", "=A", "=T"], ["-C", "=T", "=C", "*AT", "*TA"]]
     mutation_loci = [{"+", "-"}, {"-"}, {}, {}, {"*"}]
+    sequence = "CTCAT"
     expected_output = [
         {"+A|=C": 50.0, "-C": 50.0},
         {"-T": 50.0, "=T": 50.0},
         {"=C": 100.0},
         {"=A": 100.0},
-        {"=T": 50.0, "*AT": 50.0},
+        {"=T": 50.0, "*TA": 50.0},
     ]
-    assert call_percentage(cssplits, mutation_loci) == expected_output
+    assert call_percentage(cssplits, mutation_loci, sequence) == expected_output