From af86c4b8e4d50936d35190fab8de8aab896c1e3c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 16:23:52 -0700 Subject: [PATCH 1/4] remove nucToNumber in GetCanonicalMotif --- trtools/utils/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/trtools/utils/utils.py b/trtools/utils/utils.py index f7328f81..f5a02284 100644 --- a/trtools/utils/utils.py +++ b/trtools/utils/utils.py @@ -387,9 +387,9 @@ def GetCanonicalMotif(repseq): repseq_r = GetCanonicalOneStrand(ReverseComplement(repseq)) # choose first seq alphabetically for i in range(len(repseq_f)): - if nucToNumber[repseq_f[i]] < nucToNumber[repseq_r[i]]: + if repseq_f[i] < repseq_r[i]: return repseq_f - if nucToNumber[repseq_r[i]] < nucToNumber[repseq_f[i]]: + if repseq_r[i] < repseq_f[i]: return repseq_r return repseq_f @@ -420,9 +420,9 @@ def GetCanonicalOneStrand(repseq): for i in range(size): newseq = repseq[size-i:]+repseq[0:size-i] for j in range(size): - if nucToNumber[newseq[j]] < nucToNumber[canonical[j]]: + if newseq[j] < canonical[j]: canonical = newseq - elif nucToNumber[newseq[j]] > nucToNumber[canonical[j]]: + elif newseq[j] > canonical[j]: break return canonical From 66bf81f8271da2eced2a088f59b2c750eae1ebd2 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 23:39:57 +0000 Subject: [PATCH 2/4] remove nucToNumber and add tests for IUPAC codes --- trtools/utils/tests/test_utils.py | 2 ++ trtools/utils/utils.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/trtools/utils/tests/test_utils.py b/trtools/utils/tests/test_utils.py index 12097042..ec0c4d37 100644 --- a/trtools/utils/tests/test_utils.py +++ b/trtools/utils/tests/test_utils.py @@ -115,6 +115,8 @@ def test_GetCanonicalMotif(): assert(utils.GetCanonicalMotif("TTGTT")=="AAAAC") assert(utils.GetCanonicalMotif("")=="") assert(utils.GetCanonicalMotif("cag")=="AGC") + 
assert(utils.GetCanonicalMotif("AARRG")=="AARRG") + assert(utils.GetCanonicalMotif("YARRG")=="ARRGY") # GetCanonicalOneStrand def test_GetCanonicalOneStrand(): diff --git a/trtools/utils/utils.py b/trtools/utils/utils.py index f5a02284..602845f3 100644 --- a/trtools/utils/utils.py +++ b/trtools/utils/utils.py @@ -14,7 +14,6 @@ import trtools.utils.common as common # pragma: no cover -nucToNumber={"A":0,"C":1,"G":2,"T":3} def LoadSingleReader( vcf_loc: str, From c1f78eeb73393a9642398911b9c585496b173af4 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Nov 2024 05:23:01 +0000 Subject: [PATCH 3/4] also handle iupac codes in ReverseComplement() --- trtools/utils/tests/test_utils.py | 12 ++++++++++++ trtools/utils/utils.py | 24 +++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/trtools/utils/tests/test_utils.py b/trtools/utils/tests/test_utils.py index ec0c4d37..9ae4f39a 100644 --- a/trtools/utils/tests/test_utils.py +++ b/trtools/utils/tests/test_utils.py @@ -128,6 +128,13 @@ def test_GetCanonicalOneStrand(): assert(utils.GetCanonicalOneStrand("TTGTT")=="GTTTT") assert(utils.GetCanonicalOneStrand("")=="") assert(utils.GetCanonicalOneStrand("at")=="AT") + # Additional tests with IUPAC codes + assert(utils.GetCanonicalOneStrand("RY")=="RY") + assert(utils.GetCanonicalOneStrand("YR")=="RY") + assert(utils.GetCanonicalOneStrand("SW")=="SW") + assert(utils.GetCanonicalOneStrand("WS")=="SW") + assert(utils.GetCanonicalOneStrand("KM")=="KM") + assert(utils.GetCanonicalOneStrand("MK")=="KM") # ReverseComplement def test_ReverseComplement(): @@ -135,6 +142,11 @@ def test_ReverseComplement(): assert(utils.ReverseComplement("")=="") assert(utils.ReverseComplement("CGNT")=="ANCG") assert(utils.ReverseComplement("ccga")=="TCGG") + # additional tests with IUPAC codes + assert(utils.ReverseComplement("RYASWKM")=="KMWSTRY") + # also test sequences that equal their own reverse complement +
assert(utils.ReverseComplement("BDHV")=="BDHV") + assert(utils.ReverseComplement("N")=="N") # InferRepeatSequence def test_InferRepeatSequence(): diff --git a/trtools/utils/utils.py b/trtools/utils/utils.py index 602845f3..71ef86af 100644 --- a/trtools/utils/utils.py +++ b/trtools/utils/utils.py @@ -428,7 +428,7 @@ def GetCanonicalOneStrand(repseq): def ReverseComplement(seq): r"""Get reverse complement of a sequence. - Converts everything to uppsercase. + Converts everything to uppercase and handles IUPAC codes. Parameters ---------- @@ -444,21 +444,19 @@ def ReverseComplement(seq): -------- >>> ReverseComplement("AGGCT") 'AGCCT' + >>> ReverseComplement("AGGCTRY") + 'RAGCCT' """ + iupac_complement = { + 'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', + 'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', + 'K': 'M', 'M': 'K', 'B': 'V', 'D': 'H', + 'H': 'D', 'V': 'B', 'N': 'N' + } seq = seq.upper() newseq = "" - size = len(seq) - for i in range(len(seq)): - char = seq[len(seq)-i-1] - if char == "A": - newseq += "T" - elif char == "G": - newseq += "C" - elif char == "C": - newseq += "G" - elif char == "T": - newseq += "A" - else: newseq += "N" + for char in reversed(seq): + newseq += iupac_complement.get(char, 'N') return newseq def InferRepeatSequence(seq, period): From 916bdc0737b9b04b611ded32dea05bc444bf36c8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Nov 2024 22:32:59 -0700 Subject: [PATCH 4/4] update test case in docstring of ReverseComplement --- trtools/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trtools/utils/utils.py b/trtools/utils/utils.py index 71ef86af..2ec249d4 100644 --- a/trtools/utils/utils.py +++ b/trtools/utils/utils.py @@ -445,7 +445,7 @@ def ReverseComplement(seq): >>> ReverseComplement("AGGCT") 'AGCCT' >>> ReverseComplement("AGGCTRY") - 'RAGCCT' + 'RYAGCCT' """ iupac_complement = { 'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',