Skip to content

Commit

Permalink
Identifying indels in flanking region for PowerSeq sequences (#70)
Browse files Browse the repository at this point in the history
  • Loading branch information
rnmitchell authored Feb 6, 2024
1 parent 03aa9e2 commit d55a2b2
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 3 deletions.
4 changes: 4 additions & 0 deletions lusSTR/tests/data/test_indels.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Locus,Total_Reads,Sequence,SampleID,Project,Analysis
PentaD,44,GAGCCATGATCACACCACTACACTCCAGCCTAGGTGACAGAGCAAGACACCATCTCAAGAAAGAAAAAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAAACGAAGGGGAAAAAAAGAGAATCATAAACATAAATGTAAAATTTCTCAAAAAAATCGTTA,A01,test,test
vWA,36,GATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAATCCAAGTCACATACTGATTATTCTTATCATCCACTAGGGCTTTCACATCTCAGCCAAGTCAACTTGGATCCTCTAGACCTGTTTCTTCTTCTGGAA,A01,test,test
D7S820,109,AGAATTGCACCACATATTGGTAATTAAATGTTTACTATAGACTATTTAGTGAGATAAAAAAAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCGTTAGTTCGTTCTAAACTATGACAAGTGTTCTATCATACCCTTTATATATATTAACCTTAAAATAACTC,A01,test,test
4 changes: 4 additions & 0 deletions lusSTR/tests/data/test_indels.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence UAS_Output_Bracketed_Notation Forward_Strand_Bracketed_Notation CE_Allele LUS LUS_Plus Reads
A01 test test VWA TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA TCTA [TCTG]4 [TCTA]8 TCCA TCTA TAGA TGGA [TAGA]8 [CAGA]4 TAGA 13 13_8 13_8_4_1 36
A01 test test PENTA D AAAAGAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGA AAAAG [AAAGA]5 AAAAG [AAAGA]5 5 5_5 5_5 44
A01 test test D7S820 GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATC [GATA]8 GACA GATT GATA GTTT AAAC TATC AATC TGTC [TATC]8 8 8_8 8_8_1_0 109
23 changes: 23 additions & 0 deletions lusSTR/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,26 @@ def test_genemarker(tmp_path):
print(exp_output)
obs_output = str(tmp_path / f"genemarker_test{ext}")
assert filecmp.cmp(exp_output, obs_output) is True


def test_deletions(tmp_path):
input = data_file("test_indels.csv")
exp_output = data_file("test_indels.txt")
obs_output = str(tmp_path / "test_indels.txt")
arglist = [
"config",
"-w",
str(tmp_path),
"--out",
"test_indels",
"--input",
str(input),
"--analysis-software",
"genemarker",
"--powerseq",
]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist))
shutil.copyfile(input, os.path.join(str(tmp_path), "test_indels.csv"))
snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist))
assert filecmp.cmp(exp_output, obs_output) is True
45 changes: 42 additions & 3 deletions lusSTR/wrappers/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n
from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed
from pathlib import Path
from warnings import warn


with open(get_str_metadata_file(), "r") as fh:
Expand Down Expand Up @@ -107,10 +108,17 @@ def format_table(input, software, kit="forenseq"):
"Multiple microvariants identified at D12 locus. "
"Please check STRait Razor version!!"
)
print(msg)
warn(msg)
indel_flag = marker.indel_flag
if indel_flag == "Possible indel or partial sequence":
if locus == "PENTA D" and kit == "powerseq":
marker = check_pentad(marker, sequence, software)
elif locus == "D7S820" and kit == "powerseq":
marker = check_D7(marker, sequence, software)
elif locus == "VWA" and kit == "powerseq":
marker = check_vwa(marker, sequence, software)
summary = [sampleid, project, analysis, locus] + marker.summary + [reads]
list_of_lists.append(summary)

if software != "uas":
flank_summary = [
sampleid,
Expand All @@ -123,7 +131,7 @@ def format_table(input, software, kit="forenseq"):
marker.flank_5p,
marker.convert,
marker.flank_3p,
marker.indel_flag,
indel_flag,
]
flanks_list.append(flank_summary)

Expand Down Expand Up @@ -168,6 +176,37 @@ def format_table(input, software, kit="forenseq"):
return final_output, final_flank_output, columns


def check_pentad(marker, sequence, software):
new_marker = marker
if marker.summary[1][:4] == "AAAG" and marker.flank_5p[-5:] == "GAAAA":
new_sequence = f"{sequence[:66]}-{sequence[66:]}"
new_marker = STRMarkerObject("PENTA D", new_sequence, software, kit="powerseq")
elif marker.summary[1][-4:] == "AAAG" and marker.flank_3p[:7] == "AAAAA A":
new_sequence = f"{sequence}-"
new_marker = STRMarkerObject("PENTA D", new_sequence, software, kit="powerseq")
else:
return marker
return new_marker


def check_D7(marker, sequence, software):
if marker.summary[1][:3] == "AAC" and marker.flank_5p[-8:] == "T AAAAAA":
new_sequence = f"{sequence[:60]}-{sequence[60:]}"
new_marker = STRMarkerObject("D7S820", new_sequence, software, kit="powerseq")
else:
return marker
return new_marker


def check_vwa(marker, sequence, software):
if sequence[:4] == "GATA":
new_sequence = f"{sequence[:1]}-{sequence[1:]}"
new_marker = STRMarkerObject("VWA", new_sequence, software, kit="powerseq")
else:
return marker
return new_marker


def combine_reads(table, columns):
comb_table = table.groupby(columns[:-1], as_index=False)["Reads"].sum()
sorted = sort_table(comb_table)
Expand Down

0 comments on commit d55a2b2

Please sign in to comment.