Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move marker plot generation to filter steb for STRs #69

Merged
merged 10 commits into from
Jan 12, 2024
Merged
2 changes: 0 additions & 2 deletions lusSTR/scripts/filter_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,4 @@ def same_size_filter(df, metadata, datatype):
else:
final_df = pd.concat([final_df, df_filt])
final_df = final_df.reset_index(drop=True)
if datatype == "lusplus":
final_df = final_df.drop("CE_Allele", axis=1)
Comment on lines -534 to -535
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ran into a problem with this code because the marker plots use the CE allele for plotting- removing it so it can actually create the plots.

return final_df
16 changes: 11 additions & 5 deletions lusSTR/scripts/marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,12 +388,18 @@ def convert(self):
else:
if "GGGCTGCCTA" in self.uas_sequence:
break_point = self.uas_sequence.index("GGGCTGCCTA") + 10
else:
bracketed_form = (
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} "
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}"
)
elif "TTTT" in self.uas_sequence:
break_point = self.uas_sequence.index("TTTT") + 14
bracketed_form = (
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} "
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}"
)
bracketed_form = (
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} "
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}"
)
else:
bracketed_form = ""
Comment on lines -391 to +402
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once again ran into an issue with this marker- this time, the sequence seemed to be off by 10s of bases on one side (e.g. the sequence started ~50 bases early on the 5' side thus not containing the entire repeat region and was missing the GGGCTGCCTA and the TTTT sequences the code originally searched for). Now the code just basically drops these sequences- they are garbage.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rebecca emphatically declares that these sequences cannot be redeemed.

return bracketed_form


Expand Down
19 changes: 10 additions & 9 deletions lusSTR/scripts/repeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,16 @@ def repeat_copy_number(bf, repeat):
The input is a sequence string collapsed to bracketed sequence form.
"""
longest = 0
for block in bf.split(" "):
if block == repeat:
if 1 > longest:
longest = 1
match = re.match(r"\[" + repeat + r"\](\d+)", block)
if match:
length = int(match.group(1))
if length > longest:
longest = length
if bf != "":
for block in bf.split(" "):
if block == repeat:
if 1 > longest:
longest = 1
match = re.match(r"\[" + repeat + r"\](\d+)", block)
if match:
length = int(match.group(1))
if length > longest:
longest = length
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code is dropping these garbage sequences.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a simple and straightforward change, but I have a suggestion for improvement: instead of indenting the code in a conditional block, short-circuit the inverse condition.

if bf == "":
    return 0
for block in bf.split(" "):
   ### blah blah blah

This has a few benefits. First of all, you see what happens right away if bf is empty: in the current code, you have to read all the way to the end of the block to find out that...nothing happens. Second, it keeps the nesting and indentation to a minimum: this is not only good for legibility, it also (at least in this case) makes for cleaner diffs in code review. My suggestion above would result in a simple two-line diff: the diff for the current code looks like this.

Screenshot 2024-01-11 at 9 39 33 PM

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense! I'll update.

return str(longest)


Expand Down
48 changes: 24 additions & 24 deletions lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele
Sample1,D4S2408,10_10_0,1022,real_allele,,,,,,,
Sample1,D4S2408,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,,
Sample1,D4S2408,8_8_0,1050,real_allele,,,,,,,
Sample1,D8S1179,14_12_1_0,869,real_allele,,,,,,,
Sample1,D8S1179,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212,
Sample1,D8S1179,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201,
Sample1,D9S1122,13_11,948,real_allele,,,,,,,
Sample1,D9S1122,12_10,108,-1_stutter,13_11,,948.0,,,0.114,
Sample1,D9S1122,11_11,991,real_allele,,,,,,,
Sample1,D9S1122,10_10,87,-1_stutter,11_11,,991.0,,,0.088,
Sample1,FGA,23_15_3_0,1436,real_allele,,,,,,,
Sample1,FGA,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182,
Sample1,FGA,21_13_3_0,48,BelowAT,,,,,0.013,,
Sample1,FGA,20_12_3_0,1750,real_allele,,,,,,,
Sample1,FGA,18_10_3_0,181,real_allele,,,,,,,
Sample1,FGA,17_9_3_0,15,BelowAT,,,,,0.004,,
Sample1,PENTA D,15_15,50,real_allele,,,,,,,
Sample1,PENTA D,13_13,1000,real_allele,,,,,,,
Sample1,PENTA E,7_7,505,real_allele,,,,,,,7.0
Sample1,TH01,7_7,2197,real_allele,,,,,,,
Sample1,TH01,6_6,1632,real_allele,,,,,,,
Sample1,TH01,5_5,66,BelowAT,,,,,0.017,,
Sample1,TPOX,11_11,15,BelowAT,,,,,1.0,,11.0
SampleID,Locus,CE_Allele,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter
Sample1,D4S2408,10.0,10_10_0,1022,real_allele,,,,,,
Sample1,D4S2408,9.0,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,
Sample1,D4S2408,8.0,8_8_0,1050,real_allele,,,,,,
Sample1,D8S1179,14.0,14_12_1_0,869,real_allele,,,,,,
Sample1,D8S1179,13.0,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212
Sample1,D8S1179,12.0,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201
Sample1,D9S1122,13.0,13_11,948,real_allele,,,,,,
Sample1,D9S1122,12.0,12_10,108,-1_stutter,13_11,,948.0,,,0.114
Sample1,D9S1122,11.0,11_11,991,real_allele,,,,,,
Sample1,D9S1122,10.0,10_10,87,-1_stutter,11_11,,991.0,,,0.088
Sample1,FGA,23.0,23_15_3_0,1436,real_allele,,,,,,
Sample1,FGA,22.0,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182
Sample1,FGA,21.0,21_13_3_0,48,BelowAT,,,,,0.013,
Sample1,FGA,20.0,20_12_3_0,1750,real_allele,,,,,,
Sample1,FGA,18.0,18_10_3_0,181,real_allele,,,,,,
Sample1,FGA,17.0,17_9_3_0,15,BelowAT,,,,,0.004,
Sample1,PENTA D,15.0,15_15,50,real_allele,,,,,,
Sample1,PENTA D,13.0,13_13,1000,real_allele,,,,,,
Sample1,PENTA E,7.0,7_7,505,real_allele,,,,,,
Sample1,TH01,7.0,7_7,2197,real_allele,,,,,,
Sample1,TH01,6.0,6_6,1632,real_allele,,,,,,
Sample1,TH01,5.0,5_5,66,BelowAT,,,,,0.017,
Sample1,TPOX,11.0,11_11,15,BelowAT,,,,,1.0,
54 changes: 27 additions & 27 deletions lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
Locus,CE Allele,Allele Seq,Reads
D4S2408,8,ATCTATCTATCTATCTATCTATCTATCTATCT,1000
D4S2408,9,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357
D4S2408,10,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900
D8S1179,12,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26
D8S1179,12,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11
D8S1179,13,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95
D8S1179,13,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89
D8S1179,14,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739
D8S1179,14,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130
D9S1122,10,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87
D9S1122,11,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991
D9S1122,12,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108
D9S1122,13,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948
FGA,17,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15
FGA,18,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181
FGA,20,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750
FGA,21,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48
FGA,22,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262
FGA,23,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436
PentaD,13,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000
PentaD,15,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50
PentaE,7,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505
TH01,5,AATGAATGAATGAATGAATG,66
TH01,6,AATGAATGAATGAATGAATGAATG,1632
TH01,7,AATGAATGAATGAATGAATGAATGAATG,2197
TPOX,11,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15
vWA,16,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6
D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,1000
D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357
D4S2408,10.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900
D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26
D8S1179,12.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11
D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95
D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89
D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739
D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130
D9S1122,10.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87
D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991
D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108
D9S1122,13.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948
FGA,17.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15
FGA,18.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181
FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750
FGA,21.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48
FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262
FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436
PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000
PentaD,15.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50
PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505
TH01,5.0,AATGAATGAATGAATGAATG,66
TH01,6.0,AATGAATGAATGAATGAATGAATG,1632
TH01,7.0,AATGAATGAATGAATGAATGAATGAATG,2197
TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15
vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6
15 changes: 3 additions & 12 deletions lusSTR/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,24 +323,15 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path):
assert filecmp.cmp(exp_output, obs_output) is True


@pytest.mark.parametrize(
"sex",
[True, False],
)
def test_marker_plots(sex, tmp_path):
def test_marker_plots(tmp_path):
inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx")
exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf")
sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf")
if sex:
arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile), "--sex"]
else:
arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile)]
arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile)]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist))
snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)]
snakemake_arglist = ["strs", "all", "-w", str(tmp_path)]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist))
assert os.path.exists(exp_output) is True
if sex:
assert os.path.exists(sex_exp) is True


def test_genemarker(tmp_path):
Expand Down
50 changes: 1 addition & 49 deletions lusSTR/wrappers/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

import csv
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
Expand All @@ -23,7 +21,6 @@
from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length
from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n
from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path


Expand Down Expand Up @@ -101,9 +98,8 @@ def format_table(input, software, kit="forenseq"):
]
flanks_list.append(flank_summary)
continue

marker = STRMarkerObject(locus, sequence, software, kit=kit)
if locus == "D12S391" and kit == "powerseq":
if locus == "D12S391" and kit == "powerseq" and software == "straitrazor":
if "." in str(marker.canonical):
check_sr += 1
if check_sr > 10:
Comment on lines -106 to 105
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking through the code I realized if I was going to be running a check on strait razor data I should make sure that strait razor was actually used (and not gene marker).

Expand Down Expand Up @@ -185,48 +181,6 @@ def sort_table(table):
return sorted_table


def marker_plots(df, output_name, sex=False):
Path("MarkerPlots").mkdir(parents=True, exist_ok=True)
df["CE_Allele"] = df["CE_Allele"].astype(float)
for id in df["SampleID"].unique():
sample_id = f"{id}_sexchr" if sex else id
with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf:
make_plot(df, id, sex, sameyaxis=False)
pdf.savefig()
make_plot(df, id, sex)
pdf.savefig()


def make_plot(df, id, sex=False, sameyaxis=True):
sample_df = df[df["SampleID"] == id]
sample_id = f"{id}_sexchr" if sex else id
max_reads = max(sample_df["Reads"])
n = 100 if max_reads > 1000 else 10
max_yvalue = int(math.ceil(max_reads / n)) * n
increase_value = int(math.ceil((max_yvalue / 5)) / n) * n
fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30))
n = 0
for marker in sample_df["Locus"].unique():
n += 1
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n)
ax.bar(marker_df["CE_Allele"], marker_df["Reads"])
if sameyaxis:
ax.set_yticks(np.arange(0, max_yvalue, increase_value))
ax.set_xticks(
np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0)
)
ax.title.set_text(marker)
if sameyaxis:
plt.text(
0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24
)
else:
plt.text(
0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24
)


def main(input, out, kit, software, sex, nocombine):
input = str(input)
out = str(out)
Expand All @@ -248,7 +202,6 @@ def main(input, out, kit, software, sex, nocombine):
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
else:
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
marker_plots(sex_final_table, output_name, sex=True)
if software != "uas":
if not autosomal_final_table.empty:
autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False)
Expand All @@ -260,7 +213,6 @@ def main(input, out, kit, software, sex, nocombine):
autosomal_final_table.to_csv(out, sep="\t", index=False)
else:
autosomal_final_table.to_csv(out, sep="\t", index=False)
marker_plots(autosomal_final_table, output_name)


if __name__ == "__main__":
Expand Down
Loading
Loading