-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Move marker plot generation to filter steb for STRs #69
Changes from 7 commits
506690e
2a6fd83
504c142
677f2c5
fdc0086
f0873dc
3a24edf
922a539
e534317
3d478ac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -388,12 +388,18 @@ def convert(self): | |
else: | ||
if "GGGCTGCCTA" in self.uas_sequence: | ||
break_point = self.uas_sequence.index("GGGCTGCCTA") + 10 | ||
else: | ||
bracketed_form = ( | ||
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " | ||
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" | ||
) | ||
elif "TTTT" in self.uas_sequence: | ||
break_point = self.uas_sequence.index("TTTT") + 14 | ||
bracketed_form = ( | ||
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " | ||
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" | ||
) | ||
bracketed_form = ( | ||
f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " | ||
f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" | ||
) | ||
else: | ||
bracketed_form = "" | ||
Comment on lines
-391
to
+402
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once again ran into an issue with this marker- this time, the sequence seemed to be off by 10s of bases on one side (e.g. the sequence started ~50 bases early on the 5' side thus not containing the entire repeat region and was missing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rebecca emphatically declares that these sequences cannot be redeemed. |
||
return bracketed_form | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,15 +163,16 @@ def repeat_copy_number(bf, repeat): | |
The input is a sequence string collapsed to bracketed sequence form. | ||
""" | ||
longest = 0 | ||
for block in bf.split(" "): | ||
if block == repeat: | ||
if 1 > longest: | ||
longest = 1 | ||
match = re.match(r"\[" + repeat + r"\](\d+)", block) | ||
if match: | ||
length = int(match.group(1)) | ||
if length > longest: | ||
longest = length | ||
if bf != "": | ||
for block in bf.split(" "): | ||
if block == repeat: | ||
if 1 > longest: | ||
longest = 1 | ||
match = re.match(r"\[" + repeat + r"\](\d+)", block) | ||
if match: | ||
length = int(match.group(1)) | ||
if length > longest: | ||
longest = length | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Code is dropping these garbage sequences. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was a simple and straightforward change, but I have a suggestion for improvement: instead of indenting the code in a conditional block, short-circuit the inverse condition. if bf == "":
return 0
for block in bf.split(" "):
### blah blah blah This has a few benefits. First of all, you see what happens right away if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. makes sense! I'll update. |
||
return str(longest) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,24 @@ | ||
SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele | ||
Sample1,D4S2408,10_10_0,1022,real_allele,,,,,,, | ||
Sample1,D4S2408,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,, | ||
Sample1,D4S2408,8_8_0,1050,real_allele,,,,,,, | ||
Sample1,D8S1179,14_12_1_0,869,real_allele,,,,,,, | ||
Sample1,D8S1179,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212, | ||
Sample1,D8S1179,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201, | ||
Sample1,D9S1122,13_11,948,real_allele,,,,,,, | ||
Sample1,D9S1122,12_10,108,-1_stutter,13_11,,948.0,,,0.114, | ||
Sample1,D9S1122,11_11,991,real_allele,,,,,,, | ||
Sample1,D9S1122,10_10,87,-1_stutter,11_11,,991.0,,,0.088, | ||
Sample1,FGA,23_15_3_0,1436,real_allele,,,,,,, | ||
Sample1,FGA,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182, | ||
Sample1,FGA,21_13_3_0,48,BelowAT,,,,,0.013,, | ||
Sample1,FGA,20_12_3_0,1750,real_allele,,,,,,, | ||
Sample1,FGA,18_10_3_0,181,real_allele,,,,,,, | ||
Sample1,FGA,17_9_3_0,15,BelowAT,,,,,0.004,, | ||
Sample1,PENTA D,15_15,50,real_allele,,,,,,, | ||
Sample1,PENTA D,13_13,1000,real_allele,,,,,,, | ||
Sample1,PENTA E,7_7,505,real_allele,,,,,,,7.0 | ||
Sample1,TH01,7_7,2197,real_allele,,,,,,, | ||
Sample1,TH01,6_6,1632,real_allele,,,,,,, | ||
Sample1,TH01,5_5,66,BelowAT,,,,,0.017,, | ||
Sample1,TPOX,11_11,15,BelowAT,,,,,1.0,,11.0 | ||
SampleID,Locus,CE_Allele,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter | ||
Sample1,D4S2408,10.0,10_10_0,1022,real_allele,,,,,, | ||
Sample1,D4S2408,9.0,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, | ||
Sample1,D4S2408,8.0,8_8_0,1050,real_allele,,,,,, | ||
Sample1,D8S1179,14.0,14_12_1_0,869,real_allele,,,,,, | ||
Sample1,D8S1179,13.0,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 | ||
Sample1,D8S1179,12.0,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 | ||
Sample1,D9S1122,13.0,13_11,948,real_allele,,,,,, | ||
Sample1,D9S1122,12.0,12_10,108,-1_stutter,13_11,,948.0,,,0.114 | ||
Sample1,D9S1122,11.0,11_11,991,real_allele,,,,,, | ||
Sample1,D9S1122,10.0,10_10,87,-1_stutter,11_11,,991.0,,,0.088 | ||
Sample1,FGA,23.0,23_15_3_0,1436,real_allele,,,,,, | ||
Sample1,FGA,22.0,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 | ||
Sample1,FGA,21.0,21_13_3_0,48,BelowAT,,,,,0.013, | ||
Sample1,FGA,20.0,20_12_3_0,1750,real_allele,,,,,, | ||
Sample1,FGA,18.0,18_10_3_0,181,real_allele,,,,,, | ||
Sample1,FGA,17.0,17_9_3_0,15,BelowAT,,,,,0.004, | ||
Sample1,PENTA D,15.0,15_15,50,real_allele,,,,,, | ||
Sample1,PENTA D,13.0,13_13,1000,real_allele,,,,,, | ||
Sample1,PENTA E,7.0,7_7,505,real_allele,,,,,, | ||
Sample1,TH01,7.0,7_7,2197,real_allele,,,,,, | ||
Sample1,TH01,6.0,6_6,1632,real_allele,,,,,, | ||
Sample1,TH01,5.0,5_5,66,BelowAT,,,,,0.017, | ||
Sample1,TPOX,11.0,11_11,15,BelowAT,,,,,1.0, |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,28 @@ | ||
Locus,CE Allele,Allele Seq,Reads | ||
D4S2408,8,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 | ||
D4S2408,9,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 | ||
D4S2408,10,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 | ||
D8S1179,12,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 | ||
D8S1179,12,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 | ||
D8S1179,13,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 | ||
D8S1179,13,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 | ||
D8S1179,14,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 | ||
D8S1179,14,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 | ||
D9S1122,10,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 | ||
D9S1122,11,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 | ||
D9S1122,12,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 | ||
D9S1122,13,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 | ||
FGA,17,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 | ||
FGA,18,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 | ||
FGA,20,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 | ||
FGA,21,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 | ||
FGA,22,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 | ||
FGA,23,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 | ||
PentaD,13,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 | ||
PentaD,15,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 | ||
PentaE,7,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 | ||
TH01,5,AATGAATGAATGAATGAATG,66 | ||
TH01,6,AATGAATGAATGAATGAATGAATG,1632 | ||
TH01,7,AATGAATGAATGAATGAATGAATGAATG,2197 | ||
TPOX,11,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 | ||
vWA,16,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 | ||
D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 | ||
D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 | ||
D4S2408,10.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 | ||
D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 | ||
D8S1179,12.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 | ||
D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 | ||
D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 | ||
D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 | ||
D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 | ||
D9S1122,10.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 | ||
D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 | ||
D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 | ||
D9S1122,13.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 | ||
FGA,17.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 | ||
FGA,18.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 | ||
FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 | ||
FGA,21.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 | ||
FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 | ||
FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 | ||
PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 | ||
PentaD,15.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 | ||
PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 | ||
TH01,5.0,AATGAATGAATGAATGAATG,66 | ||
TH01,6.0,AATGAATGAATGAATGAATGAATG,1632 | ||
TH01,7.0,AATGAATGAATGAATGAATGAATGAATG,2197 | ||
TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 | ||
vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,8 +12,6 @@ | |
|
||
import csv | ||
import json | ||
import math | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import os | ||
import pandas as pd | ||
|
@@ -23,7 +21,6 @@ | |
from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length | ||
from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n | ||
from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed | ||
from matplotlib.backends.backend_pdf import PdfPages | ||
from pathlib import Path | ||
|
||
|
||
|
@@ -101,9 +98,8 @@ def format_table(input, software, kit="forenseq"): | |
] | ||
flanks_list.append(flank_summary) | ||
continue | ||
|
||
marker = STRMarkerObject(locus, sequence, software, kit=kit) | ||
if locus == "D12S391" and kit == "powerseq": | ||
if locus == "D12S391" and kit == "powerseq" and software == "straitrazor": | ||
if "." in str(marker.canonical): | ||
check_sr += 1 | ||
if check_sr > 10: | ||
Comment on lines
-106
to
105
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking through the code I realized if I was going to be running a check on strait razor data I should make sure that strait razor was actually used (and not gene marker). |
||
|
@@ -185,48 +181,6 @@ def sort_table(table): | |
return sorted_table | ||
|
||
|
||
def marker_plots(df, output_name, sex=False): | ||
Path("MarkerPlots").mkdir(parents=True, exist_ok=True) | ||
df["CE_Allele"] = df["CE_Allele"].astype(float) | ||
for id in df["SampleID"].unique(): | ||
sample_id = f"{id}_sexchr" if sex else id | ||
with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: | ||
make_plot(df, id, sex, sameyaxis=False) | ||
pdf.savefig() | ||
make_plot(df, id, sex) | ||
pdf.savefig() | ||
|
||
|
||
def make_plot(df, id, sex=False, sameyaxis=True): | ||
sample_df = df[df["SampleID"] == id] | ||
sample_id = f"{id}_sexchr" if sex else id | ||
max_reads = max(sample_df["Reads"]) | ||
n = 100 if max_reads > 1000 else 10 | ||
max_yvalue = int(math.ceil(max_reads / n)) * n | ||
increase_value = int(math.ceil((max_yvalue / 5)) / n) * n | ||
fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30)) | ||
n = 0 | ||
for marker in sample_df["Locus"].unique(): | ||
n += 1 | ||
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") | ||
ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n) | ||
ax.bar(marker_df["CE_Allele"], marker_df["Reads"]) | ||
if sameyaxis: | ||
ax.set_yticks(np.arange(0, max_yvalue, increase_value)) | ||
ax.set_xticks( | ||
np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0) | ||
) | ||
ax.title.set_text(marker) | ||
if sameyaxis: | ||
plt.text( | ||
0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24 | ||
) | ||
else: | ||
plt.text( | ||
0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24 | ||
) | ||
|
||
|
||
def main(input, out, kit, software, sex, nocombine): | ||
input = str(input) | ||
out = str(out) | ||
|
@@ -248,7 +202,6 @@ def main(input, out, kit, software, sex, nocombine): | |
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) | ||
else: | ||
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) | ||
marker_plots(sex_final_table, output_name, sex=True) | ||
if software != "uas": | ||
if not autosomal_final_table.empty: | ||
autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False) | ||
|
@@ -260,7 +213,6 @@ def main(input, out, kit, software, sex, nocombine): | |
autosomal_final_table.to_csv(out, sep="\t", index=False) | ||
else: | ||
autosomal_final_table.to_csv(out, sep="\t", index=False) | ||
marker_plots(autosomal_final_table, output_name) | ||
|
||
|
||
if __name__ == "__main__": | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ran into a problem with this code because the marker plots use the CE allele for plotting- removing it so it can actually create the plots.