Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Snakemake workflow #52

Merged
merged 39 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0dde64c
created snakefile
rnmitchell Mar 9, 2023
b7d759c
cleaned up code and fixed rule running x2
rnmitchell Mar 9, 2023
88eeb8c
fixed double run error
rnmitchell Mar 9, 2023
85cadca
updated code to accommodate EFM different outputs [skip ci]
rnmitchell Mar 10, 2023
f114939
fixed failing tests
rnmitchell Mar 10, 2023
1eff1e2
reformatted snakemake
rnmitchell Mar 28, 2023
835f352
fixed main error
rnmitchell Mar 29, 2023
c4c4986
fixing errors [skip ci]
rnmitchell Mar 29, 2023
5e34778
fixed str scripts
rnmitchell Apr 4, 2023
f638e21
fixed STR cli issue [skip ci]
rnmitchell Apr 4, 2023
656de31
added command for copy config file to working dir
rnmitchell Apr 4, 2023
fbd4841
updated manifest and setup.py [skip ci]
rnmitchell Apr 4, 2023
72ad24c
added target for cli [skip ci]
rnmitchell Apr 5, 2023
9869f4d
added option to change config file using cl arguments
rnmitchell Apr 5, 2023
52ea09c
updated clis [skip ci]
rnmitchell Apr 5, 2023
9c32166
updating code to accommodate nocombine option [skip ci]
rnmitchell Apr 14, 2023
09e6faa
updated test_repeat.py
rnmitchell Apr 17, 2023
5f87672
updated format tests [skip ci]
rnmitchell Apr 17, 2023
f468e57
updated test_marker.py [skip ci]
rnmitchell Apr 17, 2023
eeee192
fixed all tests for test_suite.py [skip ci]
rnmitchell Apr 18, 2023
913750a
fixed error in test file [skip ci]
rnmitchell Apr 18, 2023
f78b560
updated snakefile to extract sampleIDs from format or annot output file
rnmitchell Apr 18, 2023
6a59b6e
updated snakefile to assign correct output name for filter step [skip…
rnmitchell Apr 18, 2023
ea3efa1
update tests [skip ci]
rnmitchell Apr 18, 2023
ec99cd7
fixed format and removed separate from annot.py [skip ci]
rnmitchell Apr 18, 2023
82bc6ae
removed separate from config for annotate step [skip ci]
rnmitchell Apr 18, 2023
176672e
updated config file [skip ci]
rnmitchell Apr 19, 2023
effe222
remove .DS_Store file [skip ci]
rnmitchell Apr 19, 2023
d5af9bd
updated README [skip ci]
rnmitchell Apr 19, 2023
7cb34b9
add error message for snps workflow, update setup.py [skip ci]
rnmitchell Apr 20, 2023
a0bbbba
removed old argument from config.py [skip ci]
rnmitchell Apr 20, 2023
ea86844
changed all "annotate" to "convert" [skip ci]
rnmitchell Apr 20, 2023
26433e0
updated default config [skip ci]
rnmitchell Apr 20, 2023
1fec516
banishing annotate and anno in tests and test files [skip ci]
rnmitchell Apr 20, 2023
83ddecf
missed files [skip ci]
rnmitchell Apr 20, 2023
bce9efc
NotImplementedError (change to trigger a CI build)
Apr 21, 2023
d483f8e
pinned pandas version and deselected snp tests
rnmitchell Apr 21, 2023
93f9ed3
Merge branch 'snakemake' of https://github.com/bioforensics/lusSTR in…
rnmitchell Apr 21, 2023
abbfc30
added missing test files
rnmitchell Apr 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lusSTR/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
%YAML 1.2
---

## general settings
uas: True ## True/False; if run through UAS
sex: False ## True/False; include sex-chromosome STRs
output: "test/output_test" ## output file/directory name; Example: "test_030923"

##format settings
samp_input: "/Users/rebecca.mitchell/Documents/Human/lusSTR/lusSTR/tests/data/UAS_bulk_input/" ## input directory or sample

##annotate settings
kit: "forenseq" ## forenseq/powerseq
nocombine: False ## True/False; do not combine identical sequences (if using STRait Razor data)
separate: False ## True/False; create individual files for each sample

##filter settings
output_type: "efm" ## strmix/efm
profile_type: "evidence" ## evidence/reference
data_type: "ce" ## ce/ngs
info: True ## True/False; create allele information file
filter_sep: False ##True/False; for EFM only, if True will create individual files for samples; if False, will create one file with all samples
nofilters: False ##True/False; skip all filtering steps
38 changes: 17 additions & 21 deletions lusSTR/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def EFM_output(profile, outfile, profile_type, separate=False):
profile = profile[profile.allele_type != "BelowAT"]
efm_profile = populate_efm_profile(profile)
if separate:
write_sample_specific_efm_profiles(efm_profile, profile_type)
write_sample_specific_efm_profiles(efm_profile, profile_type, outfile)
else:
write_aggregate_efm_profile(efm_profile, profile_type, outfile)

Expand Down Expand Up @@ -156,13 +156,13 @@ def populate_efm_profile(profile):
return efm_profile


def write_sample_specific_efm_profiles(efm_profile, profile_type, outdir="Separated_EFM_Files"):
Path(outdir).mkdir(exist_ok=True)
def write_sample_specific_efm_profiles(efm_profile, profile_type, outdir):
Path(outdir).mkdir(parents=True, exist_ok=True)
for sample in efm_profile.SampleName:
sample_profile = efm_profile[efm_profile.SampleName == sample]
sample_profile = efm_profile[efm_profile.SampleName == sample].reset_index(drop=True)
sample_profile.dropna(axis=1, how="all", inplace=True)
if profile_type == "evidence":
sample_profile.to_csv(f"Separated_EFM_Files/{sample}.csv", index=False)
sample_profile.to_csv(f"{outdir}/{sample}_evidence_ce.csv", index=False)
else:
num_alleles = (len(sample_profile.columns) - 2) / 2
if num_alleles > 2:
Expand All @@ -175,18 +175,19 @@ def write_sample_specific_efm_profiles(efm_profile, profile_type, outdir="Separa
for i in range(len(sample_profile)):
if pd.isna(sample_profile.loc[i, "Allele2"]):
sample_profile.loc[i, "Allele2"] = sample_profile.loc[i, "Allele1"]
sample_profile.iloc[:, :4].to_csv(f"Separated_EFM_Files/{id}.csv", index=False)
sample_profile.iloc[:, :4].to_csv(f"{outdir}/{sample}_reference_ce.csv", index=False)


def write_aggregate_efm_profile(efm_profile, profile_type, outfile):
Path(outfile).mkdir(parents=True, exist_ok=True)
name = os.path.basename(outfile)
if profile_type == "evidence":
efm_profile.to_csv(outfile, index=False)
efm_profile.to_csv(f"{outfile}/{name}_evidence_ce.csv", index=False)
else:
for i in range(len(efm_profile)):
if pd.isna(efm_profile.loc[i, "Allele2"]):
efm_profile.loc[i, "Allele2"] = efm_profile.loc[i, "Allele1"]
prefix = outfile.replace(".csv", "")
efm_profile.iloc[:, :4].to_csv(f"{prefix}_reference.csv", index=False)
efm_profile.iloc[:, :4].to_csv(f"{outfile}/{name}_reference_ce.csv", index=False)


def determine_max_num_alleles(allele_heights):
Expand All @@ -199,6 +200,7 @@ def determine_max_num_alleles(allele_heights):


def STRmix_output(profile, outdir, profile_type, data_type):
Path(outdir).mkdir(parents=True, exist_ok=True)
if profile_type == "reference":
filtered_df = profile[profile.allele_type == "real_allele"]
else:
Expand All @@ -221,7 +223,7 @@ def STRmix_output(profile, outdir, profile_type, data_type):
for id in id_list:
sample_df = strmix_profile[strmix_profile["SampleID"] == id].reset_index(drop=True)
if profile_type == "evidence":
sample_df.iloc[:, 1:].to_csv(f"{outdir}/{id}_{data_type}.csv", index=False)
sample_df.iloc[:, 1:].to_csv(f"{outdir}/{id}_evidence_{data_type}.csv", index=False)
else:
reference_df = reference_table(sample_df, data_type)
reference_df.to_csv(f"{outdir}/{id}_reference_{data_type}.csv", index=False)
Expand Down Expand Up @@ -303,7 +305,7 @@ def main(args):
raise ValueError(f"unknown output type '{output_type}'")
full_df = pd.read_csv(args.input, sep="\t")
if args.out is None:
outpath = sys.stdout
raise ValueError("No output specified using --out.")
else:
outpath = args.out
if args.nofilters:
Expand All @@ -320,13 +322,7 @@ def main(args):
else:
STRmix_output(final_df, outpath, profile_type, data_type)
if args.info:
if outpath != sys.stdout:
if output_type == "efm":
outputname = outpath.replace(".csv", "_")
else:
outputname = f"{outpath}/"
final_df.to_csv(f"{outputname}sequence_info.csv", index=False)
if not flags_df.empty:
flags_df.to_csv(f"{outputname}Flagged_Loci.csv", index=False)
else:
raise ValueError("No outfile provided. Please specify --out to create info file.")
name = os.path.basename(outpath)
final_df.to_csv(f"{outpath}/{name}_sequence_info.csv", index=False)
if not flags_df.empty:
flags_df.to_csv(f"{outpath}/{name}_Flagged_Loci.csv", index=False)
37 changes: 13 additions & 24 deletions lusSTR/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,10 @@ def test_EFMoutput_format(tmp_path):
input_file = data_file("test_stutter.txt")
exp_out = data_file("RU_stutter_test/test_filtering_EFMoutput.csv")
exp_info_out = data_file("RU_stutter_test/test_filtering_EFMoutput_sequence_info.csv")
obs_out = str(tmp_path / "test_output.csv")
obs_info_out = str(tmp_path / "test_output_sequence_info.csv")
arglist = ["filter", "-o", obs_out, "--output-type", "efm", "--info", input_file]
output = str(tmp_path / "test_output")
obs_out = str(tmp_path / "test_output/test_output_evidence_ce.csv")
obs_info_out = str(tmp_path / "test_output/test_output_sequence_info.csv")
arglist = ["filter", "-o", output, "--output-type", "efm", "--info", input_file]
args = lusSTR.cli.get_parser().parse_args(arglist)
lusSTR.filter.main(args)
assert filecmp.cmp(exp_out, obs_out) is True
Expand All @@ -174,8 +175,8 @@ def test_STRmixoutput_format(outputdir, datatype, tmp_path):
exp_out = data_file(f"{outputdir}Sample1_{datatype}.csv")
exp_info_out = data_file(f"{outputdir}STRmix_Files_sequence_info.csv")
obs_outdir = str(tmp_path / "STRmix_Files")
obs_out = str(tmp_path / f"STRmix_Files/Sample1_{datatype}.csv")
obs_info_out = str(tmp_path / f"STRmix_Files/sequence_info.csv")
obs_out = str(tmp_path / f"STRmix_Files/Sample1_evidence_{datatype}.csv")
obs_info_out = str(tmp_path / f"STRmix_Files/STRmix_Files_sequence_info.csv")
arglist = [
"filter",
"-o",
Expand All @@ -193,23 +194,10 @@ def test_STRmixoutput_format(outputdir, datatype, tmp_path):
assert filecmp.cmp(exp_info_out, obs_info_out) is True


def test_stdout(capsys):
input_file = data_file("test_stutter.txt")
output = data_file("RU_stutter_test/test_filtering_EFMoutput.csv")
arglist = ["filter", "--output-type", "efm", input_file]
args = lusSTR.cli.get_parser().parse_args(arglist)
lusSTR.filter.main(args)
with open(output, "r") as fh:
exp_out = fh.read().strip()
terminal = capsys.readouterr()
obs_out = terminal.out.strip()
assert obs_out == exp_out


def test_nofilters(tmp_path):
input_file = data_file("test_stutter.txt")
exp_out = data_file("NGS_stutter_test/Sample1_nofilter.csv")
obs_out = str(tmp_path / "Sample1_ngs.csv")
obs_out = str(tmp_path / "Sample1_evidence_ngs.csv")
arglist = [
"filter",
"-o",
Expand All @@ -230,7 +218,7 @@ def test_flags(tmp_path):
input_file = data_file("test_stutter.txt")
exp_out = data_file("RU_stutter_test/Flagged_Loci.csv")
obs_outdir = str(tmp_path / "RU_stutter_test")
obs_out = str(tmp_path / "RU_stutter_test/Flagged_Loci.csv")
obs_out = str(tmp_path / "RU_stutter_test/RU_stutter_test_Flagged_Loci.csv")
arglist = ["filter", "-o", obs_outdir, "--output-type", "strmix", "--info", input_file]
args = lusSTR.cli.get_parser().parse_args(arglist)
lusSTR.filter.main(args)
Expand All @@ -240,8 +228,8 @@ def test_flags(tmp_path):
def test_efm_reference(tmp_path):
input_file = data_file("test_references.txt")
exp_out = data_file("RU_stutter_test/EFM_test_reference.csv")
obs_out = str(tmp_path / "test_output.csv")
obs_efm_out = str(tmp_path / "test_output_reference.csv")
obs_out = str(tmp_path / "test_output")
obs_efm_out = str(tmp_path / "test_output/test_output_reference_ce.csv")
arglist = [
"filter",
"-o",
Expand Down Expand Up @@ -284,8 +272,9 @@ def test_strmix_reference(outputdir, datatype, tmp_path):
def test_D7(tmp_path):
input_file = data_file("test_D7.txt")
exp_out = data_file("D7_microvariant_flagged.csv")
obs_out = str(tmp_path / "Flagged_Loci.csv")
arglist = ["filter", "-o", str(tmp_path), "--output-type", "strmix", "--info", input_file]
outpath = str(tmp_path / "test")
obs_out = str(tmp_path / "test/test_Flagged_Loci.csv")
arglist = ["filter", "-o", outpath, "--output-type", "strmix", "--info", input_file]
args = lusSTR.cli.get_parser().parse_args(arglist)
lusSTR.filter.main(args)
assert filecmp.cmp(exp_out, obs_out)
Expand Down
Binary file added lusSTR/workflow/.DS_Store
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope.

Binary file not shown.
121 changes: 121 additions & 0 deletions lusSTR/workflow/snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import glob
import openpyxl
import os
import pandas as pd
from pathlib import Path
import re


configfile: "config.yaml"
output_name = config["output"]
input_name = config["samp_input"]
software = config["output_type"]
prof = config["profile_type"]
data = config["data_type"]
filter_sep = config["filter_sep"]


def get_sample_IDs(input, uas, output, software, separate):
file_ext = ".xlsx" if uas is True else ".txt"
if software == "efm" and separate is False:
return os.path.basename(output)
else:
if uas is True:
if os.path.isdir(input):
files = glob.glob(os.path.join(input, f"[!~]*{file_ext}"))
else:
files = input
ID_list = get_uas_ids(files)
else:
if os.path.isdir(input):
files = glob.glob(os.path.join(input, f"[!~]*{file_ext}"))
else:
files = input
files = [sub.replace(dir, "") for sub in files]
ID_list = [sub.replace(file_ext, "") for sub in files]
return ID_list


def get_uas_ids(files):
samplelist = []
if isinstance(files, list):
for filename in sorted(files):
if "Sample Details" not in filename:
continue
sampleID = parse_sample_details(filename)
samplelist.append(sampleID)
else:
samplelist = parse_sample_details(files)
return samplelist


def parse_sample_details(filename):
file = openpyxl.load_workbook(filename)
file_sheet = file["Autosomal STRs"]
table = pd.DataFrame(file_sheet.values)
sampleID = re.sub(" ", "_", table.iloc[2, 1])
return sampleID


rule all:
input:
expand("{name}.csv", name=output_name),
expand("{name}.txt", name=output_name),
expand(
"{outdir}/{samplename}_{prof_t}_{data_t}.csv", outdir=output_name,
samplename=get_sample_IDs(input_name, config["uas"], output_name, software,
filter_sep), prof_t=prof, data_t=data
)


rule format:
input:
expand("{samp_input}", samp_input=input_name)
output:
expand("{name}.csv", name=output_name)
params:
uas="--uas" if config["uas"] is True else "",
sex="--include-sex" if config["sex"] is True else ""
shell:
"lusstr format '{input}' -o {output} {params.uas} {params.sex}"


rule annotate:
input:
rules.format.output
output:
expand("{name}.txt", name=output_name)
params:
uas="--uas" if config["uas"] is True else "",
sex="--include-sex" if config["sex"] is True else "",
combine="--nocombine" if config["nocombine"] is True else "",
separate="--separate" if config["separate"] is True else "",
kit=config["kit"]
shell:
"lusstr annotate {input} -o {output} --kit {params.kit} {params.uas} {params.sex} "
"{params.combine} {params.separate}"


rule filter:
input:
rules.annotate.output
output:
expand(
"{outdir}/{samplename}_{prof_t}_{data_t}.csv", outdir=output_name,
samplename=get_sample_IDs(input_name, config["uas"], output_name, software,
filter_sep), prof_t=prof, data_t=data
)
params:
output_type=config["output_type"],
profile_type=config["profile_type"],
data_type=config["data_type"],
output_dir=config["output"],
info="--info" if config["info"] is True else "",
filter_sep="--separate" if config["filter_sep"] is True else "",
filters="--no-filters" if config["nofilters"] is True else ""
shell:
"lusstr filter {input} -o {params.output_dir} --output-type {params.output_type} "
"--profile-type {params.profile_type} --data-type {params.data_type} {params.info} "
"{params.filters} {params.filter_sep}"