From dbceb5ae955d19d7d587c9765e06375457681e76 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 1 Dec 2023 14:34:54 -0500 Subject: [PATCH 1/6] added MPSproto as possible output file type [skip ci] --- lusSTR/cli/config.py | 12 ++++++++---- lusSTR/wrappers/filter.py | 21 +++++++++++++-------- setup.py | 3 ++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/lusSTR/cli/config.py b/lusSTR/cli/config.py index a972c847..ad51049f 100644 --- a/lusSTR/cli/config.py +++ b/lusSTR/cli/config.py @@ -89,8 +89,8 @@ def edit_str_config(config, args): data["profile_type"] = "reference" if args.datatype: data["data_type"] = args.datatype - if args.efm: - data["output_type"] = "efm" + if args.software: + data["output_type"] = args.software if args.strand: data["strand"] = args.strand return data @@ -126,11 +126,15 @@ def subparser(subparsers): "--reference", action="store_true", help="Use for creating Reference profiles for STR workflow" ) - p.add_argument("--efm", action="store_true",help="Use to create EuroForMix profiles") + p.add_argument( + "--software", choices=["efm", "mpsproto", "strmix"], default="strmix", + help="Specify the probabilistic genotyping software package of choice. The final output" + " files will be in the correct format for direct use. Default is strmix." + ) p.add_argument( "--str-type", choices=["ce", "ngs", "lusplus"], default="ngs", dest="datatype", help="Data type for STRs. Options are: CE allele ('ce'), sequence " - "('ngs'), or LUS+ allele ('lusplus'). Default is 'ngs'.", + "or bracketed sequence form('ngs'), or LUS+ allele ('lusplus'). Default is 'ngs'.", ) p.add_argument( "--noinfo", action="store_true", diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 790b58f3..0ccc3c62 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -118,23 +118,25 @@ def process_strs(dict_loc, datatype, seq_col): return final_df, flags_df -def EFM_output(profile, outfile, profile_type, data_type, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, col, separate=False): if profile_type == "reference": profile = profile[profile.allele_type == "real_allele"] else: profile = profile[profile.allele_type != "BelowAT"] - efm_profile = populate_efm_profile(profile, data_type) + efm_profile = populate_efm_profile(profile, data_type, col) if separate: write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outfile) else: write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile) -def populate_efm_profile(profile, data_type): +def populate_efm_profile(profile, data_type, colname): if data_type == "ce": prof_col = "CE_Allele" elif data_type == "lusplus": prof_col = "LUS_Plus" + elif data_type == "ngs": + prof_col = colname else: message = ( f"Incorrect data type {data_type} specified for EFM. Please choose either " @@ -328,7 +330,7 @@ def main( raise ValueError(f"unknown profile type '{profile_type}'") if data_type not in ("ce", "ngs", "lusplus"): raise ValueError(f"unknown data type '{data_type}'") - if output_type not in ("efm", "strmix"): + if output_type not in ("efm", "strmix", "mpsproto"): raise ValueError(f"unknown output type '{output_type}'") full_df = pd.read_csv(input, sep="\t") if output_dir is None: @@ -336,17 +338,20 @@ def main( else: outpath = output_dir seq_col = "UAS_Output_Sequence" if strand == "uas" else "Forward_Strand_Sequence" + brack_col = ( + "UAS_Output_Bracketed_Notation" if strand == "uas" else "Forward_Strand_Bracketed_Form" + ) if nofilters: full_df["allele_type"] = "real_allele" - if output_type == "efm": - EFM_output(full_df, outpath, profile_type, data_type, separate) + if output_type == "efm" or output_type == "mpsproto": + EFM_output(full_df, outpath, profile_type, data_type, brack_col, separate) else: STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} final_df, flags_df = process_strs(dict_loc, data_type, seq_col) - if output_type == "efm": - EFM_output(final_df, outpath, profile_type, data_type, separate) + if output_type == "efm" or output_type == "mpsproto": + EFM_output(final_df, outpath, profile_type, data_type, brack_col, separate) else: STRmix_output(final_df, outpath, profile_type, data_type, seq_col) if info: diff --git a/setup.py b/setup.py index 4b5c79af..836d3836 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ "lusSTR/tests/data/NGS_stutter_test/*", "lusSTR/tests/data/kinsnps/*", "lusSTR/tests/data/lusstr_output/*", - "lusSTR/tests/data/LUSPlus_stutter_test/*" "lusSTR/workflows/*", + "lusSTR/tests/data/LUSPlus_stutter_test/*", + "lusSTR/workflows/*", "lusSTR/wrappers/*", ] }, From c3113e0295e1ae03cbed394f0767d1e0d4368ccd Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 6 Dec 2023 09:14:08 -0500 Subject: [PATCH 2/6] updated snakemake file [skip ci] --- lusSTR/workflows/strs.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index c9955579..86f7a526 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -19,7 +19,7 @@ separate = config["separate"] def get_sample_IDs(input, uas, output, software, separate): convert_out = f"{output}.txt" format_out = f"{output}.csv" - if software == "efm" and separate is False: + if (software == "efm" or software == "mpsproto") and separate is False: ID_list = os.path.basename(output) elif os.path.exists(convert_out): ID_list = get_existing_IDs(convert_out, "\t") From 19ed704e7d931e179d6b247970a01f8238b2ae79 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 6 Dec 2023 09:30:29 -0500 Subject: [PATCH 3/6] added tests for MPSproto [skip ci] --- .../MPSProto_test/EFM_test_reference_ngs.csv | 28 +++++++++++++++++++ .../test_filtering_EFMoutput_ngs.csv | 28 +++++++++++++++++++ ...test_filtering_EFMoutput_sequence_info.csv | 26 +++++++++++++++++ lusSTR/tests/test_filters.py | 24 ++++++++++++---- 4 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv create mode 100644 lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv create mode 100644 lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv diff --git a/lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv b/lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv new file mode 100644 index 00000000..a5094a1b --- /dev/null +++ b/lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv @@ -0,0 +1,28 @@ +SampleName,Marker,Allele1,Allele2 +Positive_Control,CSF1PO,[AGAT]12,[AGAT]12 +Positive_Control,D10S1248,[GGAA]13,[GGAA]15 +Positive_Control,D12S391,[AGAT]11 [AGAC]6 AGAT,[AGAT]14 [AGAC]9 +Positive_Control,D13S317,[TATC]12 AATC [ATCT]3 TTCT GTCT GTC,[TATC]9 [AATC]2 [ATCT]3 TTCT GTCT GTC +Positive_Control,D16S539,[GATA]13,[GATA]9 +Positive_Control,D17S1301,[AGAT]11,[AGAT]12 +Positive_Control,D18S51,[AGAA]16 AAAG AGAG AG,[AGAA]18 AAAG AGAG AG +Positive_Control,D19S433,AAGG AAAG AAGG TAGG [AAGG]11 AGAG AGGA AGAA AGAG AG,AAGG AAAG AAGG TAGG [AAGG]12 AGAG AGGA AGAA AGAG AG +Positive_Control,D1S1656,[TAGA]11 TAGG [TGTG]2 TG,[TAGA]13 [TGTG]2 TG +Positive_Control,D20S482,[AGAT]14,[AGAT]15 +Positive_Control,D21S11,[TCTA]4 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11,[TCTA]5 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 TA TCTA +Positive_Control,D22S1045,[ATT]13 ACT [ATT]2,[ATT]13 ACT [ATT]2 +Positive_Control,D2S1338,[TGCC]7 [TTCC]12 GTCC [TTCC]2,[TGCC]7 [TTCC]15 GTCC [TTCC]2 +Positive_Control,D2S441,[TCTA]10,[TCTA]11 TTTA [TCTA]2 +Positive_Control,D3S1358,TCTA [TCTG]3 [TCTA]13,TCTA [TCTG]3 [TCTA]14 +Positive_Control,D4S2408,[ATCT]9,[ATCT]9 +Positive_Control,D5S818,[AGAT]12 AGAG,[AGAT]12 AGAG +Positive_Control,D6S1043,[AGAT]12,[AGAT]14 ACAT [AGAT]5 +Positive_Control,D7S820,[GATA]11 GACA GATT GATA GTTT,[GATA]8 GACA GATT GATA GTTT +Positive_Control,D8S1179,TCTA TCTG [TCTA]12,[TCTA]2 TCTG [TCTA]12 +Positive_Control,D9S1122,TAGA TCGA [TAGA]10,[TAGA]12 +Positive_Control,FGA,[TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2,[TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2 +Positive_Control,PENTA D,AAAAG [AAAGA]12,AAAAG [AAAGA]13 +Positive_Control,PENTA E,[AAAGA]14,[AAAGA]7 +Positive_Control,TH01,[AATG]6,[AATG]6 ATG [AATG]3 +Positive_Control,TPOX,[AATG]11,[AATG]11 +Positive_Control,VWA,TCTA [TCTG]3 [TCTA]12 TCCA TCTA,TCTA [TCTG]4 [TCTA]14 TCCA TCTA diff --git a/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv b/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv new file mode 100644 index 00000000..ef06e8a2 --- /dev/null +++ b/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv @@ -0,0 +1,28 @@ +SampleName,Marker,Allele1,Allele2,Allele3,Allele4,Height1,Height2,Height3,Height4 +Sample1,CSF1PO,,,,,,,, +Sample1,D10S1248,,,,,,,, +Sample1,D12S391,,,,,,,, +Sample1,D13S317,,,,,,,, +Sample1,D16S539,,,,,,,, +Sample1,D17S1301,,,,,,,, +Sample1,D18S51,,,,,,,, +Sample1,D19S433,,,,,,,, +Sample1,D1S1656,,,,,,,, +Sample1,D20S482,,,,,,,, +Sample1,D21S11,,,,,,,, +Sample1,D22S1045,,,,,,,, +Sample1,D2S1338,,,,,,,, +Sample1,D2S441,,,,,,,, +Sample1,D3S1358,,,,,,,, +Sample1,D4S2408,[ATCT]10,[ATCT]8,[ATCT]9,,900,1000,1357, +Sample1,D5S818,,,,,,,, +Sample1,D6S1043,,,,,,,, +Sample1,D7S820,,,,,,,, +Sample1,D8S1179,TCTA TCTG [TCTA]11,[TCTA]2 TCTG [TCTA]10,[TCTA]2 TCTG [TCTA]11,[TCTA]2 TCTG [TCTA]9,95,89,739,26 +Sample1,D9S1122,TAGA TCGA [TAGA]10,TAGA TCGA [TAGA]11,[TAGA]10,[TAGA]11,108,948,87,991 +Sample1,FGA,[TTTC]3 TTTT TTCT [CTTT]10 CTCC [TTCC]2,[TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2,[TTTC]3 TTTT TTCT [CTTT]14 CTCC [TTCC]2,[TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2,181,1750,262,1436 +Sample1,PENTA D,AAAAG [AAAGA]13,,,,1000,,, +Sample1,PENTA E,[AAAGA]7,,,,505,,, +Sample1,TH01,[AATG]6,[AATG]7,,,1632,2197,, +Sample1,TPOX,,,,,,,, +Sample1,VWA,,,,,,,, diff --git a/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv b/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv new file mode 100644 index 00000000..6d5ac112 --- /dev/null +++ b/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv @@ -0,0 +1,26 @@ +SampleID,Locus,UAS_Output_Sequence,CE_Allele,UAS_Output_Bracketed_Notation,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter +Sample1,D4S2408,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,10.0,[ATCT]10,900,real_allele,,,,,, +Sample1,D4S2408,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,9.0,[ATCT]9,1357,real_allele,,,,,, +Sample1,D4S2408,ATCTATCTATCTATCTATCTATCTATCTATCT,8.0,[ATCT]8,1000,real_allele,,,,,, +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,14.0,[TCTA]2 TCTG [TCTA]11,739,real_allele,,,,,, +Sample1,D8S1179,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,13.0,TCTA TCTG [TCTA]11,95,-1_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.129 +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,13.0,[TCTA]2 TCTG [TCTA]10,89,-1_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.12 +Sample1,D8S1179,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,12.0,[TCTA]2 TCTG [TCTA]9,26,-2_stutter,[TCTA]2 TCTG [TCTA]11,,739.0,,,0.035 +Sample1,D8S1179,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,12.0,TCTA TCTG [TCTA]10,11,BelowAT,,,,,0.01, +Sample1,D9S1122,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,13.0,TAGA TCGA [TAGA]11,948,real_allele,,,,,, +Sample1,D9S1122,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,12.0,TAGA TCGA [TAGA]10,108,-1_stutter,TAGA TCGA [TAGA]11,,948.0,,,0.114 +Sample1,D9S1122,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,11.0,[TAGA]11,991,real_allele,,,,,, +Sample1,D9S1122,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,10.0,[TAGA]10,87,-1_stutter,[TAGA]11,,991.0,,,0.088 +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,23.0,[TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2,1436,real_allele,,,,,, +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,22.0,[TTTC]3 TTTT TTCT [CTTT]14 CTCC [TTCC]2,262,-1_stutter,[TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2,,1436.0,,,0.182 +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,21.0,[TTTC]3 TTTT TTCT [CTTT]13 CTCC [TTCC]2,48,BelowAT,,,,,0.013, +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,20.0,[TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2,1750,real_allele,,,,,, +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,18.0,[TTTC]3 TTTT TTCT [CTTT]10 CTCC [TTCC]2,181,real_allele,,,,,, +Sample1,FGA,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,17.0,[TTTC]3 TTTT TTCT [CTTT]9 CTCC [TTCC]2,15,BelowAT,,,,,0.004, +Sample1,PENTA D,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,15.0,AAAAG [AAAGA]13,50,real_allele,,,,,, +Sample1,PENTA D,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,13.0,AAAAG [AAAGA]13,1000,real_allele,,,,,, +Sample1,PENTA E,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,7.0,[AAAGA]7,505,real_allele,,,,,, +Sample1,TH01,AATGAATGAATGAATGAATGAATGAATG,7.0,[AATG]7,2197,real_allele,,,,,, +Sample1,TH01,AATGAATGAATGAATGAATGAATG,6.0,[AATG]6,1632,real_allele,,,,,, +Sample1,TH01,AATGAATGAATGAATGAATG,5.0,[AATG]5,66,BelowAT,,,,,0.017, +Sample1,TPOX,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,11.0,[AATG]11,15,BelowAT,,,,,1.0, diff --git a/lusSTR/tests/test_filters.py b/lusSTR/tests/test_filters.py index d5a85681..4b547664 100644 --- a/lusSTR/tests/test_filters.py +++ b/lusSTR/tests/test_filters.py @@ -158,9 +158,14 @@ def test_plus1stutter( @pytest.mark.parametrize( - "outputdir, datatype", [("RU_stutter_test/", "ce"), ("LUSPlus_stutter_test/", "lusplus")] + "outputdir, datatype, software", + [ + ("RU_stutter_test/", "ce", "efm"), + ("LUSPlus_stutter_test/", "lusplus", "efm"), + ("MPSProto_test/", "ngs", "mpsproto"), + ], ) -def test_EFMoutput_format(outputdir, datatype, tmp_path): +def test_EFMoutput_format(outputdir, datatype, software, tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_stutter.txt") exp_out = data_file(f"{outputdir}test_filtering_EFMoutput_{datatype}.csv") @@ -173,7 +178,8 @@ def test_EFMoutput_format(outputdir, datatype, tmp_path): str_path, "-o", "test_output", - "--efm", + "--software", + software, "--str-type", datatype, "--input", @@ -246,9 +252,14 @@ def test_flags(tmp_path): @pytest.mark.parametrize( - "outputdir, datatype", [("RU_stutter_test/", "ce"), ("LUSPlus_stutter_test/", "lusplus")] + "outputdir, datatype, software", + [ + ("RU_stutter_test/", "ce", "efm"), + ("LUSPlus_stutter_test/", "lusplus", "efm"), + ("MPSProto_test/", "ngs", "mpsproto"), + ], ) -def test_efm_reference(outputdir, datatype, tmp_path): +def test_efm_reference(outputdir, datatype, software, tmp_path): str_path = str(tmp_path / "WD") inputfile = data_file("test_references.txt") exp_out = data_file(f"{outputdir}EFM_test_reference_{datatype}.csv") @@ -259,7 +270,8 @@ def test_efm_reference(outputdir, datatype, tmp_path): str_path, "--input", "WD", - "--efm", + "--software", + software, "--reference", "--str-type", datatype, From 0cccd3d282ff9f679706327a6c6fb3c4d8b893f5 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 6 Dec 2023 09:33:40 -0500 Subject: [PATCH 4/6] updated setup.py and manifest [skip ci] --- MANIFEST.in | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 21e04336..87da302c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,3 +12,4 @@ include lusSTR/tests/data/NGS_stutter_test/* include lusSTR/tests/data/kinsnps/* include lusSTR/tests/data/lusstr_output/* include lusSTR/tests/data/LUSPlus_stutter_test/* +include lusSTR/tests/data/MPSProto_test/* diff --git a/setup.py b/setup.py index 836d3836..43c6f4da 100755 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ "lusSTR/tests/data/kinsnps/*", "lusSTR/tests/data/lusstr_output/*", "lusSTR/tests/data/LUSPlus_stutter_test/*", + "lusSTR/tests/data/MPSProto_test/*", "lusSTR/workflows/*", "lusSTR/wrappers/*", ] From 2e70d0097450e9c501a2103dddee47abe40bc121 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 6 Dec 2023 09:56:04 -0500 Subject: [PATCH 5/6] updated tests and README --- README.md | 10 +++++----- .../EFM_test_reference_ngs.csv | 0 .../test_filtering_EFMoutput_ngs.csv | 0 .../test_filtering_EFMoutput_sequence_info.csv | 0 lusSTR/tests/test_filters.py | 7 ++++--- 5 files changed, 9 insertions(+), 8 deletions(-) rename lusSTR/tests/data/{MPSProto_test => MPSproto_test}/EFM_test_reference_ngs.csv (100%) rename lusSTR/tests/data/{MPSProto_test => MPSproto_test}/test_filtering_EFMoutput_ngs.csv (100%) rename lusSTR/tests/data/{MPSProto_test => MPSproto_test}/test_filtering_EFMoutput_sequence_info.csv (100%) diff --git a/README.md b/README.md index eed6814e..0d1c9e82 100755 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ lusSTR is a tool written in Python to convert NGS sequence data of forensic STR loci to different sequence representations (sequence bracketed form) and allele designations (CE allele, LUS/LUS+ alleles) for ease in downstream analyses. See the below section ```Converting STR sequences to other sequence representations and allele designations``` for more information. -Further, lusSTR can perform filtering and stutter identification using the CE allele, the LUS+ allele, or the bracketed sequence form for autosomal loci and create files for direct input into two probabilistic genotyping software packages, EuroForMix (CE and LUS+) and STRmix (CE and NGS). +Further, lusSTR can perform filtering and stutter identification using the CE allele, the LUS+ allele, or the bracketed sequence form for autosomal loci and create files for direct input into three probabilistic genotyping software packages, EuroForMix (CE and LUS+), MPSproto (NGS), and STRmix (CE and NGS). lusSTR also processes SNP data from the Verogen ForenSeq and Kintelligence panels and create evidence and/or reference files for use in EFM. See the below section ```SNP Data Processing``` for more information. @@ -65,12 +65,12 @@ kit: ```forenseq``` (forenseq/powerseq) (invoke the ```--powerseq``` flag if usi nocombine: ```False``` (True/False); do not combine identical sequences during the ```convert``` step, if using STRait Razor data. (invoke the ```--nocombine``` flag) ### filter settings -output_type: ```strmix``` (strmix/efm) (invoke ```--efm``` flag if creating output for EuroForMix) +output_type: ```strmix``` (strmix/efm/mpsproto) (indicate using the ```--software``` flag) profile_type: ```evidence``` (evidence/reference) (invoke ```--reference``` flag if creating a reference output file) data_type: ```ngs``` (ce/ngs/lusplus) (indicate using the ```--str-type```) info: ```True``` (True/False); create allele information file (invoke ```--noinfo``` flag to not create the allele information file) -separate: ```False``` (True/False); for EFM only, if True will create individual files for samples; if False, will create one file with all samples (invoke ```--separate``` flag to separate EFM output files) -nofilters: ```False``` (True/False); skip all filtering steps but still creates EFM/STRmix output files (invoke ```--nofilters``` flag) +separate: ```False``` (True/False); for EFM/MPSproto only, if True will create individual files for samples; if False, will create one file with all samples (invoke ```--separate``` flag to separate EFM/MPSproto output files) +nofilters: ```False``` (True/False); skip all filtering steps but still creates EFM/MPSproto/STRmix output files (invoke ```--nofilters``` flag) strand: ```uas``` (uas/forward); indicates the strand orientation in which to report the sequence in the final output table for STRmix NGS only (indicate using ```--strand```) One additional argument can be provided with ```lusstr config```: @@ -189,7 +189,7 @@ Each locus is checked for containing greater than 2 alleles (indicating a potent When using STRmix data, the data type can be specified using the ```data-type``` setting as either ```ce```, ```ngs``` or ```lusplus``` (default is ```ngs```). If ```ngs``` or ```lusplus``` is specified, the same size filter is applied following the stutter filter. Further, the columns and column names in the output file differ based on the data type. -Finally, output files are created for direct use in EuroForMix (EFM) or STRmix. If EFM is specified, a single file is created containing all samples in the input file (however, separate output files for each sample can be created with the ```separate``` setting specified in the config file). If STRmix is specified, a directory containing files for each individual sample is created. The ```profile-type``` setting allows for the creation of either a ```reference``` or ```evidence``` profile. Both EuroForMix and STRmix require different formatting depending on the type of sample. +Finally, output files are created for direct use in EuroForMix (EFM), MPSproto or STRmix. If EFM or MPSproto is specified, a single file is created containing all samples in the input file (however, separate output files for each sample can be created with the ```separate``` setting specified in the config file). If STRmix is specified, a directory containing files for each individual sample is created. The ```profile-type``` setting allows for the creation of either a ```reference``` or ```evidence``` profile. Both EuroForMix/MPSproto and STRmix require different formatting depending on the type of sample. ___ diff --git a/lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv b/lusSTR/tests/data/MPSproto_test/EFM_test_reference_ngs.csv similarity index 100% rename from lusSTR/tests/data/MPSProto_test/EFM_test_reference_ngs.csv rename to lusSTR/tests/data/MPSproto_test/EFM_test_reference_ngs.csv diff --git a/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv b/lusSTR/tests/data/MPSproto_test/test_filtering_EFMoutput_ngs.csv similarity index 100% rename from lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_ngs.csv rename to lusSTR/tests/data/MPSproto_test/test_filtering_EFMoutput_ngs.csv diff --git a/lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv b/lusSTR/tests/data/MPSproto_test/test_filtering_EFMoutput_sequence_info.csv similarity index 100% rename from lusSTR/tests/data/MPSProto_test/test_filtering_EFMoutput_sequence_info.csv rename to lusSTR/tests/data/MPSproto_test/test_filtering_EFMoutput_sequence_info.csv diff --git a/lusSTR/tests/test_filters.py b/lusSTR/tests/test_filters.py index 4b547664..195f0e70 100644 --- a/lusSTR/tests/test_filters.py +++ b/lusSTR/tests/test_filters.py @@ -162,7 +162,7 @@ def test_plus1stutter( [ ("RU_stutter_test/", "ce", "efm"), ("LUSPlus_stutter_test/", "lusplus", "efm"), - ("MPSProto_test/", "ngs", "mpsproto"), + ("MPSproto_test/", "ngs", "mpsproto"), ], ) def test_EFMoutput_format(outputdir, datatype, software, tmp_path): @@ -256,7 +256,7 @@ def test_flags(tmp_path): [ ("RU_stutter_test/", "ce", "efm"), ("LUSPlus_stutter_test/", "lusplus", "efm"), - ("MPSProto_test/", "ngs", "mpsproto"), + ("MPSproto_test/", "ngs", "mpsproto"), ], ) def test_efm_reference(outputdir, datatype, software, tmp_path): @@ -404,7 +404,8 @@ def test_lusplus_sequence_info(tmp_path): "forward", "--str-type", "lusplus", - "--efm", + "--software", + "efm", ] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) shutil.copyfile(inputfile, os.path.join(str_path, "LUSPlus.csv")) From 60ab0e5ab8fd1ff0ce2412a6b26b7511d8db88db Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 6 Dec 2023 10:45:24 -0500 Subject: [PATCH 6/6] fixed manifest and setup.py --- MANIFEST.in | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 87da302c..acbf06e2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,4 +12,4 @@ include lusSTR/tests/data/NGS_stutter_test/* include lusSTR/tests/data/kinsnps/* include lusSTR/tests/data/lusstr_output/* include lusSTR/tests/data/LUSPlus_stutter_test/* -include lusSTR/tests/data/MPSProto_test/* +include lusSTR/tests/data/MPSproto_test/* diff --git a/setup.py b/setup.py index 43c6f4da..58ea49d0 100755 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "lusSTR/tests/data/kinsnps/*", "lusSTR/tests/data/lusstr_output/*", "lusSTR/tests/data/LUSPlus_stutter_test/*", - "lusSTR/tests/data/MPSProto_test/*", + "lusSTR/tests/data/MPSproto_test/*", "lusSTR/workflows/*", "lusSTR/wrappers/*", ]