From 34727534fb29968a4c34a20513666506d99698a5 Mon Sep 17 00:00:00 2001
From: andrewjpage
Date: Mon, 31 Oct 2016 13:09:09 +0000
Subject: [PATCH] dont filter out identical sequences by default

---
 CHANGELOG                                   |  4 ++++
 VERSION                                     |  2 +-
 python/gubbins/PreProcessFasta.py           |  9 ++++++--
 python/gubbins/common.py                    |  2 +-
 .../tests/test_external_dependancies.py     |  1 +
 .../gubbins/tests/test_pre_process_fasta.py | 22 +++++++++++++++----
 python/scripts/run_gubbins.py               |  1 +
 7 files changed, 33 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 python/gubbins/tests/test_external_dependancies.py

diff --git a/CHANGELOG b/CHANGELOG
index 223b3988..91401a5b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+v2.2.0 - 31 Oct 2016
+------
+By default dont filter out sequences which are 100% identical.
+
 v2.1.0 - 22 July 2016
 ------
 Use GTRCAT model by default in RAxML instead of GTRGAMMA (massive speedup).
diff --git a/VERSION b/VERSION
index 7ec1d6db..ccbccc3d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.0
+2.2.0
diff --git a/python/gubbins/PreProcessFasta.py b/python/gubbins/PreProcessFasta.py
index 1dbbc857..5b5f98be 100644
--- a/python/gubbins/PreProcessFasta.py
+++ b/python/gubbins/PreProcessFasta.py
@@ -71,8 +71,13 @@ def taxa_of_duplicate_sequences(self):
 
         return taxa_to_remove
 
-    def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename):
-        taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
+    def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
+
+        taxa_to_remove = []
+        if remove_identical_sequences < 1:
+            taxa_to_remove = self.taxa_missing_too_much_data()
+        else:
+            taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
 
         with open(self.input_filename) as input_handle:
             with open(output_filename, "w+") as output_handle:
diff --git a/python/gubbins/common.py b/python/gubbins/common.py
index b1f2e4e7..c89fd7f5 100644
--- a/python/gubbins/common.py
+++ b/python/gubbins/common.py
@@ -151,7 +151,7 @@ def parse_and_run(self):
 
         temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
 
         pre_process_fasta = PreProcessFasta(self.args.alignment_filename,self.args.verbose,self.args.filter_percentage)
-        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename)
+        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename, self.args.remove_identical_sequences)
         self.args.alignment_filename = temp_working_dir+"/"+starting_base_filename
 
diff --git a/python/gubbins/tests/test_external_dependancies.py b/python/gubbins/tests/test_external_dependancies.py
old mode 100644
new mode 100755
index 6066171e..6711d3ba
--- a/python/gubbins/tests/test_external_dependancies.py
+++ b/python/gubbins/tests/test_external_dependancies.py
@@ -213,6 +213,7 @@ def base_arg_parse(self):
         parser.add_argument('--converge_method', '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]', default = 'weighted_robinson_foulds')
         parser.add_argument('--version', action='version', version=str(pkg_resources.get_distribution("gubbins").version))
         parser.add_argument('--raxml_model', '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT', default = 'GTRCAT')
+        parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
         return parser
 
     def default_arg_parse(self):
diff --git a/python/gubbins/tests/test_pre_process_fasta.py b/python/gubbins/tests/test_pre_process_fasta.py
index 10e5954f..07d5b8ac 100644
--- a/python/gubbins/tests/test_pre_process_fasta.py
+++ b/python/gubbins/tests/test_pre_process_fasta.py
@@ -28,7 +28,7 @@ def test_input_file_with_no_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),[])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'))
 
     def test_input_file_with_one_duplicate_sequences(self):
@@ -40,7 +40,7 @@ def test_input_file_with_one_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1'])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln'))
 
     def test_input_file_with_multiple_duplicate_sequences(self):
@@ -51,8 +51,22 @@ def test_input_file_with_multiple_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln'))
 
+
+
+    def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):
+        preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
+        self.assertEqual(preprocessfasta.hash_sequences(),
+                         {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+                          b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4']})
+
+        self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
+
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',0)
+        self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'))
+
+
     def test_input_file_with_all_duplicate_sequences(self):
         preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/all_same_sequence.aln')
@@ -68,7 +82,7 @@ def test_filter_out_alignments_with_too_much_missing_data(self):
 
         preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/missing_data.aln', False, 5)
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln','gubbins/tests/data/preprocessfasta/expected_missing_data.aln'))
 
     def tearDown(self):
diff --git a/python/scripts/run_gubbins.py b/python/scripts/run_gubbins.py
index 35fc07c7..809f2446 100755
--- a/python/scripts/run_gubbins.py
+++ b/python/scripts/run_gubbins.py
@@ -46,6 +46,7 @@
 parser.add_argument('--min_window_size', '-a', help='Minimum window size, default 100', type=int, default = 100)
 parser.add_argument('--max_window_size', '-b', help='Maximum window size, default 10000', type=int, default = 10000)
 parser.add_argument('--raxml_model', '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT', default = 'GTRCAT')
+parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
 
 gubbins_runner = common.GubbinsCommon(parser.parse_args())
 gubbins_runner.parse_and_run()
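
For context on how the changed method is meant to be called, here is a minimal usage sketch, assuming the gubbins.PreProcessFasta module layout implied by the file paths above; the alignment filename, output filename and the filter percentage of 25 are illustrative placeholders, not values taken from the patch.

    from gubbins.PreProcessFasta import PreProcessFasta

    # Arguments mirror the call in common.py: alignment filename, verbose flag,
    # missing-data filter percentage. The values here are placeholders.
    pre_process_fasta = PreProcessFasta('input.aln', False, 25)

    # New default (remove_identical_sequences = 0): only taxa missing too much
    # data are removed; 100% identical sequences are kept in the output.
    pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('filtered.aln', 0)

    # Any value >= 1 restores the previous behaviour and also drops duplicate
    # (100% identical) sequences, which is what the updated tests pass.
    pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('filtered.aln', 1)

From the command line the same behaviour is driven by the new --remove_identical_sequences (-d) switch; because it is declared with action='count' and default = 0, omitting it keeps identical sequences, while supplying it once sets the value to 1 and turns duplicate removal back on.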