nickjcroucher · andrewjpage · Oct 31, 2016 · Oct 31, 2016
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,7 @@
+v2.2.0 - 31 Oct 2016
+------
+By default, don't filter out sequences which are 100% identical.
+
 v2.1.0 - 22 July 2016
 ------
 Use GTRCAT model by default in RAxML instead of GTRGAMMA (massive speedup).

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.1.0
+2.2.0
diff --git a/python/gubbins/PreProcessFasta.py b/python/gubbins/PreProcessFasta.py
@@ -71,8 +71,13 @@ def taxa_of_duplicate_sequences(self):
 
      return taxa_to_remove
 
-  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename):
-      taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
+  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
+
+      taxa_to_remove = []
+      if remove_identical_sequences < 1:	  
+          taxa_to_remove = self.taxa_missing_too_much_data()
+      else:
+          taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
 
       with open(self.input_filename) as input_handle:
           with open(output_filename, "w+") as output_handle:

diff --git a/python/gubbins/common.py b/python/gubbins/common.py
@@ -151,7 +151,7 @@ def parse_and_run(self):
     temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
 
     pre_process_fasta = PreProcessFasta(self.args.alignment_filename,self.args.verbose,self.args.filter_percentage)
-    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename)
+    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename, self.args.remove_identical_sequences)
 
     self.args.alignment_filename = temp_working_dir+"/"+starting_base_filename
 

diff --git a/python/gubbins/tests/test_external_dependancies.py b/python/gubbins/tests/test_external_dependancies.py
@@ -213,6 +213,7 @@ def base_arg_parse(self):
       parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
       parser.add_argument('--version',                action='version', version=str(pkg_resources.get_distribution("gubbins").version))
       parser.add_argument('--raxml_model',      '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT',  default = 'GTRCAT')
+      parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
       return parser
 
   def default_arg_parse(self):

diff --git a/python/gubbins/tests/test_pre_process_fasta.py b/python/gubbins/tests/test_pre_process_fasta.py
@@ -28,7 +28,7 @@ def test_input_file_with_no_duplicate_sequences(self):
 
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),[])
 
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'))
 
   def test_input_file_with_one_duplicate_sequences(self):   
@@ -40,7 +40,7 @@ def test_input_file_with_one_duplicate_sequences(self):
 
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1'])
 
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln'))
 
   def test_input_file_with_multiple_duplicate_sequences(self):   
@@ -51,8 +51,22 @@ def test_input_file_with_multiple_duplicate_sequences(self):
 
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
 
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln'))
+
+
+  def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4']})
+
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
+
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',0)
+      self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'))    
+
+
 
   def test_input_file_with_all_duplicate_sequences(self):   
       preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/all_same_sequence.aln')
@@ -68,7 +82,7 @@ def test_input_file_with_all_duplicate_sequences(self):
 
   def test_filter_out_alignments_with_too_much_missing_data(self):
     preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/missing_data.aln', False, 5)
-    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
     self.assertTrue(filecmp.cmp('output.aln','gubbins/tests/data/preprocessfasta/expected_missing_data.aln'))        
 
   def tearDown(self):

diff --git a/python/scripts/run_gubbins.py b/python/scripts/run_gubbins.py
@@ -46,6 +46,7 @@
 parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
 parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
 parser.add_argument('--raxml_model',      '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT',  default = 'GTRCAT')
+parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
 
 gubbins_runner  = common.GubbinsCommon(parser.parse_args())
 gubbins_runner.parse_and_run()