From 34727534fb29968a4c34a20513666506d99698a5 Mon Sep 17 00:00:00 2001
From: andrewjpage
Date: Mon, 31 Oct 2016 13:09:09 +0000
Subject: [PATCH] dont filter out identical sequences by default

---
 CHANGELOG                                   |  4 ++++
 VERSION                                     |  2 +-
 python/gubbins/PreProcessFasta.py           |  9 ++++++--
 python/gubbins/common.py                    |  2 +-
 .../tests/test_external_dependancies.py     |  1 +
 .../gubbins/tests/test_pre_process_fasta.py | 22 +++++++++++++++----
 python/scripts/run_gubbins.py               |  1 +
 7 files changed, 33 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 python/gubbins/tests/test_external_dependancies.py

diff --git a/CHANGELOG b/CHANGELOG
index 223b3988..91401a5b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+v2.2.0 - 31 Oct 2016
+------
+By default dont filter out sequences which are 100% identical.
+
 v2.1.0 - 22 July 2016
 ------
 Use GTRCAT model by default in RAxML instead of GTRGAMMA (massive speedup).
diff --git a/VERSION b/VERSION
index 7ec1d6db..ccbccc3d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.0
+2.2.0
diff --git a/python/gubbins/PreProcessFasta.py b/python/gubbins/PreProcessFasta.py
index 1dbbc857..5b5f98be 100644
--- a/python/gubbins/PreProcessFasta.py
+++ b/python/gubbins/PreProcessFasta.py
@@ -71,8 +71,13 @@ def taxa_of_duplicate_sequences(self):
 
         return taxa_to_remove
 
-    def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename):
-        taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
+    def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
+
+        taxa_to_remove = []
+        if remove_identical_sequences < 1:
+            taxa_to_remove = self.taxa_missing_too_much_data()
+        else:
+            taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
 
         with open(self.input_filename) as input_handle:
             with open(output_filename, "w+") as output_handle:
diff --git a/python/gubbins/common.py b/python/gubbins/common.py
index b1f2e4e7..c89fd7f5 100644
--- a/python/gubbins/common.py
+++ b/python/gubbins/common.py
@@ -151,7 +151,7 @@ def parse_and_run(self):
 
         temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
 
         pre_process_fasta = PreProcessFasta(self.args.alignment_filename,self.args.verbose,self.args.filter_percentage)
-        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename)
+        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename, self.args.remove_identical_sequences)
         self.args.alignment_filename = temp_working_dir+"/"+starting_base_filename
 
diff --git a/python/gubbins/tests/test_external_dependancies.py b/python/gubbins/tests/test_external_dependancies.py
old mode 100644
new mode 100755
index 6066171e..6711d3ba
--- a/python/gubbins/tests/test_external_dependancies.py
+++ b/python/gubbins/tests/test_external_dependancies.py
@@ -213,6 +213,7 @@ def base_arg_parse(self):
         parser.add_argument('--converge_method', '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]', default = 'weighted_robinson_foulds')
         parser.add_argument('--version', action='version', version=str(pkg_resources.get_distribution("gubbins").version))
         parser.add_argument('--raxml_model', '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT', default = 'GTRCAT')
+        parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
         return parser
 
     def default_arg_parse(self):
diff --git a/python/gubbins/tests/test_pre_process_fasta.py b/python/gubbins/tests/test_pre_process_fasta.py
index 10e5954f..07d5b8ac 100644
--- a/python/gubbins/tests/test_pre_process_fasta.py
+++ b/python/gubbins/tests/test_pre_process_fasta.py
@@ -28,7 +28,7 @@ def test_input_file_with_no_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),[])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'))
 
     def test_input_file_with_one_duplicate_sequences(self):
@@ -40,7 +40,7 @@ def test_input_file_with_one_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1'])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln'))
 
     def test_input_file_with_multiple_duplicate_sequences(self):
@@ -51,8 +51,22 @@ def test_input_file_with_multiple_duplicate_sequences(self):
 
         self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln'))
 
+
+
+    def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):
+        preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
+        self.assertEqual(preprocessfasta.hash_sequences(),
+                         {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+                          b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4']})
+
+        self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
+
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',0)
+        self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'))
+
+
     def test_input_file_with_all_duplicate_sequences(self):
         preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/all_same_sequence.aln')
@@ -68,7 +82,7 @@ def test_filter_out_alignments_with_too_much_missing_data(self):
 
         preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/missing_data.aln', False, 5)
 
-        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+        preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
         self.assertTrue(filecmp.cmp('output.aln','gubbins/tests/data/preprocessfasta/expected_missing_data.aln'))
 
     def tearDown(self):
diff --git a/python/scripts/run_gubbins.py b/python/scripts/run_gubbins.py
index 35fc07c7..809f2446 100755
--- a/python/scripts/run_gubbins.py
+++ b/python/scripts/run_gubbins.py
@@ -46,6 +46,7 @@
 parser.add_argument('--min_window_size', '-a', help='Minimum window size, default 100', type=int, default = 100)
 parser.add_argument('--max_window_size', '-b', help='Maximum window size, default 10000', type=int, default = 10000)
 parser.add_argument('--raxml_model', '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT', default = 'GTRCAT')
+parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
 
 gubbins_runner = common.GubbinsCommon(parser.parse_args())
 gubbins_runner.parse_and_run()
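
For context on how the changed method is meant to be called, here is a minimal usage sketch, assuming the gubbins.PreProcessFasta module layout implied by the file paths above; the alignment filename, output filename and the filter percentage of 25 are illustrative placeholders, not values taken from the patch.

    from gubbins.PreProcessFasta import PreProcessFasta

    # Arguments mirror the call in common.py: alignment filename, verbose flag,
    # missing-data filter percentage. The values here are placeholders.
    pre_process_fasta = PreProcessFasta('input.aln', False, 25)

    # New default (remove_identical_sequences = 0): only taxa missing too much
    # data are removed; 100% identical sequences are kept in the output.
    pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('filtered.aln', 0)

    # Any value >= 1 restores the previous behaviour and also drops duplicate
    # (100% identical) sequences, which is what the updated tests pass.
    pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('filtered.aln', 1)

From the command line the same behaviour is driven by the new --remove_identical_sequences (-d) switch; because it is declared with action='count' and default = 0, omitting it keeps identical sequences, while supplying it once sets the value to 1 and turns duplicate removal back on.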