Merge pull request #159 from phac-nml/fix/parsing

Ignore Underscores within Brackets of Plasmid FASTA Record ID
phac-nml · Oct 26, 2022 · 6e8ded8 · 6e8ded8
2 parents af89027 + 28039b6
commit 6e8ded8
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# Version 0.9.1
+
+* Fixed a bug that occurred when parsing some plasmid FASTA record IDs.
+
 # Version 0.9.0
 
 * Updates to PointFinder database handling

diff --git a/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py b/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py
@@ -27,7 +27,7 @@ def __init__(self, file: str, blast_record: pd.Series) -> None:
 
         logger.debug("record=%s", self._blast_record)
 
-        splitList = re.split('_', self.get_amr_gene_id())
+        splitList = re.split(r'_\s*(?![^()]*\))', self.get_amr_gene_id())
         re_search = list(filter(None, splitList))  # type: List[str]
 
         if not re_search:

diff --git a/staramr/tests/integration/data/plasmid-underscore-brackets.fsa b/staramr/tests/integration/data/plasmid-underscore-brackets.fsa
@@ -0,0 +1,18 @@
+>rep21_24_rep(CN1_plasmid2)_NC_022227
+ATGCAATATAATACTACTAGAAGTATAACCGAAAATCAAGATAATAAAACGTTAAAAGAT
+ATGACGAAAAGTGGGAAACAACGCCCATGGAGAGAAAAGAAAATAGATAATGTAAGCTAT
+GCAGATATACTAGAAATTTTAAAAATCAAAAAGGCTTTTAATGTAAAACAATGTGGTAAT
+ATTTTAGAATTTAAGCCAACTGATGAAGGTTATTTGAAGTTACATAAGACATGGTTTTGT
+AAATCAAAATTATGTCCGGTTTGTAATTGGAGACGTGCTATGAAAAATAGTTATCAAGCT
+CAAAAAGTGATTGAAGAAGTAATTAAGGAAAAGCCAAAAGCACGTTGGTTGTTTTTAACA
+CTTTCAACAAAAAATGCGATAGATGGAGATACTTTAGAACAAAGTTTGAAGCATCTAACT
+AAAGCATTTGATAGGTTGAGTAGATATAAAAAGGTTAAACAAAATCTTGTTGGATTTATG
+CGTTCAACAGAAGTTACCGTTAATAAAAATGACGGTAGTTATAATCAGCATATGCACGTT
+TTGTTATGTGTTGAAAATGCATATTTTAGAAAAAAAGAGAATTATATAACTCAAGAAGAA
+TGGGTTAATTTATGGCAAAGAGCATTGCAAGTTGATTATCGACCTGTTGCTAATATTAAA
+GCGATCAAACCAAATAAAAAAGGCGATAAAGATATTGAATCTGCAATCAAAGAGACCTCA
+AAATATTCGGTTAAATCATCTGATTTTTTAACTGATGATGATGAAAAAAATCAAGAAATT
+GTAAGTGATTTAGAAAAAGGTTTGTATCGAAAGCGTATGTTAAGTTATGGTGGATTGCTG
+AAACAAAAGCATAAAATTTTAAACTTAGACGATGCTGAAGATGGAAATTTGATTAATACA
+AGTGATGAAGATAAAACAACAGATGAAGAAGAAAAGGCACATTCGATTACGGCAATTTGG
+AATTTTGAAAAGCAAAATTATTATTTAAGGCATTAG
diff --git a/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py b/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py
@@ -199,5 +199,23 @@ def testIndexRangePlasmids(self):
                          msg='Wrong Predicted Phenotype value')
         self.assertEqual(summary_results['Plasmid'].iloc[0], 'IncFII(pKPX1)', msg='Wrong Plasmid Type')
 
+    def testParseUnderscoresBracketsInFASTA(self):
+        # Regression test: plasmid AMR detection must cope with FASTA record IDs that contain underscores
+        # inside brackets, for example "rep21_24_rep(CN1_plasmid2)_NC_022227". The "rep(CN1_plasmid2)" part
+        # has to be kept together as a single element rather than being split on its inner underscore.
+
+        file = path.join(self.test_data_dir, "plasmid-underscore-brackets.fsa")
+        files = [file]
+        self.amr_detection.run_amr_detection(files, 99, 90, 90, 90, 0, 0, 0, 0, 0)
+
+        summary_results = self.amr_detection.get_summary_results()
+
+        self.assertEqual(len(summary_results.index), 1, 'Wrong number of rows')
+
+        self.assertEqual(summary_results['Genotype'].iloc[0], 'None', msg='Wrong Genotype value')
+        self.assertEqual(summary_results['Predicted Phenotype'].iloc[0], 'Sensitive',
+                         msg='Wrong Predicted Phenotype value')
+        self.assertEqual(summary_results['Plasmid'].iloc[0], 'rep21', msg='Wrong Plasmid Type')
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py b/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py
@@ -64,3 +64,22 @@ def testParseSequenceId4(self):
                          'Did not parse correct gene name variant')
         self.assertEqual('IncFII(Serratia)_1_NC_009829', plasmid_hit_hsp.get_amr_gene_variant_accession(),
                          'Did not parse correct gene name variant accession')
+
+    def testParseUnderscoresBrackets(self):
+        # Regression test: PlasmidfinderHitHSP's init function must cope with FASTA record IDs that contain
+        # underscores inside brackets, for example "rep21_24_rep(CN1_plasmid2)_NC_022227". The
+        # "rep(CN1_plasmid2)" part has to be parsed as a single element, not split on its inner underscore.
+
+        test_blast_record = {"sstart": 20, "send": 30, "sstrand": "ABC", "qstart": 1, "qend": 10,
+                             'qseqid': 'rep21_24_rep(CN1_plasmid2)_NC_022227'}
+
+        plasmid_hit_hsp = PlasmidfinderHitHSP('test_file', test_blast_record)
+
+        self.assertEqual('rep21', plasmid_hit_hsp.get_amr_gene_name(), 'Did not parse correct gene name')
+        self.assertEqual('24', plasmid_hit_hsp.get_amr_gene_variant(), 'Did not parse correct gene variant')
+        self.assertEqual('rep21_24', plasmid_hit_hsp.get_amr_gene_name_with_variant(),
+                         'Did not parse correct gene name variant')
+        self.assertEqual('NC_022227', plasmid_hit_hsp.get_amr_gene_accession(),
+                         'Did not parse correct gene accession')
+        self.assertEqual('rep21_24_NC_022227', plasmid_hit_hsp.get_amr_gene_variant_accession(),
+                         'Did not parse correct gene name variant accession')