diff --git a/CHANGELOG.md b/CHANGELOG.md index d4083f36..44628900 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# Version 0.9.1 + +* Fixed a bug that occurred when parsing some plasmid FASTA record IDs. + # Version 0.9.0 * Updates to PointFinder database handling diff --git a/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py b/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py index c4d7da1f..78d4bc7a 100644 --- a/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py +++ b/staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py @@ -27,7 +27,7 @@ def __init__(self, file: str, blast_record: pd.Series) -> None: logger.debug("record=%s", self._blast_record) - splitList = re.split('_', self.get_amr_gene_id()) + splitList = re.split(r'_\s*(?![^()]*\))', self.get_amr_gene_id()) re_search = list(filter(None, splitList)) # type: List[str] if not re_search: diff --git a/staramr/tests/integration/data/plasmid-underscore-brackets.fsa b/staramr/tests/integration/data/plasmid-underscore-brackets.fsa new file mode 100644 index 00000000..98aae656 --- /dev/null +++ b/staramr/tests/integration/data/plasmid-underscore-brackets.fsa @@ -0,0 +1,18 @@ +>rep21_24_rep(CN1_plasmid2)_NC_022227 +ATGCAATATAATACTACTAGAAGTATAACCGAAAATCAAGATAATAAAACGTTAAAAGAT +ATGACGAAAAGTGGGAAACAACGCCCATGGAGAGAAAAGAAAATAGATAATGTAAGCTAT +GCAGATATACTAGAAATTTTAAAAATCAAAAAGGCTTTTAATGTAAAACAATGTGGTAAT +ATTTTAGAATTTAAGCCAACTGATGAAGGTTATTTGAAGTTACATAAGACATGGTTTTGT +AAATCAAAATTATGTCCGGTTTGTAATTGGAGACGTGCTATGAAAAATAGTTATCAAGCT +CAAAAAGTGATTGAAGAAGTAATTAAGGAAAAGCCAAAAGCACGTTGGTTGTTTTTAACA +CTTTCAACAAAAAATGCGATAGATGGAGATACTTTAGAACAAAGTTTGAAGCATCTAACT +AAAGCATTTGATAGGTTGAGTAGATATAAAAAGGTTAAACAAAATCTTGTTGGATTTATG +CGTTCAACAGAAGTTACCGTTAATAAAAATGACGGTAGTTATAATCAGCATATGCACGTT +TTGTTATGTGTTGAAAATGCATATTTTAGAAAAAAAGAGAATTATATAACTCAAGAAGAA +TGGGTTAATTTATGGCAAAGAGCATTGCAAGTTGATTATCGACCTGTTGCTAATATTAAA +GCGATCAAACCAAATAAAAAAGGCGATAAAGATATTGAATCTGCAATCAAAGAGACCTCA 
+AAATATTCGGTTAAATCATCTGATTTTTTAACTGATGATGATGAAAAAAATCAAGAAATT +GTAAGTGATTTAGAAAAAGGTTTGTATCGAAAGCGTATGTTAAGTTATGGTGGATTGCTG +AAACAAAAGCATAAAATTTTAAACTTAGACGATGCTGAAGATGGAAATTTGATTAATACA +AGTGATGAAGATAAAACAACAGATGAAGAAGAAAAGGCACATTCGATTACGGCAATTTGG +AATTTTGAAAAGCAAAATTATTATTTAAGGCATTAG diff --git a/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py b/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py index ae818bca..745851cc 100644 --- a/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py +++ b/staramr/tests/integration/detection/test_AMRDetectionPlasmid.py @@ -199,5 +199,23 @@ def testIndexRangePlasmids(self): msg='Wrong Predicted Phenotype value') self.assertEqual(summary_results['Plasmid'].iloc[0], 'IncFII(pKPX1)', msg='Wrong Plasmid Type') + def testParseUnderscoresBracketsInFASTA(self): + # Tests to ensure that the plasmid AMR detection can properly parse FASTA record IDs that have + # underscores within brackets. For example: "rep21_24_rep(CN1_plasmid2)_NC_022227". The "rep(CN1_plasmid2)" + # needs to be parsed as a single element. 
+ + file = path.join(self.test_data_dir, "plasmid-underscore-brackets.fsa") + files = [file] + self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0) + + summary_results = self.amr_detection.get_summary_results() + + self.assertEqual(len(summary_results.index), 1, 'Wrong number of rows') + + self.assertEqual(summary_results['Genotype'].iloc[0], 'None', msg='Wrong Genotype value') + self.assertEqual(summary_results['Predicted Phenotype'].iloc[0], 'Sensitive', + msg='Wrong Predicted Phenotype value') + self.assertEqual(summary_results['Plasmid'].iloc[0], 'rep21', msg='Wrong Plasmid Type') + if __name__ == '__main__': unittest.main() diff --git a/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py b/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py index dc9330e4..dd5be0a2 100644 --- a/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py +++ b/staramr/tests/unit/blast/results/pointfinder/test_PlasmidfinderHitHSP.py @@ -64,3 +64,22 @@ def testParseSequenceId4(self): 'Did not parse correct gene name variant') self.assertEqual('IncFII(Serratia)_1_NC_009829', plasmid_hit_hsp.get_amr_gene_variant_accession(), 'Did not parse correct gene name variant accession') + + def testParseUnderscoresBrackets(self): + # Tests to ensure that PlasmidfinderHitHSP's init function can properly parse FASTA record IDs that have + # underscores within brackets. For example: "rep21_24_rep(CN1_plasmid2)_NC_022227". The "rep(CN1_plasmid2)" + # needs to be parsed as a single element. 
+ + test_blast_record = {"sstart": 20, "send": 30, "sstrand": "ABC", "qstart": 1, "qend": 10, + 'qseqid': 'rep21_24_rep(CN1_plasmid2)_NC_022227'} + + plasmid_hit_hsp = PlasmidfinderHitHSP('test_file', test_blast_record) + + self.assertEqual('rep21', plasmid_hit_hsp.get_amr_gene_name(), 'Did not parse correct gene name') + self.assertEqual('24', plasmid_hit_hsp.get_amr_gene_variant(), 'Did not parse correct gene variant') + self.assertEqual('rep21_24', plasmid_hit_hsp.get_amr_gene_name_with_variant(), + 'Did not parse correct gene name variant') + self.assertEqual('NC_022227', plasmid_hit_hsp.get_amr_gene_accession(), + 'Did not parse correct gene name variant') + self.assertEqual('rep21_24_NC_022227', plasmid_hit_hsp.get_amr_gene_variant_accession(), + 'Did not parse correct gene name variant accession')