Skip to content

Commit

Permalink
Merge pull request #159 from phac-nml/fix/parsing
Browse files Browse the repository at this point in the history
Ignore Underscores within Brackets of Plasmid FASTA Record ID
  • Loading branch information
apetkau authored Oct 26, 2022
2 parents af89027 + 28039b6 commit 6e8ded8
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Version 0.9.1

* Fixed a bug that occurred when parsing some plasmid FASTA record IDs.

# Version 0.9.0

* Updates to PointFinder database handling
Expand Down
2 changes: 1 addition & 1 deletion staramr/blast/results/plasmidfinder/PlasmidfinderHitHSP.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, file: str, blast_record: pd.Series) -> None:

logger.debug("record=%s", self._blast_record)

splitList = re.split('_', self.get_amr_gene_id())
splitList = re.split(r'_\s*(?![^()]*\))', self.get_amr_gene_id())
re_search = list(filter(None, splitList)) # type: List[str]

if not re_search:
Expand Down
18 changes: 18 additions & 0 deletions staramr/tests/integration/data/plasmid-underscore-brackets.fsa
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
>rep21_24_rep(CN1_plasmid2)_NC_022227
ATGCAATATAATACTACTAGAAGTATAACCGAAAATCAAGATAATAAAACGTTAAAAGAT
ATGACGAAAAGTGGGAAACAACGCCCATGGAGAGAAAAGAAAATAGATAATGTAAGCTAT
GCAGATATACTAGAAATTTTAAAAATCAAAAAGGCTTTTAATGTAAAACAATGTGGTAAT
ATTTTAGAATTTAAGCCAACTGATGAAGGTTATTTGAAGTTACATAAGACATGGTTTTGT
AAATCAAAATTATGTCCGGTTTGTAATTGGAGACGTGCTATGAAAAATAGTTATCAAGCT
CAAAAAGTGATTGAAGAAGTAATTAAGGAAAAGCCAAAAGCACGTTGGTTGTTTTTAACA
CTTTCAACAAAAAATGCGATAGATGGAGATACTTTAGAACAAAGTTTGAAGCATCTAACT
AAAGCATTTGATAGGTTGAGTAGATATAAAAAGGTTAAACAAAATCTTGTTGGATTTATG
CGTTCAACAGAAGTTACCGTTAATAAAAATGACGGTAGTTATAATCAGCATATGCACGTT
TTGTTATGTGTTGAAAATGCATATTTTAGAAAAAAAGAGAATTATATAACTCAAGAAGAA
TGGGTTAATTTATGGCAAAGAGCATTGCAAGTTGATTATCGACCTGTTGCTAATATTAAA
GCGATCAAACCAAATAAAAAAGGCGATAAAGATATTGAATCTGCAATCAAAGAGACCTCA
AAATATTCGGTTAAATCATCTGATTTTTTAACTGATGATGATGAAAAAAATCAAGAAATT
GTAAGTGATTTAGAAAAAGGTTTGTATCGAAAGCGTATGTTAAGTTATGGTGGATTGCTG
AAACAAAAGCATAAAATTTTAAACTTAGACGATGCTGAAGATGGAAATTTGATTAATACA
AGTGATGAAGATAAAACAACAGATGAAGAAGAAAAGGCACATTCGATTACGGCAATTTGG
AATTTTGAAAAGCAAAATTATTATTTAAGGCATTAG
18 changes: 18 additions & 0 deletions staramr/tests/integration/detection/test_AMRDetectionPlasmid.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,5 +199,23 @@ def testIndexRangePlasmids(self):
msg='Wrong Predicted Phenotype value')
self.assertEqual(summary_results['Plasmid'].iloc[0], 'IncFII(pKPX1)', msg='Wrong Plasmid Type')

def testParseUnderscoresBracketsInFASTA(self):
    # Integration check: plasmid AMR detection must cope with FASTA record IDs whose bracketed
    # section contains underscores, e.g. "rep21_24_rep(CN1_plasmid2)_NC_022227". The element
    # "rep(CN1_plasmid2)" has to be treated as a single token when the ID is parsed.

    fasta_path = path.join(self.test_data_dir, "plasmid-underscore-brackets.fsa")
    self.amr_detection.run_amr_detection([fasta_path], 99, 90, 90, 90, 0, 0, 0, 0, 0)

    summary = self.amr_detection.get_summary_results()

    # The fixture holds a single record, so exactly one summary row is expected.
    self.assertEqual(len(summary.index), 1, 'Wrong number of rows')

    # (column, expected value in the first row, message reported on mismatch)
    expected_cells = (
        ('Genotype', 'None', 'Wrong Genotype value'),
        ('Predicted Phenotype', 'Sensitive', 'Wrong Predicted Phenotype value'),
        ('Plasmid', 'rep21', 'Wrong Plasmid Type'),
    )
    for column, expected, failure_msg in expected_cells:
        self.assertEqual(summary[column].iloc[0], expected, msg=failure_msg)

# Run this module's unittest test cases when the file is executed directly as a script.
if __name__ == '__main__':
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,22 @@ def testParseSequenceId4(self):
'Did not parse correct gene name variant')
self.assertEqual('IncFII(Serratia)_1_NC_009829', plasmid_hit_hsp.get_amr_gene_variant_accession(),
'Did not parse correct gene name variant accession')

def testParseUnderscoresBrackets(self):
    # Tests to ensure that PlasmidfinderHitHSP's init function can properly parse FASTA record IDs that have
    # underscores within brackets. For example: "rep21_24_rep(CN1_plasmid2)_NC_022227". The "rep(CN1_plasmid2)"
    # needs to be parsed as a single element, so the underscore inside the brackets must not shift the
    # name / variant / accession fields checked below.

    test_blast_record = {"sstart": 20, "send": 30, "sstrand": "ABC", "qstart": 1, "qend": 10,
                         'qseqid': 'rep21_24_rep(CN1_plasmid2)_NC_022227'}

    plasmid_hit_hsp = PlasmidfinderHitHSP('test_file', test_blast_record)

    self.assertEqual('rep21', plasmid_hit_hsp.get_amr_gene_name(), 'Did not parse correct gene name')
    self.assertEqual('24', plasmid_hit_hsp.get_amr_gene_variant(), 'Did not parse correct gene variant')
    self.assertEqual('rep21_24', plasmid_hit_hsp.get_amr_gene_name_with_variant(),
                     'Did not parse correct gene name variant')
    # The message names the accession so a failure here is not mistaken for the name/variant check above.
    self.assertEqual('NC_022227', plasmid_hit_hsp.get_amr_gene_accession(),
                     'Did not parse correct gene accession')
    self.assertEqual('rep21_24_NC_022227', plasmid_hit_hsp.get_amr_gene_variant_accession(),
                     'Did not parse correct gene name variant accession')

0 comments on commit 6e8ded8

Please sign in to comment.