Skip to content

Commit

Permalink
Merge pull request #551 from Ecogenomics/ncbi_mv_script_update
Browse files Browse the repository at this point in the history
chore: updated script for translating GTDB-Tk classifications to NCBI…
  • Loading branch information
pchaumeil authored Oct 4, 2023
2 parents 7765d60 + 1ec72d6 commit db28f9c
Showing 1 changed file with 25 additions and 18 deletions.
43 changes: 25 additions & 18 deletions scripts/gtdb_to_ncbi_majority_vote.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
__copyright__ = 'Copyright 2019'
__credits__ = ['Donovan Parks']
__license__ = 'GPL3'
__version__ = '0.2.0'
__version__ = '0.2.1'
__maintainer__ = 'Donovan Parks'
__email__ = '[email protected]'
__status__ = 'Development'
Expand All @@ -52,7 +52,8 @@
FAMILY_IDX = 4
SPECIES_IDX = 6

class Translate(object):

class GtdbNcbiTranslate(object):
"""Translate GTDB to NCBI classification via majority vote."""

def __init__(self):
Expand Down Expand Up @@ -100,10 +101,10 @@ def get_gtdbtk_classifications(self,
gtdbtk_ar_assignments = self.parse_gtdbtk_classifications(
ar_summary)
else:
logger.warning(
f'Archaeal GTDB-Tk classification file does not exist.')
logger.warning(
f'Assuming there are no archaeal genomes to reclassify.')
self.logger.warning(
'Archaeal GTDB-Tk classification file does not exist.')
self.logger.warning(
'Assuming there are no archaeal genomes to reclassify.')

gtdbtk_bac_assignments = {}
if bac120_metadata_file:
Expand All @@ -114,10 +115,10 @@ def get_gtdbtk_classifications(self,
gtdbtk_bac_assignments = self.parse_gtdbtk_classifications(
bac_summary)
else:
logger.warning(
f'Bacterial GTDB-Tk classification file does not exist.')
logger.warning(
f'Assuming there are no bacterial genomes to reclassify.')
self.logger.warning(
'Bacterial GTDB-Tk classification file does not exist.')
self.logger.warning(
'Assuming there are no bacterial genomes to reclassify.')

return gtdbtk_ar_assignments, gtdbtk_bac_assignments

Expand Down Expand Up @@ -243,7 +244,8 @@ def parse_gtdb_metadata(self, ar53_metadata_file, bac120_metadata_file):
if gid == rep_id:
# genome is a GTDB representative
gtdb_taxonomy = tokens[gtdb_taxonomy_index]
gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
gtdb_taxa = [t.strip()
for t in gtdb_taxonomy.split(';')]
gtdb_family = gtdb_taxa[FAMILY_IDX]
gtdb_family_to_rids[gtdb_family].add(gid)

Expand Down Expand Up @@ -315,7 +317,7 @@ def resolve_majority_vote(self, taxon_counter, num_votes):
return None

self.logger.error('Unexpected case while resolving majority vote.')
assert(False)
assert False

def ncbi_sp_majority_vote(self, gtdb_sp_clusters, ncbi_taxa, ncbi_lineages):
"""Get NCBI majority vote classification for each GTDB species cluster."""
Expand Down Expand Up @@ -411,6 +413,11 @@ def ncbi_majority_vote(self,
# placed in species-level trees
processed_gids = set()
for tree_file in sp_trees:
if not os.path.exists(tree_file):
# can occur since all genomes might be classified
# using ANI prescreening
continue

self.logger.info(f' - parsing {tree_file}')
tree = dendropy.Tree.get_from_path(tree_file,
schema='newick',
Expand Down Expand Up @@ -513,9 +520,9 @@ def ncbi_majority_vote(self,
ncbi_mv = ncbi_sp_classification[gtdb_sp_rid]

fout.write('{}\t{}\t{}\n'.format(
gid,
';'.join(gtdb_taxa),
';'.join(ncbi_mv)))
gid,
';'.join(gtdb_taxa),
';'.join(ncbi_mv)))

def run(self,
gtdbtk_output_dir,
Expand Down Expand Up @@ -632,8 +639,8 @@ def print_help():
GTDB bacterial metadata file (if processing bacterial genomes).
NOTE: GTDB metadata files are available for download at:
https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz
{colour('Optional arguments:', ['underscore'])}
{colour('--gtdbtk_prefix', ['bright'])}
Expand Down Expand Up @@ -683,7 +690,7 @@ def print_help():
sys.exit(1)

try:
p = Translate()
p = GtdbNcbiTranslate()
p.run(args.gtdbtk_output_dir,
args.ar53_metadata_file,
args.bac120_metadata_file,
Expand Down

0 comments on commit db28f9c

Please sign in to comment.