Skip to content

Commit

Permalink
fix: Skip empty files for the sketch step
Browse files Browse the repository at this point in the history
This commit is to fix few bugs:
- #540 : The empty files are skip during the sketch step of Mash,
they are then catch in the prodigal step and are returned as Unclassified
- #549 : `--force` has been modified to deal with #540
- Prodigal wasn't returning the empty files as failed genomes, it was only skipping them.
These genomes are now returned in the summary file and flagged as Unclassified.
  • Loading branch information
pchaumeil committed Nov 23, 2023
1 parent db28f9c commit 5dc36e7
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 50 deletions.
21 changes: 20 additions & 1 deletion gtdbtk/ani_screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from gtdbtk.ani_rep import ANIRep, ANISummaryFile
from gtdbtk.biolib_lite.common import make_sure_path_exists, canonical_gid
from gtdbtk.biolib_lite.seq_io import read_fasta

from gtdbtk.biolib_lite.taxonomy import Taxonomy
from gtdbtk.classify import Classify
Expand Down Expand Up @@ -45,7 +46,25 @@ def run_aniscreen(self,genomes, no_mash,out_dir,prefix, mash_k, mash_v, mash_s,

ani_rep = ANIRep(self.cpus)
# we store all the mash information in the classify directory
fastani_results = ani_rep.run_mash_fastani(genomes, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),

#we createa copy of the genomes dictionary to avoid modifying it
genomes_copy = genomes.copy()
# we remove all empty files from genomes.
for k in list(genomes_copy.keys()):
if os.path.getsize(genomes_copy[k]) == 0:
self.logger.warning(f'Genome {k} file is invalid for Mash. It will be removed from the sketch step.')
del genomes_copy[k]
else:
# we check if at least one contig is longer than the kmer size
seqs = read_fasta(genomes_copy[k])
if all(len(seq) < mash_k for seq in seqs.values()):
self.logger.warning(
f'Genome {k} file has all fasta sequences shorter than the k-mer size ({mash_k}).It will be removed from the sketch step.')
del genomes_copy[k]



fastani_results = ani_rep.run_mash_fastani(genomes_copy, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),
prefix, mash_k, mash_v, mash_s, mash_max_dist, mash_db)

taxonomy = Taxonomy().read(CONFIG.TAXONOMY_FILE, canonical_ids=True)
Expand Down
13 changes: 8 additions & 5 deletions gtdbtk/biolib_lite/prodigal_biolib.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ def _producer(self, genome_file):
if len(seqs) == 0:
self.logger.warning('Cannot call Prodigal on an empty genome. '
'Skipped: {}'.format(genome_file))
return None
return (
genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
table_coding_density[11], True)

with tempfile.TemporaryDirectory('gtdbtk_prodigal_tmp_') as tmp_dir:

Expand Down Expand Up @@ -180,7 +182,7 @@ def _producer(self, genome_file):
genome_id + '.gff'), gff_file)

return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
table_coding_density[11])
table_coding_density[11],False)

def _consumer(self, produced_data, consumer_data):
"""Consume results from producer processes.
Expand All @@ -205,18 +207,19 @@ def _consumer(self, produced_data, consumer_data):

ConsumerData = namedtuple(
'ConsumerData',
'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11')
'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11 is_empty')
if consumer_data is None:
consumer_data = defaultdict(ConsumerData)

genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11 = produced_data
genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11, is_empty = produced_data

consumer_data[genome_id] = ConsumerData(aa_gene_file,
nt_gene_file,
gff_file,
best_translation_table,
coding_density_4,
coding_density_11)
coding_density_11,
is_empty)

return consumer_data

Expand Down
67 changes: 44 additions & 23 deletions gtdbtk/external/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
if all([file_has_checksum(x) for x in out_files]):
tln_table_file.read()
self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True
return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True , False

# Run Prodigal
prodigal = BioLibProdigal(1, False)
Expand All @@ -110,25 +110,34 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):

summary_stats = summary_stats[list(summary_stats.keys())[0]]

# rename output files to adhere to GTDB conventions and desired genome
# ID
shutil.move(summary_stats.aa_gene_file, aa_gene_file)
shutil.move(summary_stats.nt_gene_file, nt_gene_file)
shutil.move(summary_stats.gff_file, gff_file)
if summary_stats.is_empty:
shutil.rmtree(output_dir)
if self.force:
return summary_stats
else:
raise Exception("An error was encountered while running Prodigal.")

# save translation table information
tln_table_file.best_tln_table = summary_stats.best_translation_table
tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
tln_table_file.write()
else:
# rename output files to adhere to GTDB conventions and desired genome
# ID
shutil.move(summary_stats.aa_gene_file, aa_gene_file)
shutil.move(summary_stats.nt_gene_file, nt_gene_file)
shutil.move(summary_stats.gff_file, gff_file)

# Create a hash of each file
for out_file in out_files:
if out_file is not None:
with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
fh.write(sha256(out_file))
# save translation table information
tln_table_file.best_tln_table = summary_stats.best_translation_table
tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
tln_table_file.write()

return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False
# Create a hash of each file
for out_file in out_files:
if out_file is not None:
with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
fh.write(sha256(out_file))


return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False , summary_stats.is_empty

def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):
"""This worker function is invoked in a process."""
Expand All @@ -144,15 +153,16 @@ def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):

# Only proceed if an error didn't occur in BioLib Prodigal
if rtn_files:
aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped = rtn_files
aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped ,is_empty = rtn_files
if was_skipped:
with n_skipped.get_lock():
n_skipped.value += 1
prodigal_infos = {"aa_gene_path": aa_gene_file,
"nt_gene_path": nt_gene_file,
"gff_path": gff_file,
"translation_table_path": translation_table_file,
"best_translation_table": best_translation_table}
"best_translation_table": best_translation_table,
"is_empty": is_empty}

out_dict[genome_id] = prodigal_infos
writer_queue.put(genome_id)
Expand Down Expand Up @@ -225,35 +235,46 @@ def run(self, genomic_files, tln_tables):
self.logger.warning(f'Prodigal skipped {n_skipped.value} {genome_s} '
f'due to pre-existing data, see warnings.log')


# Report on any genomes which failed to have any genes called
result_dict = dict()
lq_gids = list()
empty_gids = list()
fails = open(self.failed_genomes_file,'w')
for gid, gid_dict in out_dict.items():
if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
if gid_dict['is_empty']:
empty_gids.append(gid)
elif os.path.getsize(gid_dict['aa_gene_path']) <= 1:
lq_gids.append(gid)
else:
result_dict[gid] = gid_dict

if len(lq_gids) > 0:
self.logger.warning(f'Skipping {len(lq_gids)} of {len(genomic_files)} '
self.logger.warning(f'Skipping {len(lq_gids+empty_gids)} of {len(genomic_files)} '
f'genomes as no genes were called by Prodigal. '
f'Check the genome quality (see gtdbtk.warnings.log).')
self.warnings.warning(f'The following {len(lq_gids)} genomes have '
self.warnings.warning(f'The following {len(lq_gids+empty_gids)} genomes have '
f'been excluded from analysis due to Prodigal '
f'failing to call any genes:')


# If there are few low-quality genomes just output to console.
if len(lq_gids) > 10:
if len(lq_gids+empty_gids) > 10:
for lq_gid in lq_gids:
self.warnings.info(lq_gid)
fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
for empty_gid in empty_gids:
self.warnings.info(empty_gid)
fails.write(f'{empty_gid}\tEmpty file\n')
else:
for lq_gid in lq_gids:
self.logger.warning(f'Skipping: {lq_gid}')
self.warnings.info(lq_gid)
fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
for empty_gid in empty_gids:
self.logger.warning(f'Skipping: {empty_gid}')
self.warnings.info(empty_gid)
fails.write(f'{empty_gid}\tEmpty file\n')

fails.close()
return result_dict
40 changes: 19 additions & 21 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,13 +653,6 @@ def ani_screen(self, options ):

make_sure_path_exists(options.out_dir)

genomes, tln_tables = self._genomes_to_process(options.genome_dir,
options.batchfile,
options.extension)
self.genomes_to_process = genomes

make_sure_path_exists(options.out_dir)

genomes, _ = self._genomes_to_process(options.genome_dir,
options.batchfile,
options.extension)
Expand Down Expand Up @@ -1164,20 +1157,25 @@ def parse_options(self, options):
if all_classified_ani:
self.logger.info('All genomes have been classified by the ANI screening step, Identify and Align steps will be skipped.')
else:
self.identify(options,classified_genomes)
options.taxa_filter = None
options.custom_msa_filters = False
# Added here due to the other mutex argument being included above.
options.skip_trimming = False
options.min_consensus = None
options.min_perc_taxa = None
options.skip_gtdb_refs = False
options.cols_per_gene = None
options.max_consensus = None
options.rnd_seed = None
options.skip_trimming = False

self.align(options)
reports = self.identify(options,classified_genomes)
if reports is None:
self.logger.info(
'No genomes could pass the Identify step, Align and Classify steps will be skipped.')
all_classified_ani = True
else:
options.taxa_filter = None
options.custom_msa_filters = False
# Added here due to the other mutex argument being included above.
options.skip_trimming = False
options.min_consensus = None
options.min_perc_taxa = None
options.skip_gtdb_refs = False
options.cols_per_gene = None
options.max_consensus = None
options.rnd_seed = None
options.skip_trimming = False

self.align(options)

# because we run ani_screen before the identify step, we do not need to rerun the
#ani step again, so we set the skip_aniscreen to True
Expand Down
4 changes: 4 additions & 0 deletions gtdbtk/markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ def identify(self, genomes, tln_tables, out_dir, prefix, force, genes, write_sin
make_sure_path_exists(symlink_protein_dir)
symlink_f(os.path.abspath(gpath), os.path.join(symlink_protein_dir,gid+self.protein_file_suffix))

if not genome_dictionary:
# no genomes passed the prodigal step
return None

# annotated genes against TIGRFAM and Pfam databases
self.logger.log(CONFIG.LOG_TASK,
'Identifying TIGRFAM protein families.')
Expand Down

0 comments on commit 5dc36e7

Please sign in to comment.