diff --git a/gtdbtk/ani_screen.py b/gtdbtk/ani_screen.py
index 4b461a5..6431507 100644
--- a/gtdbtk/ani_screen.py
+++ b/gtdbtk/ani_screen.py
@@ -3,6 +3,7 @@
 from gtdbtk.ani_rep import ANIRep, ANISummaryFile
 from gtdbtk.biolib_lite.common import make_sure_path_exists, canonical_gid
+from gtdbtk.biolib_lite.seq_io import read_fasta
 from gtdbtk.biolib_lite.taxonomy import Taxonomy
 from gtdbtk.classify import Classify
@@ -45,7 +46,25 @@ def run_aniscreen(self,genomes, no_mash,out_dir,prefix, mash_k, mash_v, mash_s,

         ani_rep = ANIRep(self.cpus)
         # we store all the mash information in the classify directory
-        fastani_results = ani_rep.run_mash_fastani(genomes, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),
+
+        # we create a copy of the genomes dictionary to avoid modifying it
+        genomes_copy = genomes.copy()
+        # we remove all empty files from genomes.
+        for k in list(genomes_copy.keys()):
+            if os.path.getsize(genomes_copy[k]) == 0:
+                self.logger.warning(f'Genome {k} file is invalid for Mash. It will be removed from the sketch step.')
+                del genomes_copy[k]
+            else:
+                # we check if at least one contig is longer than the k-mer size
+                seqs = read_fasta(genomes_copy[k])
+                if all(len(seq) < mash_k for seq in seqs.values()):
+                    self.logger.warning(
+                        f'Genome {k} file has all fasta sequences shorter than the k-mer size ({mash_k}). It will be removed from the sketch step.')
+                    del genomes_copy[k]
+
+
+
+        fastani_results = ani_rep.run_mash_fastani(genomes_copy, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),
                                                    prefix, mash_k, mash_v, mash_s, mash_max_dist, mash_db)

         taxonomy = Taxonomy().read(CONFIG.TAXONOMY_FILE, canonical_ids=True)
diff --git a/gtdbtk/biolib_lite/prodigal_biolib.py b/gtdbtk/biolib_lite/prodigal_biolib.py
index 527409e..8639b52 100644
--- a/gtdbtk/biolib_lite/prodigal_biolib.py
+++ b/gtdbtk/biolib_lite/prodigal_biolib.py
@@ -85,7 +85,9 @@ def _producer(self, genome_file):

         if len(seqs) == 0:
             self.logger.warning('Cannot call Prodigal on an empty genome. '
                                 'Skipped: {}'.format(genome_file))
-            return None
+            return (
+                genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
+                table_coding_density[11], True)

         with tempfile.TemporaryDirectory('gtdbtk_prodigal_tmp_') as tmp_dir:
@@ -180,7 +182,7 @@ def _producer(self, genome_file):
                                      genome_id + '.gff'), gff_file)

         return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
-                table_coding_density[11])
+                table_coding_density[11], False)

     def _consumer(self, produced_data, consumer_data):
         """Consume results from producer processes.
@@ -205,18 +207,19 @@ def _consumer(self, produced_data, consumer_data):

         ConsumerData = namedtuple(
             'ConsumerData',
-            'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11')
+            'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11 is_empty')

         if consumer_data is None:
             consumer_data = defaultdict(ConsumerData)

-        genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11 = produced_data
+        genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11, is_empty = produced_data

         consumer_data[genome_id] = ConsumerData(aa_gene_file,
                                                 nt_gene_file,
                                                 gff_file,
                                                 best_translation_table,
                                                 coding_density_4,
-                                                coding_density_11)
+                                                coding_density_11,
+                                                is_empty)

         return consumer_data
diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
index 28d6c9a..d6410ab 100644
--- a/gtdbtk/external/prodigal.py
+++ b/gtdbtk/external/prodigal.py
@@ -93,7 +93,7 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
         if all([file_has_checksum(x) for x in out_files]):
             tln_table_file.read()
             self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
-            return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True
+            return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True, False

         # Run Prodigal
         prodigal = BioLibProdigal(1, False)
@@ -110,25 +110,34 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):

         summary_stats = summary_stats[list(summary_stats.keys())[0]]

-        # rename output files to adhere to GTDB conventions and desired genome
-        # ID
-        shutil.move(summary_stats.aa_gene_file, aa_gene_file)
-        shutil.move(summary_stats.nt_gene_file, nt_gene_file)
-        shutil.move(summary_stats.gff_file, gff_file)
+        if summary_stats.is_empty:
+            shutil.rmtree(output_dir)
+            if self.force:
+                return summary_stats
+            else:
+                raise Exception("An error was encountered while running Prodigal.")

-        # save translation table information
-        tln_table_file.best_tln_table = summary_stats.best_translation_table
-        tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
-        tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
-        tln_table_file.write()
+        else:
+            # rename output files to adhere to GTDB conventions and desired genome
+            # ID
+            shutil.move(summary_stats.aa_gene_file, aa_gene_file)
+            shutil.move(summary_stats.nt_gene_file, nt_gene_file)
+            shutil.move(summary_stats.gff_file, gff_file)

-        # Create a hash of each file
-        for out_file in out_files:
-            if out_file is not None:
-                with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
-                    fh.write(sha256(out_file))
+            # save translation table information
+            tln_table_file.best_tln_table = summary_stats.best_translation_table
+            tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
+            tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
+            tln_table_file.write()

-        return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False
+            # Create a hash of each file
+            for out_file in out_files:
+                if out_file is not None:
+                    with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
+                        fh.write(sha256(out_file))
+
+
+            return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False, summary_stats.is_empty

     def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):
         """This worker function is invoked in a process."""
@@ -144,7 +153,7 @@ def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):

             # Only proceed if an error didn't occur in BioLib Prodigal
             if rtn_files:
-                aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped = rtn_files
+                aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped, is_empty = rtn_files
                 if was_skipped:
                     with n_skipped.get_lock():
                         n_skipped.value += 1
@@ -152,7 +161,8 @@ def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):
                                  "nt_gene_path": nt_gene_file,
                                  "gff_path": gff_file,
                                  "translation_table_path": translation_table_file,
-                                 "best_translation_table": best_translation_table}
+                                 "best_translation_table": best_translation_table,
+                                 "is_empty": is_empty}

                 out_dict[genome_id] = prodigal_infos
                 writer_queue.put(genome_id)
@@ -225,35 +235,46 @@ def run(self, genomic_files, tln_tables):
             self.logger.warning(f'Prodigal skipped {n_skipped.value} {genome_s} '
                                 f'due to pre-existing data, see warnings.log')

+        # Report on any genomes which failed to have any genes called
         result_dict = dict()
         lq_gids = list()
+        empty_gids = list()
         fails = open(self.failed_genomes_file,'w')
         for gid, gid_dict in out_dict.items():
-            if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
+            if gid_dict['is_empty']:
+                empty_gids.append(gid)
+            elif os.path.getsize(gid_dict['aa_gene_path']) <= 1:
                 lq_gids.append(gid)
             else:
                 result_dict[gid] = gid_dict

         if len(lq_gids) > 0:
-            self.logger.warning(f'Skipping {len(lq_gids)} of {len(genomic_files)} '
+            self.logger.warning(f'Skipping {len(lq_gids + empty_gids)} of {len(genomic_files)} '
                                 f'genomes as no genes were called by Prodigal. '
                                 f'Check the genome quality (see gtdbtk.warnings.log).')

-            self.warnings.warning(f'The following {len(lq_gids)} genomes have '
+            self.warnings.warning(f'The following {len(lq_gids + empty_gids)} genomes have '
                                   f'been excluded from analysis due to Prodigal '
                                   f'failing to call any genes:')

             # If there are few low-quality genomes just output to console.
-            if len(lq_gids) > 10:
+            if len(lq_gids + empty_gids) > 10:
                 for lq_gid in lq_gids:
                     self.warnings.info(lq_gid)
                     fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
+                for empty_gid in empty_gids:
+                    self.warnings.info(empty_gid)
+                    fails.write(f'{empty_gid}\tEmpty file\n')
             else:
                 for lq_gid in lq_gids:
                     self.logger.warning(f'Skipping: {lq_gid}')
                     self.warnings.info(lq_gid)
                     fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
+                for empty_gid in empty_gids:
+                    self.logger.warning(f'Skipping: {empty_gid}')
+                    self.warnings.info(empty_gid)
+                    fails.write(f'{empty_gid}\tEmpty file\n')
         fails.close()

         return result_dict
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
index bf881dd..2b01b52 100644
--- a/gtdbtk/main.py
+++ b/gtdbtk/main.py
@@ -653,13 +653,6 @@ def ani_screen(self, options ):

         make_sure_path_exists(options.out_dir)

-        genomes, tln_tables = self._genomes_to_process(options.genome_dir,
-                                                       options.batchfile,
-                                                       options.extension)
-        self.genomes_to_process = genomes
-
-        make_sure_path_exists(options.out_dir)
-
         genomes, _ = self._genomes_to_process(options.genome_dir,
                                               options.batchfile,
                                               options.extension)
@@ -1164,20 +1157,25 @@ def parse_options(self, options):
                 if all_classified_ani:
                     self.logger.info('All genomes have been classified by the ANI screening step, Identify and Align steps will be skipped.')
                 else:
-                    self.identify(options,classified_genomes)
-                    options.taxa_filter = None
-                    options.custom_msa_filters = False
-                    # Added here due to the other mutex argument being included above.
-                    options.skip_trimming = False
-                    options.min_consensus = None
-                    options.min_perc_taxa = None
-                    options.skip_gtdb_refs = False
-                    options.cols_per_gene = None
-                    options.max_consensus = None
-                    options.rnd_seed = None
-                    options.skip_trimming = False
-
-                    self.align(options)
+                    reports = self.identify(options,classified_genomes)
+                    if reports is None:
+                        self.logger.info(
+                            'No genomes passed the Identify step, so the Align and Classify steps will be skipped.')
+                        all_classified_ani = True
+                    else:
+                        options.taxa_filter = None
+                        options.custom_msa_filters = False
+                        # Added here due to the other mutex argument being included above.
+                        options.skip_trimming = False
+                        options.min_consensus = None
+                        options.min_perc_taxa = None
+                        options.skip_gtdb_refs = False
+                        options.cols_per_gene = None
+                        options.max_consensus = None
+                        options.rnd_seed = None
+                        options.skip_trimming = False
+
+                        self.align(options)

             # because we run ani_screen before the identify step, we do not need to rerun the
             #ani step again, so we set the skip_aniscreen to True
diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
index fd9ad6c..910cd2b 100644
--- a/gtdbtk/markers.py
+++ b/gtdbtk/markers.py
@@ -221,6 +221,10 @@ def identify(self, genomes, tln_tables, out_dir, prefix, force, genes, write_sin
                 make_sure_path_exists(symlink_protein_dir)
                 symlink_f(os.path.abspath(gpath), os.path.join(symlink_protein_dir,gid+self.protein_file_suffix))

+        if not genome_dictionary:
+            # no genomes passed the Prodigal step
+            return None
+
         # annotated genes against TIGRFAM and Pfam databases
         self.logger.log(CONFIG.LOG_TASK, 'Identifying TIGRFAM protein families.')