fix: Skip empty files for the sketch step

This commit is to fix few bugs: - #540 : The empty files are skip during the sketch step of Mash, they are then catch in the prodigal step and are returned as Unclassified - #549 : `--force` has been modified to deal with #540 - Prodigal wasn't returning the empty files as failed genomes, it was only skipping them. These genomes are now returned in the summary file and flagged as Unclassified.
Ecogenomics · Nov 23, 2023 · 5dc36e7 · 5dc36e7
1 parent db28f9c
commit 5dc36e7
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 50 deletions.
diff --git a/gtdbtk/ani_screen.py b/gtdbtk/ani_screen.py
@@ -3,6 +3,7 @@
 
 from gtdbtk.ani_rep import ANIRep, ANISummaryFile
 from gtdbtk.biolib_lite.common import make_sure_path_exists, canonical_gid
+from gtdbtk.biolib_lite.seq_io import read_fasta
 
 from gtdbtk.biolib_lite.taxonomy import Taxonomy
 from gtdbtk.classify import Classify
@@ -45,7 +46,25 @@ def run_aniscreen(self,genomes, no_mash,out_dir,prefix, mash_k, mash_v, mash_s,
 
         ani_rep = ANIRep(self.cpus)
         # we store all the mash information in the classify directory
-        fastani_results = ani_rep.run_mash_fastani(genomes, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),
+
+        #we createa copy of the genomes dictionary to avoid modifying it
+        genomes_copy = genomes.copy()
+        # we remove all empty files from genomes.
+        for k in list(genomes_copy.keys()):
+            if os.path.getsize(genomes_copy[k]) == 0:
+                self.logger.warning(f'Genome {k} file is invalid for Mash. It will be removed from the sketch step.')
+                del genomes_copy[k]
+            else:
+                # we check if at least one contig is longer than the kmer size
+                seqs = read_fasta(genomes_copy[k])
+                if all(len(seq) < mash_k for seq in seqs.values()):
+                    self.logger.warning(
+                        f'Genome {k} file has all fasta sequences shorter than the k-mer size ({mash_k}).It will be removed from the sketch step.')
+                    del genomes_copy[k]
+
+
+
+        fastani_results = ani_rep.run_mash_fastani(genomes_copy, no_mash, mash_d, os.path.join(out_dir, DIR_ANISCREEN),
                                                     prefix, mash_k, mash_v, mash_s, mash_max_dist, mash_db)
 
         taxonomy = Taxonomy().read(CONFIG.TAXONOMY_FILE, canonical_ids=True)

diff --git a/gtdbtk/biolib_lite/prodigal_biolib.py b/gtdbtk/biolib_lite/prodigal_biolib.py
@@ -85,7 +85,9 @@ def _producer(self, genome_file):
             if len(seqs) == 0:
                 self.logger.warning('Cannot call Prodigal on an empty genome. '
                                     'Skipped: {}'.format(genome_file))
-                return None
+                return (
+                genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
+                table_coding_density[11], True)
 
             with tempfile.TemporaryDirectory('gtdbtk_prodigal_tmp_') as tmp_dir:
 
@@ -180,7 +182,7 @@ def _producer(self, genome_file):
                                              genome_id + '.gff'), gff_file)
 
         return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4],
-                table_coding_density[11])
+                table_coding_density[11],False)
 
     def _consumer(self, produced_data, consumer_data):
         """Consume results from producer processes.
@@ -205,18 +207,19 @@ def _consumer(self, produced_data, consumer_data):
 
         ConsumerData = namedtuple(
             'ConsumerData',
-            'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11')
+            'aa_gene_file nt_gene_file gff_file best_translation_table coding_density_4 coding_density_11 is_empty')
         if consumer_data is None:
             consumer_data = defaultdict(ConsumerData)
 
-        genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11 = produced_data
+        genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, coding_density_4, coding_density_11, is_empty = produced_data
 
         consumer_data[genome_id] = ConsumerData(aa_gene_file,
                                                 nt_gene_file,
                                                 gff_file,
                                                 best_translation_table,
                                                 coding_density_4,
-                                                coding_density_11)
+                                                coding_density_11,
+                                                is_empty)
 
         return consumer_data
 

diff --git a/gtdbtk/external/prodigal.py b/gtdbtk/external/prodigal.py
@@ -93,7 +93,7 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
         if all([file_has_checksum(x) for x in out_files]):
             tln_table_file.read()
             self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
-            return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True
+            return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, tln_table_file.best_tln_table, True , False
 
         # Run Prodigal
         prodigal = BioLibProdigal(1, False)
@@ -110,25 +110,34 @@ def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
 
         summary_stats = summary_stats[list(summary_stats.keys())[0]]
 
-        # rename output files to adhere to GTDB conventions and desired genome
-        # ID
-        shutil.move(summary_stats.aa_gene_file, aa_gene_file)
-        shutil.move(summary_stats.nt_gene_file, nt_gene_file)
-        shutil.move(summary_stats.gff_file, gff_file)
+        if summary_stats.is_empty:
+            shutil.rmtree(output_dir)
+            if self.force:
+                return summary_stats
+            else:
+                raise Exception("An error was encountered while running Prodigal.")
 
-        # save translation table information
-        tln_table_file.best_tln_table = summary_stats.best_translation_table
-        tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
-        tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
-        tln_table_file.write()
+        else:
+            # rename output files to adhere to GTDB conventions and desired genome
+            # ID
+            shutil.move(summary_stats.aa_gene_file, aa_gene_file)
+            shutil.move(summary_stats.nt_gene_file, nt_gene_file)
+            shutil.move(summary_stats.gff_file, gff_file)
 
-        # Create a hash of each file
-        for out_file in out_files:
-            if out_file is not None:
-                with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
-                    fh.write(sha256(out_file))
+            # save translation table information
+            tln_table_file.best_tln_table = summary_stats.best_translation_table
+            tln_table_file.coding_density_4 = round(summary_stats.coding_density_4 * 100, 2)
+            tln_table_file.coding_density_11 = round(summary_stats.coding_density_11 * 100, 2)
+            tln_table_file.write()
 
-        return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False
+            # Create a hash of each file
+            for out_file in out_files:
+                if out_file is not None:
+                    with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
+                        fh.write(sha256(out_file))
+
+
+        return aa_gene_file, nt_gene_file, gff_file, tln_table_file.path, summary_stats.best_translation_table, False , summary_stats.is_empty
 
     def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):
         """This worker function is invoked in a process."""
@@ -144,15 +153,16 @@ def _worker(self, out_dict, worker_queue, writer_queue, n_skipped):
 
             # Only proceed if an error didn't occur in BioLib Prodigal
             if rtn_files:
-                aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped = rtn_files
+                aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_translation_table, was_skipped ,is_empty = rtn_files
                 if was_skipped:
                     with n_skipped.get_lock():
                         n_skipped.value += 1
                 prodigal_infos = {"aa_gene_path": aa_gene_file,
                                   "nt_gene_path": nt_gene_file,
                                   "gff_path": gff_file,
                                   "translation_table_path": translation_table_file,
-                                  "best_translation_table": best_translation_table}
+                                  "best_translation_table": best_translation_table,
+                                  "is_empty": is_empty}
 
                 out_dict[genome_id] = prodigal_infos
             writer_queue.put(genome_id)
@@ -225,35 +235,46 @@ def run(self, genomic_files, tln_tables):
             self.logger.warning(f'Prodigal skipped {n_skipped.value} {genome_s} '
                                 f'due to pre-existing data, see warnings.log')
 
+
         # Report on any genomes which failed to have any genes called
         result_dict = dict()
         lq_gids = list()
+        empty_gids = list()
         fails = open(self.failed_genomes_file,'w')
         for gid, gid_dict in out_dict.items():
-            if os.path.getsize(gid_dict['aa_gene_path']) <= 1:
+            if gid_dict['is_empty']:
+                empty_gids.append(gid)
+            elif os.path.getsize(gid_dict['aa_gene_path']) <= 1:
                 lq_gids.append(gid)
             else:
                 result_dict[gid] = gid_dict
 
         if len(lq_gids) > 0:
-            self.logger.warning(f'Skipping {len(lq_gids)} of {len(genomic_files)} '
+            self.logger.warning(f'Skipping {len(lq_gids+empty_gids)} of {len(genomic_files)} '
                                 f'genomes as no genes were called by Prodigal. '
                                 f'Check the genome quality (see gtdbtk.warnings.log).')
-            self.warnings.warning(f'The following {len(lq_gids)} genomes have '
+            self.warnings.warning(f'The following {len(lq_gids+empty_gids)} genomes have '
                                   f'been excluded from analysis due to Prodigal '
                                   f'failing to call any genes:')
 
 
             # If there are few low-quality genomes just output to console.
-            if len(lq_gids) > 10:
+            if len(lq_gids+empty_gids) > 10:
                 for lq_gid in lq_gids:
                     self.warnings.info(lq_gid)
                     fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
+                for empty_gid in empty_gids:
+                    self.warnings.info(empty_gid)
+                    fails.write(f'{empty_gid}\tEmpty file\n')
             else:
                 for lq_gid in lq_gids:
                     self.logger.warning(f'Skipping: {lq_gid}')
                     self.warnings.info(lq_gid)
                     fails.write(f'{lq_gid}\tNo genes were called by Prodigal\n')
+                for empty_gid in empty_gids:
+                    self.logger.warning(f'Skipping: {empty_gid}')
+                    self.warnings.info(empty_gid)
+                    fails.write(f'{empty_gid}\tEmpty file\n')
 
         fails.close()
         return result_dict
diff --git a/gtdbtk/main.py b/gtdbtk/main.py
@@ -653,13 +653,6 @@ def ani_screen(self, options ):
 
         make_sure_path_exists(options.out_dir)
 
-        genomes, tln_tables = self._genomes_to_process(options.genome_dir,
-                                                       options.batchfile,
-                                                       options.extension)
-        self.genomes_to_process = genomes
-
-        make_sure_path_exists(options.out_dir)
-
         genomes, _ = self._genomes_to_process(options.genome_dir,
                                               options.batchfile,
                                               options.extension)
@@ -1164,20 +1157,25 @@ def parse_options(self, options):
             if all_classified_ani:
                 self.logger.info('All genomes have been classified by the ANI screening step, Identify and Align steps will be skipped.')
             else:
-                self.identify(options,classified_genomes)
-                options.taxa_filter = None
-                options.custom_msa_filters = False
-                # Added here due to the other mutex argument being included above.
-                options.skip_trimming = False
-                options.min_consensus = None
-                options.min_perc_taxa = None
-                options.skip_gtdb_refs = False
-                options.cols_per_gene = None
-                options.max_consensus = None
-                options.rnd_seed = None
-                options.skip_trimming = False
-
-                self.align(options)
+                reports  = self.identify(options,classified_genomes)
+                if reports is None:
+                    self.logger.info(
+                        'No genomes could pass the Identify step, Align and Classify steps will be skipped.')
+                    all_classified_ani = True
+                else:
+                    options.taxa_filter = None
+                    options.custom_msa_filters = False
+                    # Added here due to the other mutex argument being included above.
+                    options.skip_trimming = False
+                    options.min_consensus = None
+                    options.min_perc_taxa = None
+                    options.skip_gtdb_refs = False
+                    options.cols_per_gene = None
+                    options.max_consensus = None
+                    options.rnd_seed = None
+                    options.skip_trimming = False
+
+                    self.align(options)
 
             # because we run ani_screen before the identify step, we do not need to rerun the
             #ani step again, so we set the skip_aniscreen to True

diff --git a/gtdbtk/markers.py b/gtdbtk/markers.py
@@ -221,6 +221,10 @@ def identify(self, genomes, tln_tables, out_dir, prefix, force, genes, write_sin
                 make_sure_path_exists(symlink_protein_dir)
                 symlink_f(os.path.abspath(gpath), os.path.join(symlink_protein_dir,gid+self.protein_file_suffix))
 
+        if not genome_dictionary:
+            # no genomes passed the prodigal step
+            return None
+
         # annotated genes against TIGRFAM and Pfam databases
         self.logger.log(CONFIG.LOG_TASK,
                         'Identifying TIGRFAM protein families.')