From 49581ae9ed47cc17362e5760bf6c06f60f7e1cea Mon Sep 17 00:00:00 2001 From: rhysnewell Date: Sun, 10 Mar 2024 23:41:15 +0000 Subject: [PATCH] fix: better catching of singlem error messages --- README.md | 2 +- aviary/modules/annotation/annotation.smk | 2 +- aviary/modules/binning/binning.smk | 2 ++ aviary/modules/processor.py | 2 ++ aviary/scripts/singlem_reads.py | 20 +++++++++++++++++--- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 184813cb..7f7c8daf 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ ask you to set these environment variables upon first running and if they are no the `aviary configure` subcommand to reset the environment variables: ```commandline -aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --download +aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --singlem-metapackage-path /shared/db/singlem/ --download ``` This command will check if the databases exist at those given locations, if they don't then aviary will download and change diff --git a/aviary/modules/annotation/annotation.smk b/aviary/modules/annotation/annotation.smk index 6b2ca96a..f7fdf185 100644 --- a/aviary/modules/annotation/annotation.smk +++ b/aviary/modules/annotation/annotation.smk @@ -99,7 +99,7 @@ rule download_gtdb: # Uncompress and pipe output to TQDM 'echo "[INFO] - Extracting archive..."; ' - 'tar xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; ' + 'tar -xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; ' # Remove the file after successful extraction 'rm "$TARGET_TAR"; ' diff --git a/aviary/modules/binning/binning.smk b/aviary/modules/binning/binning.smk index d9eaea76..9a28a708 100644 --- a/aviary/modules/binning/binning.smk +++ b/aviary/modules/binning/binning.smk @@ -732,6 +732,8 @@ rule checkm_das_tool: rule singlem_pipe_reads: output: "data/singlem_out/metagenome.combined_otu_table.csv" + params: + package_path = os.environ["SINGLEM_METAPACKAGE_PATH"] threads: min(config["max_threads"], 48) resources: mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 8*1024*attempt), diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index 93943dce..bba9dd66 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -150,6 +150,8 @@ def __init__(self, self.skip_binners = ["none"] self.skip_abundances = False self.binning_only = False + self.skip_taxonomy = False + self.skip_singlem = False try: self.assembly = args.assembly diff --git a/aviary/scripts/singlem_reads.py b/aviary/scripts/singlem_reads.py index f44a23cc..3b0b5978 100644 --- a/aviary/scripts/singlem_reads.py +++ b/aviary/scripts/singlem_reads.py @@ -137,13 +137,16 @@ def __init__(self, threads: int, output_dir: str, read_container: ReadContainer, def run(self): with open(self.logf, "a") as logf: - logf.write("generating SingleM commands") + logf.write("generating SingleM commands\n") self.create_commands() + for command in self.commands: + logf.write(" ".join(command) + "\n") + logf.write("running SingleM commands\n") self.run_commands(logf) self.combine_otu_tables(logf) def combine_otu_tables(self, logf): - logf.write("combining SingleM otu tables") + logf.write("combining SingleM otu tables\n") intermidate_otu_tables = glob.glob(os.path.join(self.intermediate_dir, "*.csv")) summarise_cmd = f"singlem summarise --input-otu-tables {' '.join(intermidate_otu_tables)} --output-otu-table {os.path.join(self.output_dir, 'metagenome.combined_otu_table.csv')}".split() try: @@ -160,16 +163,19 @@ def create_commands(self): self._create_longread_commands() self._create_shortread_commands() + def run_commands(self, logf): process_index = 0 for command in self.commands: f = tempfile.TemporaryFile() - p = subprocess.Popen(command, stdout=f, stderr=STDOUT) + p = subprocess.Popen(command, stdout=f, stderr=logf) self.process_queue.append((p, f)) process_index += 1 if len(self.process_queue) >= self.threads: self._check_processes(self.threads + 1, logf) + # write how many processes are left + logf.write(f"waiting for {len(self.process_queue)} processes to finish\n") while len(self.process_queue) > 0: self._check_processes(0, logf) @@ -215,8 +221,16 @@ def run_singlem( singlem_container = SingleMContainer(threads, output_dir, read_container, log) singlem_container.run() +def valid_path(path: str) -> bool: + return os.path.exists(path) if __name__ == '__main__': + # check if SINGLEM_METAPACKAGE_PATH environment variable is set and path is valid + # if not then, error and exit + os.environ["SINGLEM_METAPACKAGE_PATH"] = snakemake.params.package_path + if "SINGLEM_METAPACKAGE_PATH" not in os.environ or not valid_path(os.environ["SINGLEM_METAPACKAGE_PATH"]): + raise ValueError("SINGLEM_METAPACKAGE_PATH environment variable not set. Please set using 'aviary configure' or manually. Exiting.") + long_reads = snakemake.config['long_reads'] short_reads_1 = snakemake.config['short_reads_1'] short_reads_2 = snakemake.config['short_reads_2']