allow for unfiltered variants; bump copyright, minor tweaks

kdm9 · Jul 3, 2024 · f898be4 · f898be4
1 parent bf15a39
commit f898be4
Show file tree

Hide file tree

Showing 10 changed files with 113 additions and 63 deletions.
diff --git a/acanthophis/template/workflow/config.schema.yml b/acanthophis/template/workflow/config.schema.yml
@@ -32,10 +32,10 @@ properties:
         additionalProperties:
           type: object
           properties:
-            nodes:
-              type: string
-            fmi:
+            dir:
               type: string
+            bracken:
+              type: integer
             required:
               - dir
               - bracken

diff --git a/acanthophis/template/workflow/rules/align.rules b/acanthophis/template/workflow/rules/align.rules
@@ -3,7 +3,7 @@
 # please, if you find a bug, raise an issue on github so the fix gets shared
 # with everyone.
 #
-# Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
+# Copyright 2016-2024 Kevin Murray/Gekkonid Consulting
 #
 # This Source Code Form is subject to the terms of the Mozilla Public License,
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
@@ -22,6 +22,7 @@ rule ngmap_idx:
         L("{path}-index.log"),
     resources: **rule_resources(config, "ngmap_idx", runtime=40, mem_gb=2, cores=1)
     conda: "envs/align.yml"
+    container: "docker://ghcr.io/kdm9/align:latest"
     shell:
         "( ngm"
         "   -r {input.ref}"
@@ -39,6 +40,7 @@ rule ngmap:
     resources: **rule_resources(config, "ngmap", runtime=240, mem_gb=16, cores=8)
     params:
         sensitivity=config["tool_settings"]["ngm"]["sensitivity"],
+    container: "docker://ghcr.io/kdm9/align:latest"
     conda: "envs/align.yml"
     shell:
         "( ngm"
@@ -66,6 +68,7 @@ rule bwaidx:
         R("{path}.pac"),
         R("{path}.sa"),
     conda: "envs/align.yml"
+    container: "docker://ghcr.io/kdm9/align:latest"
     log: L("{path}_index.log"),
     resources: **rule_resources(config, "bwaidx", runtime=20, mem_gb=8)
     shell:
@@ -85,6 +88,7 @@ rule bwamem:
         bam=temp(T("alignments/byrun.raw/bwa/{ref}/{run}~{lib}~{sample}.bam")),
     log: L("alignments/byrun.raw/bwa/{ref}/{run}~{lib}~{sample}.bam.log")
     resources: **rule_resources(config, "bwamem", runtime=240, mem_gb=10, cores=8)
+    container: "docker://ghcr.io/kdm9/align:latest"
     conda: "envs/align.yml"
     shell:
         "( bwa mem"
@@ -112,6 +116,7 @@ rule bam_merge_markdups_sort:
     resources: **rule_resources(config, "bam_merge_markdups_sort", runtime=240, mem_gb=16, disk_gb=50, cores=8)
     log: L("alignments/samples/{aligner}~{ref}~{sample}.bam.log")
     conda: "envs/align.yml"
+    container: "docker://ghcr.io/kdm9/align:latest"
     priority: 2
     params:
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
@@ -169,6 +174,7 @@ rule mergebam_set:
     log:
         L("alignments/sets/{aligner}~{ref}~{sampleset}.bam.log"),
     resources: **rule_resources(config, "mergebam_set", runtime=2880, mem_gb=16, disk_gb=1000, cores=64)
+    container: "docker://ghcr.io/kdm9/align:latest"
     conda: "envs/align.yml"
     params:
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
@@ -195,6 +201,7 @@ rule bamstat_sample:
         L("alignments/bamstats/sample/{aligner}~{ref}~{sample}.samtools.stats.log")
     resources: **rule_resources(config, "bamstat_sample", runtime=120, mem_gb=4, cores=1)
     conda: "envs/align.yml"
+    container: "docker://ghcr.io/kdm9/align:latest"
     shell:
         "(samtools stats -i 5000 -x {input} >{output}) >{log} 2>&1"
 
@@ -208,6 +215,7 @@ rule multiqc_samstats:
         log=L("stats/multiqc/bamstats_{aligner}~{ref}~{sampleset}_multiqc.log"),
     resources: **rule_resources(config, "multiqc_samstats", runtime=30, mem_gb=2, cores=1)
     conda: "envs/qcstats.yml"
+    container: "docker://multiqc/multiqc:v1.20"
     shell:
         "multiqc"
         "   --no-megaqc-upload"
@@ -251,6 +259,7 @@ rule multiqc_qualimap:
         log=L("stats/multiqc/qualimap_{aligner}~{ref}~{sampleset}_multiqc.log"),
     resources: **rule_resources(config, "multiqc_qualimap", runtime=30, mem_gb=2, cores=1)
     conda: "envs/qcstats.yml"
+    container: "docker://multiqc/multiqc:v1.20"
     shell:
         "multiqc"
         "   --no-megaqc-upload"
@@ -276,6 +285,7 @@ rule extract_unmapped:
         L("alignments/unmapped_reads/{aligner}~{ref}~{sample}.fastq.gz.log"),
     resources: **rule_resources(config, "extract_unmapped", runtime=120, mem_gb=1, cores=8)
     conda: "envs/align.yml"
+    container: "docker://ghcr.io/kdm9/align:latest"
     params:
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
     shell:

diff --git a/acanthophis/template/workflow/rules/base.rules b/acanthophis/template/workflow/rules/base.rules
@@ -3,7 +3,7 @@
 # please, if you find a bug, raise an issue on github so the fix gets shared
 # with everyone.
 #
-# Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
+# Copyright 2016-2024 Kevin Murray/Gekkonid Consulting
 #
 # This Source Code Form is subject to the terms of the Mozilla Public License,
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
@@ -101,12 +101,11 @@ def parse_metadata(s2rl_file):
         if s2rl_file.endswith(".tsv"):
             dialect = "excel-tab"
         for run in csv.DictReader(fh, dialect=dialect):
-            if not run["library"] or run["library"].lower().startswith("blank"):
-                # Skip blanks
-                continue
             if run.get("include", "Y").upper() != "Y" or run.get("exclude", "N").upper() == "Y":
                 # Remove non-sequenced ones
                 continue
+            if run.get("exclude_why", ""):
+                continue
             meta.append({k.lower(): v for k, v in run.items()})
     return meta
 
@@ -119,6 +118,7 @@ def make_runlib2samp(rl2s_meta):
         samp = run["sample"]
         rl2s[rl] = samp
         s2rl[samp].append(rl)
+    print(f"Parsed {len(rl2s)} run-libs from {len(s2rl)} samples")
     return dict(rl2s), dict(s2rl)
 
 

diff --git a/acanthophis/template/workflow/rules/deepvariant.rules b/acanthophis/template/workflow/rules/deepvariant.rules
@@ -1,3 +1,13 @@
+# These rules are part of Acanthophis. See https://github.com/kdm9/Acanthophis.
+# This file *could* be modified, but then be careful when you update it. And
+# please, if you find a bug, raise an issue on github so the fix gets shared
+# with everyone.
+#
+# Copyright 2020-2024 Kevin Murray/Gekkonid Consulting
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at http://mozilla.org/MPL/2.0/.
 
 rule deepvariant_gvcf:
     input:
@@ -17,7 +27,7 @@ rule deepvariant_gvcf:
         model=lambda wc: config["tool_settings"]["varcall"].get("deepvariant_model", "WGS"),
         extra="",
     shadow: "shallow"
-    resources: **rule_resources(config, "deepvariant_gvcf", runtime=600, mem_gb=96, cores=48, disk_mb=400_000)
+    resources: **rule_resources(config, "deepvariant_gvcf", runtime=600, mem_gb=96, cores=32, disk_mb=400_000)
     shell:
         "( /opt/deepvariant/bin/run_deepvariant"
         "   --model_type={params.model}"
@@ -29,29 +39,6 @@ rule deepvariant_gvcf:
         "   --intermediate_results_dir=$TMPDIR"
         "   --num_shards={threads}"
         ") &> {log}"
-        #"( mkdir -p {params.tmp_dir}"
-        #" && dv_make_examples.py"
-        #"   --cores {threads}"
-        #"   --ref {input.ref}"
-        #"   --reads {input.bam}"
-        #"   --sample {wildcards.sample}"
-        #"   --examples {params.tmp_dir}"
-        #"   --logdir {params.tmp_dir}" 
-        #"   --gvcf {params.tmp_dir}"
-        #"   {params.extra}"
-        #" && dv_call_variants.py"
-        #"   --cores {threads}"
-        #"   --outfile {params.tmp_dir}/{wc.sample}.calls"
-        #"   --sample {wildcards.sample} "
-        #"   --examples {params.tmp_dir}"
-        #"   --model {params.model}"
-        #"&& dv_postprocess_variants.py "
-        #"   --ref {input.ref} "
-        #"   --gvcf_infile {params.tmp_dir}/{wc.sample}.gvcf.tfrecord@{threads}.gz"
-        #"   --gvcf_outfile {output.gvcf} "
-        #"   --infile {params.tmp_dir}/{wc.sample}.calls"
-        #"   --outfile {output.vcf}"
-        #") &> {log}"
 
 
 localrules: glnexus_fofn
@@ -79,10 +66,10 @@ rule glnexus_call:
         T("deepvariant/{aligner}~{ref}~{sampleset}.vcf.gz.log"),
     conda:
         "envs/glnexus.yml",
-    #container:
-    #    "docker://ghcr.io/dnanexus-rnd/glnexus:v1.4.1"
+    container:
+        "docker://ghcr.io/kdm9/glnexus-bcftools:latest"
     shadow: "shallow"
-    resources: **rule_resources(config, "glnexus_call", runtime=180, mem_gb=128, cores=128)
+    resources: **rule_resources(config, "glnexus_call", runtime=180, mem_gb=512, cores=128)
     shell:
         "( glnexus_cli"
         "   --config DeepVariant"

diff --git a/acanthophis/template/workflow/rules/denovo.rules b/acanthophis/template/workflow/rules/denovo.rules
@@ -3,7 +3,7 @@
 # please, if you find a bug, raise an issue on github so the fix gets shared
 # with everyone.
 #
-# Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
+# Copyright 2016-2024 Kevin Murray/Gekkonid Consulting
 #
 # This Source Code Form is subject to the terms of the Mozilla Public License,
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
@@ -65,6 +65,7 @@ rule mash_sketch_set:
     log: L("mash/{set}~k{ksize}~s{sketchsize}.sketch.msh.log")
     resources: **rule_resources(config, "mash_sketch_set", runtime=2880, mem_gb=16, cores=48)
     conda: "envs/mash.yml"
+    container: "docker://ghcr.io/kdm9/mash:latest"
     shell:
         " mash sketch"
         "   -k {wildcards.ksize}"
@@ -84,6 +85,7 @@ rule mash_dist_set:
         L("mash/{set}~k{ksize}~s{sketchsize}.dist.log")
     resources: **rule_resources(config, "mash_dist_set", runtime=2880, mem_gb=16, cores=48)
     conda: "envs/mash.yml"
+    container: "docker://ghcr.io/kdm9/mash:latest"
     shell:
         "mash dist"
         "   -p {threads}"

diff --git a/acanthophis/template/workflow/rules/metagenome.rules b/acanthophis/template/workflow/rules/metagenome.rules
@@ -3,7 +3,7 @@
 # please, if you find a bug, raise an issue on github so the fix gets shared
 # with everyone.
 #
-# Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
+# Copyright 2016-2024 Kevin Murray/Gekkonid Consulting
 #
 # This Source Code Form is subject to the terms of the Mozilla Public License,
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
@@ -53,12 +53,13 @@ rule diamondx_reads2db:
         db=lambda wc: R(config["data_paths"]["diamond"][wc.db]),
         reads=diamondx_reads2db_input,
     output:
-        tsv=P("metagenome/diamondx/{type}/{sample}~{db}.tsv.xz"),
+        tsv=P("metagenome/diamondx/{type}/{sample}~{db}.tsv.zstd"),
     log:
         L("metagenome/diamondx/{type}/{sample}~{db}.tsv.log"),
     benchmark: 
         L("metagenome/diamondx/{type}/{sample}~{db}.tsv.bench.csv"),
     conda: "envs/diamond.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-diamond:latest"
     resources: **rule_resources(config, "diamondx_reads2db", runtime=7200, mem_gb=72, disk_gb=5, cores=24)
     shell:
         'T=/tmp/holopipe_$RANDOM; mkdir -p $T; trap "rm -rf $T" INT EXIT TERM;'
@@ -73,7 +74,7 @@ rule diamondx_reads2db:
         "   --index-chunks 4"
         "   --tmpdir $T"
         "   --ignore-warnings"
-        "   --out >(xz -T{threads} >{output.tsv})"
+        "   --out >(zstd -T{threads} >{output.tsv})"
         " &> {log}"
 
 rule humann:
@@ -196,6 +197,7 @@ rule plass_quant_diamond:
         L("metagenome/plass/{type}/{samplelike}~renamed.faa.quant.blast.tsv.log"),
     benchmark: L("metagenome/plass/{type}/{samplelike}~renamed.faa.quant.blast.tsv.bench.csv"),
     conda: "envs/diamond.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-diamond:latest"
     resources: **rule_resources(config, "plass_quant_diamond", runtime=1440, mem_gb=90, disk_gb=16, cores=32)
     shell:
         "diamond blastx"
@@ -220,6 +222,7 @@ rule plass_diamond:
         L("metagenome/plass/{path}.{db}.blasttab.log")
     benchmark: P("metagenome/plass/{path}.{db}.blasttab.bench.csv")
     conda: "envs/diamond.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-diamond:latest"
     resources: **rule_resources(config, "plass_diamond", runtime=1440, mem_gb=90, disk_gb=16, cores=32)
     shell:
         "diamond blastp"
@@ -298,7 +301,7 @@ rule all_megahit:
 
 rule all_diamondx:
     input:
-        [P(f"metagenome/diamondx/{type}/{sample}~{db}.tsv.xz")
+        [P(f"metagenome/diamondx/{type}/{sample}~{db}.tsv.zstd")
            for sampleset in config["samplesets"]
            for sample in config["SAMPLESETS"][sampleset]
            for type in config["samplesets"][sampleset].get("diamondx", {}).get("types", [])

diff --git a/acanthophis/template/workflow/rules/reads.rules b/acanthophis/template/workflow/rules/reads.rules
@@ -3,7 +3,7 @@
 # please, if you find a bug, raise an issue on github so the fix gets shared
 # with everyone.
 #
-# Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
+# Copyright 2016-2024 Kevin Murray/Gekkonid Consulting
 #
 # This Source Code Form is subject to the terms of the Mozilla Public License,
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
@@ -33,6 +33,7 @@ rule qcreads_paired_il:
         maxqualval=lambda wc: _qcparam(wc, "maxqualval"),
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     shell:
         "( AdapterRemoval"
         "   --file1 {input.reads}"
@@ -68,6 +69,7 @@ rule qcreads_paired_r12:
         maxqualval=lambda wc: _qcparam(wc, "maxqualval"),
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     shell:
         "( AdapterRemoval"
         "   --file1 {input.r1}"
@@ -104,6 +106,7 @@ rule qcreads_se:
         maxqualval=lambda wc: _qcparam(wc, "maxqualval"),
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     shell:
         "( AdapterRemoval"
         "   --file1 {input.se}"
@@ -133,6 +136,7 @@ rule merge_qcd_reads:
         L("reads/runs/{run}~{lib}.fastq.gz.log"),
     resources: **rule_resources(config, "merge_qcd_reads", runtime=30, mem_gb=1, disk_gb=1, cores=1)
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     shell:
         "(cat {input} >{output} ) >{log} 2>&1"
 
@@ -145,6 +149,7 @@ rule read_count_librun_indiv:
     log:
         L("stats/reads/readnum_librun/{run}~{lib}.tsv.log"),
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     resources: **rule_resources(config, "read_count_librun_indiv", runtime=10, mem_gb=1, disk_gb=1)
     shell:
         "( seqhax stats"
@@ -206,6 +211,7 @@ rule split_pair_sample:
         L("reads/samples/{sample}_split.log"),
     resources: **rule_resources(config, "split_pair_sample", runtime=30, mem_gb=1, disk_gb=1, cores=8)
     conda: "envs/reads.yml"
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     params:
         ziplevel=config.get("tool_settings", {}).get('ziplevel', 6),
     shell:
@@ -236,6 +242,7 @@ rule fastqc_preqc:
         fqczip=P("stats/fastqc/preqc/{run}~{lib}_fastqc.zip"),
     log: L("stats/fastqc/preqc/{run}~{lib}_fastqc.log"),
     resources: **rule_resources(config, "fastqc_preqc", runtime=30, mem_gb=1, cores=1)
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     conda: "envs/qcstats.yml"
     shell:
         "(T=$(mktemp -d);"
@@ -256,6 +263,7 @@ rule fastqc_postqc:
         fqczip=P("stats/fastqc/postqc/{run}~{lib}_fastqc.zip"),
     log: L("stats/fastqc/postqc/{run}~{lib}_fastqc.zip.log"),
     resources: **rule_resources(config, "fastqc_postqc", runtime=30, mem_gb=1, cores=1)
+    container: "docker://ghcr.io/kdm9/acanthophis-qc:latest"
     conda: "envs/qcstats.yml"
     shell:
         "set -x; (T=$(mktemp -d);"
@@ -279,6 +287,7 @@ rule multiqc_fastqc:
         log=L("stats/multiqc/reads-{prepost}~{sampleset}_multiqc.log"),
     resources: **rule_resources(config, "multiqc_fastqc", runtime=30, mem_gb=2)
     conda: "envs/qcstats.yml"
+    container: "docker://multiqc/multiqc:v1.20"
     shell:
         "multiqc"
         "   --no-megaqc-upload"