
Commit

Merge pull request #35 from metagenlab/readproc
Added error-free BAMs, optimized read processing and functions
farchaab authored Nov 1, 2024
2 parents 89471d5 + 568e080 commit 6dd8f2d
Showing 22 changed files with 232 additions and 296 deletions.
15 changes: 3 additions & 12 deletions .github/workflows/unit-tests.yml
@@ -46,6 +46,9 @@ jobs:
        with:
          fetch-depth: 0

      - name: Setup apptainer
        uses: eWaterCycle/[email protected]

      - name: Setup MeSS environment
        uses: conda-incubator/setup-miniconda@v3
        with:
@@ -54,18 +57,6 @@
          python-version: ${{ matrix.python-version }}
          auto-update-conda: true

      - name: Setup apt dependencies
        run: |
          sudo add-apt-repository -y ppa:apptainer/ppa
          sudo apt-get update
          sudo apt install -y squashfuse fuse2fs gocryptfs apptainer
      - name: Disable apparmor namespace restrictions for apptainer
        run: |
          sudo sh -c 'echo kernel.apparmor_restrict_unprivileged_userns=0 \
            >/etc/sysctl.d/90-disable-userns-restrictions.conf'
          sudo sysctl -p /etc/sysctl.d/90-disable-userns-restrictions.conf
      - name: Install MeSS and pytest-cov
        run: |
          pip install -e .
8 changes: 7 additions & 1 deletion mess/__main__.py
@@ -64,7 +64,13 @@
    },
    {
        "name": "art_illumina options",
        "options": ["--custom-err", "--paired", "--frag-len", "--frag-sd"],
        "options": [
            "--custom-err",
            "--errfree",
            "--paired",
            "--frag-len",
            "--frag-sd",
        ],
    },
    {
        "name": "pbsim3 options",
3 changes: 2 additions & 1 deletion mess/config/config.yaml
@@ -11,7 +11,8 @@ args:
  configfile:
  custom_err:
  dist:
  error:
  error:
  errfree:
  fasta:
  frag_len:
  frag_sd:
2 changes: 1 addition & 1 deletion mess/test_data/minimal_test.tsv
@@ -1,3 +1,3 @@
taxon nb cov_sim sample
staphylococcus_aureus 1 0.1 sample1
1290 1 0.1 sample2
1290 1 0.1 sample2
5 changes: 5 additions & 0 deletions mess/util.py
@@ -291,6 +291,11 @@ def sim_options(func):
            type=str,
            default=None,
        ),
        click.option(
            "--errfree",
            help="Generate error free alignments with art_illumina",
            is_flag=True,
        ),
        click.option(
            "--replicates",
            help="Number of replicates per sample",
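Note on the new flag: --errfree is a plain boolean (is_flag=True), so it takes no value on the command line. A minimal, self-contained sketch of the same click pattern; the standalone simulate command below is hypothetical and stands in for the sim_options decorator above:

import click


@click.command()
@click.option(
    "--errfree",
    help="Generate error free alignments with art_illumina",
    is_flag=True,
)
def simulate(errfree):
    # Absent -> False, present -> True; downstream rules can then request
    # ART's zero-error SAM output (its -ef/--errfree option) as well.
    click.echo(f"error-free alignments: {errfree}")


if __name__ == "__main__":
    simulate()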
2 changes: 2 additions & 0 deletions mess/workflow/Snakefile
@@ -112,6 +112,7 @@ include: os.path.join("rules", "processing", "fastas.smk")
CUSTOM_ERR = config.args.custom_err
ERROR = config.args.error
BAM = config.args.bam
ERRFREE = config.args.errfree
MIN_LEN = config.args.min_len
MAX_LEN = config.args.max_len
SD_LEN = config.args.sd_len
@@ -131,6 +132,7 @@ else:


# reads post-processing options
random.seed(SEED)
SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
SKIP_SHUFFLE = config.args.skip_shuffle
RANKS = config.args.ranks
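Note on random.seed(SEED): seeding before drawing the per-sample shuffle seeds makes the SHUFFLE mapping reproducible across reruns. A small standalone sketch, where SEED and SAMPLES are stand-ins for the workflow globals:

import random

SEED = 42  # stand-in for the workflow's seed setting
SAMPLES = ["sample1", "sample2"]

# With a fixed seed, every rerun draws the same per-sample values,
# so shuffled read order is stable between workflow invocations.
random.seed(SEED)
SHUFFLE = dict(zip(SAMPLES, random.sample(range(1, 100000), len(SAMPLES))))
print(SHUFFLE)  # identical output on every run with the same SEED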
4 changes: 2 additions & 2 deletions mess/workflow/envs/conda/assembly_finder.yml
@@ -4,8 +4,8 @@ channels:
  - bioconda
  - defaults
dependencies:
  - assembly_finder =0.7.6
  - ncbi-datasets-cli =16.26.2
  - assembly_finder =0.8.0
  - ncbi-datasets-cli =16.31.0
  - taxonkit =0.17.0
  - csvtk =0.30.0
  - rsync =3.3.0
7 changes: 7 additions & 0 deletions mess/workflow/envs/conda/samtools.yml
@@ -0,0 +1,7 @@
name: samtools
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - samtools =1.21
3 changes: 2 additions & 1 deletion mess/workflow/envs/containers.yml
@@ -1,9 +1,10 @@
art: docker://quay.io/biocontainers/art:2016.06.05--heacdb12_11
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.7.7
assembly_finder: docker://ghcr.io/metagenlab/assembly_finder:v0.8.0
bioconvert: docker://quay.io/biocontainers/bioconvert:1.1.1--pyhdfd78af_0
curl: docker://quay.io/biocontainers/curl:7.80.0
pigz: docker://quay.io/biocontainers/pigz:2.8
pbccs: docker://quay.io/biocontainers/pbccs:6.4.0--h9ee0642_0
pbsim3: docker://quay.io/biocontainers/pbsim3:3.0.4--h4ac6f70_0
seqkit: docker://quay.io/biocontainers/seqkit:2.8.2--h9ee0642_0
taxonkit: docker://quay.io/biocontainers/taxonkit:0.17.0--h9ee0642_1
samtools: docker://quay.io/biocontainers/samtools:1.21--h50ea8bc_0
6 changes: 1 addition & 5 deletions mess/workflow/rules/download/assembly_finder.smk
@@ -20,7 +20,7 @@ rule get_unique_entries:
)


af_args = ""
af_args = "--no-use-conda "
if TAXON:
    af_args += "--taxon "
if LIMIT:
@@ -59,14 +59,11 @@ checkpoint download_assemblies:
        tsv=os.path.join(dir.out.base, "uniq_entries.tsv"),
    output:
        asm=os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"),
        seq=os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"),
        tax=os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"),
    params:
        args=af_args,
        taxonkit=TAXONKIT,
        out=os.path.join(dir.out.base, "assembly_finder"),
    benchmark:
        os.path.join(dir.out.bench, "assembly_finder.txt")
    log:
        os.path.join(dir.out.logs, "assembly_finder.log"),
    resources:
@@ -84,6 +81,5 @@
        --taxonkit {params.taxonkit} \\
        --threads {threads} \\
        {params.args} \\
        --no-use-conda \\
        -o {params.out} 2> {log}
        """
1 change: 1 addition & 0 deletions mess/workflow/rules/preflight/directories.smk
@@ -34,6 +34,7 @@ dir.out.short = os.path.join(dir.out.processing, "short")
dir.out.long = os.path.join(dir.out.processing, "long")
dir.out.fastq = os.path.join(dir.out.base, "fastq")
dir.out.bam = os.path.join(dir.out.base, "bam")
dir.out.ef = os.path.join(dir.out.bam, "error-free")
dir.out.tax = os.path.join(dir.out.base, "tax")


164 changes: 54 additions & 110 deletions mess/workflow/rules/preflight/functions.smk
@@ -10,34 +10,39 @@ import random
wildcard_constraints:
    sample="[^/]+",
    contig="[^/]+",
    fasta="[^/]+",


def list_reads(wildcards):
    fastqs = "{sample}.fq.gz"
    args = {"sample": SAMPLES}

    if PAIRED:
        reads = expand(
            os.path.join(dir.out.fastq, "{sample}_R{p}.fq.gz"),
            sample=SAMPLES,
            p=PAIRS,
        )
    else:
        reads = expand(
            os.path.join(dir.out.fastq, "{sample}.fq.gz"),
            sample=SAMPLES,
        )
        fastqs = "{sample}_R{p}.fq.gz"
        args.update({"p": PAIRS})
    reads = collect(os.path.join(dir.out.fastq, fastqs), **args)

    if BAM:
        bams = expand(
        bams = collect(
            os.path.join(dir.out.bam, "{sample}.{bam}"),
            sample=SAMPLES,
            bam=["bam", "bam.bai"],
        )
        tax = expand(
        tax = collect(
            os.path.join(dir.out.tax, "{sample}_{abundance}.txt"),
            sample=SAMPLES,
            abundance=["seq", "tax"],
        )
        reads = reads + bams + tax

    if ERRFREE:
        bams_ef = expand(
            os.path.join(dir.out.ef, "{sample}.{bam}"),
            sample=SAMPLES,
            bam=["bam", "bam.bai"],
        )
        reads = reads + bams_ef

    return reads
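Note on the refactor: collect is, to my knowledge, Snakemake's alias for expand, so the rewrite builds one pattern plus keyword arguments instead of two expand branches. A pure-Python model of the combination logic; this expand helper is a simplified assumption, not Snakemake's implementation:

import os
from itertools import product


def expand(pattern, **kwargs):
    # Simplified stand-in: format the pattern with every combination of
    # wildcard values, mirroring what Snakemake's expand/collect returns.
    keys = list(kwargs)
    return [
        pattern.format(**dict(zip(keys, combo)))
        for combo in product(*(kwargs[k] for k in keys))
    ]


SAMPLES = ["sample1", "sample2"]
PAIRS = [1, 2]

fastqs = "{sample}_R{p}.fq.gz"
args = {"sample": SAMPLES, "p": PAIRS}
print(expand(os.path.join("fastq", fastqs), **args))
# ['fastq/sample1_R1.fq.gz', 'fastq/sample1_R2.fq.gz',
#  'fastq/sample2_R1.fq.gz', 'fastq/sample2_R2.fq.gz']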


@@ -67,32 +72,43 @@ def parse_samples(indir, replicates):
fasta_cache = {}


def fasta_input(wildcards):
    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
def get_fasta_table(wildcards):
    fa_table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
    if fa_table not in fasta_cache:
        fa_df = pd.read_csv(fa_table, sep="\t", index_col="fasta")
        fasta_cache[fa_table] = fa_df
    fa_df = fasta_cache[fa_table]
    return fa_df

    df = pd.read_csv(table, sep="\t", index_col="fasta")

def fasta_input(wildcards):
    df = get_fasta_table(wildcards)
    # some samples share the same genome path; drop duplicates to avoid
    # processing the same fasta more than once
    try:
        return df.loc[wildcards.fasta]["path"].drop_duplicates()
    except AttributeError:
        return df.loc[wildcards.fasta]["path"]


def list_fastas(wildcards):
    table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
    if table not in fasta_cache:
        df = pd.read_csv(table, sep="\t")
        fasta_cache[table] = df
    df = fasta_cache[table]
    fastas = list(set(df["fasta"]))
    return expand(os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=fastas)
    df = get_fasta_table(wildcards)
    return expand(
        os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=list(set(df.index))
    )


table_cache = {}


def get_value(value, wildcards):
def get_cov_table(wildcards, key, idx_col):
    cov_table = checkpoints.split_contigs.get(**wildcards).output[0]
    if key not in table_cache:
        cov_df = pd.read_csv(cov_table, sep="\t", index_col=idx_col).sort_index()
        table_cache[key] = cov_df
    cov_df = table_cache[key]
    return cov_df


def get_value(value, wildcards):
    vals = (
        f"{wildcards.sample}",
        f"{wildcards.fasta}",
@@ -103,17 +119,8 @@ def get_value(value, wildcards):
    if CIRCULAR:
        idx_col += ["n"]
        vals += (int(wildcards.n),)

    table = checkpoints.split_contigs.get(**wildcards).output[0]
    if table not in table_cache:
        df = pd.read_csv(
            table,
            sep="\t",
            index_col=idx_col,
        ).sort_index()
        table_cache[table] = df
    df = table_cache[table]
    return df.loc[vals, value]
    val_df = get_cov_table(wildcards, "values", idx_col)
    return val_df.loc[vals, value]


def get_asm_summary(wildcards):
@@ -128,10 +135,6 @@ def get_asm_summary(wildcards):
    return table


def get_cov_table(wildcards):
    return checkpoints.split_contigs.get(**wildcards).output[0]


def is_circular():
    if os.path.isfile(INPUT):
        files = [INPUT]
@@ -144,78 +147,19 @@
    return False


def aggregate(wildcards, outdir, level, ext):
    table = checkpoints.split_contigs.get(**wildcards).output[0]
    df = pd.read_csv(table, sep="\t", index_col=["samplename", "fasta"]).sort_index()
    if level == "contig":
        contigs = list(
            df.loc[(wildcards.sample, wildcards.fasta)]["contig"].drop_duplicates()
        )
        if "rotate" in df.columns:
            rotates = int(
                df.loc[(wildcards.sample, wildcards.fasta)]["rotate"]
                .drop_duplicates()
                .values
            )

        if PAIRED and ext != "bam":
            if "rotate" in df.columns:
                return expand(
                    os.path.join(
                        outdir, "{sample}", "{fasta}", "{contig}_{n}{p}.{ext}"
                    ),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    n=list(range(1, rotates + 1)),
                    p=wildcards.p,
                    contig=contigs,
                    ext=ext,
                )
            else:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}{p}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    p=wildcards.p,
                    contig=contigs,
                    ext=ext,
                )

        else:
            if "rotate" in df.columns:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}_{n}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    contig=contigs,
                    n=list(range(1, rotates + 1)),
                    ext=ext,
                )
            else:
                return expand(
                    os.path.join(outdir, "{sample}", "{fasta}", "{contig}.{ext}"),
                    sample=wildcards.sample,
                    fasta=wildcards.fasta,
                    contig=contigs,
                    ext=ext,
                )
    if level == "fasta":
        fastas = list(set(df.loc[wildcards.sample].index))
def aggregate(wildcards, outdir, ext):
    df = get_cov_table(wildcards, "aggregate", ["samplename"])
    files = []
    for row in df.loc[[wildcards.sample]].itertuples():
        prefix = f"{row.contig}"
        if CIRCULAR:
            prefix += f"_{row.n}"
        if PAIRED and ext != "bam":
            return expand(
                os.path.join(outdir, "{sample}", "{fasta}{p}.{ext}"),
                sample=wildcards.sample,
                fasta=fastas,
                p=wildcards.p,
                ext=ext,
            )
        else:
            return expand(
                os.path.join(outdir, "{sample}", "{fasta}.{ext}"),
                sample=wildcards.sample,
                fasta=fastas,
                ext=ext,
            )
            prefix += f"{wildcards.p}"
        files.append(
            os.path.join(outdir, wildcards.sample, row.fasta, f"{prefix}.{ext}")
        )
    return files


def get_header(fa):
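Why fasta_input wraps drop_duplicates() in try/except AttributeError: with a non-unique index, df.loc[key]["path"] returns a Series, but with a single matching row it returns a bare string, which has no drop_duplicates method. A toy illustration with invented paths:

import pandas as pd

df = pd.DataFrame(
    {"fasta": ["a", "a", "b"], "path": ["/data/a.fa", "/data/a.fa", "/data/b.fa"]}
).set_index("fasta")

# Two rows share index "a": .loc gives a Series, so drop_duplicates works.
print(df.loc["a"]["path"].drop_duplicates().tolist())  # ['/data/a.fa']
# Index "b" matches one row: .loc["b"]["path"] is a plain str, and calling
# .drop_duplicates() on it raises AttributeError, hence the fallback branch.
print(df.loc["b"]["path"])  # prints /data/b.fa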
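The rewritten aggregate collapses four expand branches into one row-wise loop over the coverage table. A condensed, runnable model of the new logic; the table contents and the sample/pair values below are mock data, not the workflow's actual columns beyond those shown in the diff:

import os

import pandas as pd

CIRCULAR, PAIRED = True, True

cov = pd.DataFrame(
    {
        "samplename": ["s1", "s1"],
        "fasta": ["fa1", "fa1"],
        "contig": ["c1", "c2"],
        "n": [1, 1],
    }
).set_index("samplename")


def aggregate(sample, p, outdir, ext):
    files = []
    # One expected file per table row: rotation suffix _{n} when CIRCULAR,
    # read-pair suffix when PAIRED (skipped for bam outputs).
    for row in cov.loc[[sample]].itertuples():
        prefix = f"{row.contig}"
        if CIRCULAR:
            prefix += f"_{row.n}"
        if PAIRED and ext != "bam":
            prefix += f"{p}"
        files.append(os.path.join(outdir, sample, row.fasta, f"{prefix}.{ext}"))
    return files


print(aggregate("s1", "1", "short", "fq.gz"))
# ['short/s1/fa1/c1_11.fq.gz', 'short/s1/fa1/c2_11.fq.gz']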
2 changes: 0 additions & 2 deletions mess/workflow/rules/preflight/targets_download.smk
@@ -2,10 +2,8 @@
All target download files are declared here
"""


TargetDownloads = [
    os.path.join(dir.out.base, "uniq_entries.tsv"),
    os.path.join(dir.out.base, "assembly_finder/assembly_summary.tsv"),
    os.path.join(dir.out.base, "assembly_finder/sequence_report.tsv"),
    os.path.join(dir.out.base, "assembly_finder/taxonomy.tsv"),
]