nf-core · rodtheo · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/modules/nf-core/merfin/hist/environment.yml b/modules/nf-core/merfin/hist/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: merfin_hist
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::merfin=1.0 # merfin 1.0, the same release as the container tags in main.nf
diff --git a/modules/nf-core/merfin/hist/main.nf b/modules/nf-core/merfin/hist/main.nf
@@ -0,0 +1,62 @@
+process MERFIN_HIST {
+    tag "$meta.id"
+    label 'process_medium'
+    // Runs `merfin -hist`, writing <prefix>.hist and a copy of the tool's stderr in <prefix>.hist.stderr.log.
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/merfin:1.0--h4ac6f70_2':
+        'biocontainers/merfin:1.0--h4ac6f70_2' }"
+
+    input:
+    tuple val(meta), path(fasta_assembly)   // Required Input -sequence files can be FASTA or FASTQ; uncompressed, gz compressed.
+    tuple val(meta1), path(meryl_db_reads)  // Required readmers (raw reads meryl db). As it comes from another tool, it might be relevant to maintain the meta.
+    path(lookup_table)                      // Optional input vector of probabilities (obtained by genomescope2 with parameter --fitted_hist).
+    path(seqmers)                           // Optional input for pre-built sequence meryl db (-seqmers).
+    val(peak)                               // Required input to hard set copy 1 and infer multiplicity to copy number.
+
+    output:
+    tuple val(meta), path("*.hist")     , emit: hist
+    path("*.hist.stderr.log")           , emit: log_stderr
+    path "versions.yml"                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args                    = task.ext.args ?: ''
+    def prefix                  = task.ext.prefix ?: "${meta.id}"
+    def optional_lookup_table   = lookup_table ? "-prob ${lookup_table}" : ""
+    def optional_seqmers        = seqmers ? "-seqmers ${seqmers}" : ""
+    """
+    merfin -hist \\
+        -threads $task.cpus \\
+        $args \\
+        -sequence $fasta_assembly \\
+        -readmers $meryl_db_reads \\
+        -peak $peak \\
+        $optional_lookup_table \\
+        $optional_seqmers \\
+        -output ${prefix}.hist \\
+        2> >( tee ${prefix}.hist.stderr.log >&2 )
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        merfin: \$( merfin --version |& sed 's/merfin //' )
+    END_VERSIONS
+    """
+
+    stub:
+    def args                    = task.ext.args ?: ''
+    def prefix                  = task.ext.prefix ?: "${meta.id}"
+    // Placeholder outputs only: the names must match the globs declared under `output:`
+    // (*.hist and *.hist.stderr.log), so no merfin option strings are built here.
+    """
+    touch ${prefix}.hist
+    touch ${prefix}.hist.stderr.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        merfin: \$( merfin --version |& sed 's/merfin //' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/merfin/hist/meta.yml b/modules/nf-core/merfin/hist/meta.yml
@@ -0,0 +1,80 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "merfin_hist"
+description: Compare k-mer frequency in reads and assembly to devise the metrics K* and QV*
+keywords:
+  - assembly
+  - evaluation
+  - quality
+  - completeness
+tools:
+  - "merfin":
+      description: "Merfin (k-mer based finishing tool) is a suite of subtools for variant filtering, assembly evaluation and polishing via k-mer validation. The subtool -hist estimates the QV (quality value of [Merqury](https://github.com/marbl/merqury)) for each scaffold/contig and genome-wide averages. In addition, Merfin produces a QV* estimate, which also accounts for k-mers that are seen in excess with respect to their expected multiplicity predicted from the reads."
+      homepage: "https://github.com/arangrhie/merfin"
+      documentation: "https://github.com/arangrhie/merfin/wiki/Best-practices-for-Merfin"
+      doi: "10.1038/s41592-022-01445-y"
+      licence: ["Apache-2.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - fasta_assembly:
+      type: file
+      description: Genome assembly in FASTA; uncompressed, gz compressed [REQUIRED]
+      pattern: "*.{fasta,fasta.gz}"
+
+  - meta1:
+      type: map
+      description: |
+        Groovy Map containing sample read information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - meryl_db_reads:
+      type: file
+      description: K-mer database produced from raw reads using Meryl [REQUIRED]
+      pattern: "*.{meryl_db}"
+
+  - lookup_table:
+      type: file
+      description: Input vector of k-mer probabilities (obtained by genomescope2 with parameter --fitted_hist) [OPTIONAL]
+      pattern: "lookup_table.txt"
+
+  - seqmers:
+      type: file
+      description: Input for pre-built sequence meryl db. By default, the sequence meryl db will be generated from the input genome assembly [OPTIONAL]
+      pattern: "*.{meryl_db}"
+
+  - peak:
+      type: float
+      description: Input to hard set copy 1 and infer multiplicity to copy number. Can be calculated using genomescope2 [REQUIRED]
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - hist:
+      type: file
+      description: The generated 0-centered k* histogram for sequences in <fasta_assembly.fasta>. Positive k* values are expected collapsed copies. Negative k* values are expected expanded copies. Closer to 0 means the expected and found k-mers are well balanced, 1:1.
+      pattern: "*.{hist}"
+
+  - log_stderr:
+      type: file
+      description: Log (stderr) of hist tool execution. The QV and QV* metrics are reported at the end.
+      pattern: "*.{hist.stderr.log}"
+
+authors:
+  - "@rodtheo"
+maintainers:
+  - "@rodtheo"