nf-core · heringerp · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -174,3 +174,25 @@ jobs:
       - name: Run pipeline with test data and wfmash_only parameters
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
+
+  metric_parameter_suffixes:
+    name: Run pipeline with metric parameter suffixes
+    # Run on every non-push event; on push, run only in the nf-core/pangenome repository (merged PRs)
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/pangenome') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        parameters:
+          - "--wfmash_segment_length 8k --wfmash_block_length 38K --seqwish_transclose_batch 9M"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v3
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v1
+        with:
+          version: "23.04.0"
+
+      - name: Run pipeline with test data and metric parameter suffixes
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
diff --git a/conf/modules.config b/conf/modules.config
@@ -47,6 +47,42 @@ def generate_smoothxg_poa_params_cmd() {
 
 def smoothxg_poa_params_cmd = generate_smoothxg_poa_params_cmd()
 
+// Numeric values carry no suffix and are returned as-is. Declared as long rather than int so that
+// plain numbers beyond the 32-bit range (Long values) are accepted as well as Integer values.
+def parse_int(long plain_integer) { return plain_integer }
+
+/**
+ * Expand an integer string with an optional metric suffix into a plain number.
+ *
+ * Accepts digits optionally followed by one of k/m/g/t (either case), standing for
+ * 10^3/10^6/10^9/10^12: "5000" -> 5000, "8k" -> 8000, "3G" -> 3000000000.
+ * The result is computed as a long so that values such as "3G" do not wrap around
+ * in 32-bit integer arithmetic.
+ *
+ * @param integer_with_suffix digits, optionally followed by a single suffix letter
+ * @return the expanded value as a long
+ * @throws IllegalArgumentException if the value is empty, not numeric, or has an unknown suffix
+ * @throws ArithmeticException if the expanded value does not fit into a long
+ */
+def parse_int(String integer_with_suffix) {
+    def multipliers = [k: 1000L, m: 1000000L, g: 1000000000L, t: 1000000000000L]
+    def text = integer_with_suffix ?: ""
+
+    // Plain integers need no expansion; isLong() also covers values beyond the int range.
+    if (text.isLong()) {
+        return text.toLong()
+    }
+
+    def digits = text ? text.substring(0, text.length() - 1) : ""
+    def multiplier = text ? multipliers[text.substring(text.length() - 1).toLowerCase()] : null
+    // Fail loudly: a silently returned null would be rendered as the text "null" in the tool arguments.
+    if (multiplier == null || !digits.isLong()) {
+        throw new IllegalArgumentException("Cannot parse '${integer_with_suffix}' as an integer with an optional k/m/g/t suffix")
+    }
+    // multiplyExact raises ArithmeticException instead of wrapping around on overflow.
+    return Math.multiplyExact(digits.toLong(), multiplier)
+}
+
 process {
 
     publishDir = [
@@ -67,12 +103,12 @@ process {
         ext.args = {
             [
                 "-n ${params.n_haplotypes - 1}",
-                "-s ${params.wfmash_segment_length}",
+                "-s ${parse_int(params.wfmash_segment_length)}",
                 "-p ${params.wfmash_map_pct_id}",
                 params.wfmash_merge_segments   ? "-M"                                : "",
                 params.wfmash_exclude_delim    ? "-Y ${params.wfmash_exclude_delim}" : "-X",
                 params.wfmash_no_splits        ? "-N"                                : "",
-                params.wfmash_block_length     ? "-l ${params.wfmash_block_length}"  : "-l ${params.wfmash_segment_length * 5}",
+                params.wfmash_block_length     ? "-l ${parse_int(params.wfmash_block_length)}"  : "-l ${parse_int(params.wfmash_segment_length) * 5}",
                 "-k ${params.wfmash_mash_kmer}",
                 "-H ${params.wfmash_mash_kmer_thres}",
                 "${wfmash_sparse_map_cmd}",
@@ -90,12 +126,12 @@ process {
         ext.args = {
             [
                 "-n ${params.n_haplotypes - 1}",
-                "-s ${params.wfmash_segment_length}",
+                "-s ${parse_int(params.wfmash_segment_length)}",
                 "-p ${params.wfmash_map_pct_id}",
                 params.wfmash_merge_segments   ? "-M"                                : "",
                 params.wfmash_exclude_delim    ? "-Y ${params.wfmash_exclude_delim}" : "-X",
                 params.wfmash_no_splits        ? "-N"                                : "",
-                params.wfmash_block_length     ? "-l ${params.wfmash_block_length}"  : "-l ${params.wfmash_segment_length * 5}",
+                params.wfmash_block_length     ? "-l ${parse_int(params.wfmash_block_length)}"  : "-l ${parse_int(params.wfmash_segment_length) * 5}",
                 "-k ${params.wfmash_mash_kmer}",
                 "-H ${params.wfmash_mash_kmer_thres}",
                 "${wfmash_sparse_map_cmd}",
@@ -114,12 +150,12 @@ process {
         ext.args = {
             [
                 "-n ${params.n_haplotypes - 1}",
-                "-s ${params.wfmash_segment_length}",
+                "-s ${parse_int(params.wfmash_segment_length)}",
                 "-p ${params.wfmash_map_pct_id}",
                 params.wfmash_merge_segments   ? "-M"                                : "",
                 params.wfmash_exclude_delim    ? "-Y ${params.wfmash_exclude_delim}" : "-X",
                 params.wfmash_no_splits        ? "-N"                                : "",
-                params.wfmash_block_length     ? "-l ${params.wfmash_block_length}"  : "-l ${params.wfmash_segment_length * 5}",
+                params.wfmash_block_length     ? "-l ${parse_int(params.wfmash_block_length)}"  : "-l ${parse_int(params.wfmash_segment_length) * 5}",
                 "-k ${params.wfmash_mash_kmer}",
                 "-H ${params.wfmash_mash_kmer_thres}",
                 "${wfmash_sparse_map_cmd}",
@@ -147,12 +183,12 @@ process {
         ext.args = {
             [
                 "-n ${params.n_haplotypes - 1}",
-                "-s ${params.wfmash_segment_length}",
+                "-s ${parse_int(params.wfmash_segment_length)}",
                 "-p ${params.wfmash_map_pct_id}",
                 params.wfmash_merge_segments   ? "-M"                                : "",
                 params.wfmash_exclude_delim    ? "-Y ${params.wfmash_exclude_delim}" : "-X",
                 params.wfmash_no_splits        ? "-N"                                : "",
-                params.wfmash_block_length     ? "-l ${params.wfmash_block_length}"  : "-l ${params.wfmash_segment_length * 5}",
+                params.wfmash_block_length     ? "-l ${parse_int(params.wfmash_block_length)}"  : "-l ${parse_int(params.wfmash_segment_length) * 5}",
                 "-k ${params.wfmash_mash_kmer}",
                 "-H ${params.wfmash_mash_kmer_thres}",
                 "${wfmash_sparse_map_cmd}",
@@ -173,7 +209,7 @@ process {
             [
                 "-k ${params.seqwish_min_match_length}",
                 "-f ${params.seqwish_sparse_factor}",
-                "-B ${params.seqwish_transclose_batch}",
+                "-B ${parse_int(params.seqwish_transclose_batch)}",
                 "-P",
                 params.seqwish_temp_dir        ? "--temp-dir ${params.seqwish_temp_dir}" : ""
             ].join(" ").trim()

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -54,21 +54,22 @@
             "properties": {
                 "wfmash_map_pct_id": {
                     "type": "number",
-                    "default": 90.0,
+                    "default": 90,
                     "description": "Percent identity in the wfmash mashmap step.",
                     "help_text": "Use `mash dist` or `mash triangle` to explore the typical level of divergence between the sequences in your input (see https://pggb.readthedocs.io/en/latest/rst/tutorials/divergence_estimation.html#divergence-estimation for more information). Convert this to an approximate percent identity and provide it as --wfmash_map_pct_id <PCT>. A list of examples can be found at https://github.com/pangenome/pggb#example-builds-for-diverse-species."
                 },
                 "wfmash_segment_length": {
-                    "type": "integer",
-                    "default": 5000,
+                    "type": "string",
+                    "default": "5000",
                     "description": "Segment length for mapping.",
-                    "help_text": "Crucially, --wfmash_segment_length provides a kind of minimum alignment length filter. The `mashmap3` step in `wfmash` will only consider segments of this size. For small pangenome graphs, or where there are few repeats, --wfmash_segment_length can be set low (for example 500 when building a MHC pangenome graph). However, for larger contexts, with repeats, it can be very important to set this high (for instance 50k in the case of human genomes). A long segment length ensures that we represent long collinear regions of the input sequences in the structure of the graph. In general, this should at least be larger than transposons and other common repeats in your pangenome. A list of examples can be found at https://github.com/pangenome/pggb#example-builds-for-diverse-species."
+                    "help_text": "Crucially, --wfmash_segment_length provides a kind of minimum alignment length filter. The `mashmap3` step in `wfmash` will only consider segments of this size. For small pangenome graphs, or where there are few repeats, --wfmash_segment_length can be set low (for example 500 when building a MHC pangenome graph). However, for larger contexts, with repeats, it can be very important to set this high (for instance 50k in the case of human genomes). A long segment length ensures that we represent long collinear regions of the input sequences in the structure of the graph. In general, this should at least be larger than transposons and other common repeats in your pangenome. A list of examples can be found at https://github.com/pangenome/pggb#example-builds-for-diverse-species.",
+                    "pattern": "^([1-9]\\d*[kKmMgGtT]?|0)$"
                 },
                 "wfmash_block_length": {
                     "type": "string",
-                    "default": "wfmash_segment_length * 5",
                     "description": "Minimum block length filter for mapping.",
-                    "help_text": "By default, wfmash only keeps mappings with at least 5 times the size of a segment. This can be adjusted with --wfmash_block_length <BLOCK_LENGTH>."
+                    "help_text": "By default, wfmash only keeps mappings with at least 5 times the size of a segment. This can be adjusted with --wfmash_block_length <BLOCK_LENGTH>.",
+                    "pattern": "^([1-9]\\d*[kKmMgGtT]?|0)$"
                 },
                 "wfmash_mash_kmer": {
                     "type": "integer",
@@ -81,7 +82,7 @@
                     "description": "Ignore the top % most-frequent kmers."
                 },
                 "wfmash_sparse_map": {
-                    "default": 1.0,
+                    "default": "1.0",
                     "description": "Keep this fraction of mappings (\"auto\" for giant component heuristic).",
                     "type": "string",
                     "pattern": "(auto|[01]\\.\\d+)"
@@ -130,10 +131,11 @@
                     "help_text": "Graph induction with seqwish often works better when we filter very short matches out of the input alignments. In practice, these often occur in regions of low alignment quality, which are typical of areas with large INDELs and structural variations in the wfmash alignments. This underalignment is then resolved in the smoothxg step. Removing short matches can simplify the graph and remove spurious relationships caused by short repeated homologies.\nA setting of --seqwish_min_match_length 47 is optimal for around 5% divergence, and we suggest lowering it for higher divergence and increasing it for lower divergence. Values up to --seqwish_min_match_length 311 work well for human haplotypes. In effect, setting --seqwish_min_match_length to N means that we can tolerate a local pairwise difference rate of no more than 1/N. Thus, INDELs which may be represented by complex series of edit operations will be opened into bubbles in the induced graph, and alignment regions with very low identity will be ignored. Using affine-gapped alignment (such as with minimap2) may reduce the impact of this step by representing large indels more precisely in the input alignments. However, it remains important due to local inconsistency in alignments in low-complexity sequence."
                 },
                 "seqwish_transclose_batch": {
-                    "type": "integer",
-                    "default": 10000000,
+                    "type": "string",
+                    "default": "10000000",
                     "description": "Number of base pairs to use for transitive closure batch.",
-                    "help_text": "If you run out of memory during the seqwish step, you can lower this value. It will take longer, but it will use less memory."
+                    "help_text": "If you run out of memory during the seqwish step, you can lower this value. It will take longer, but it will use less memory.",
+                    "pattern": "^([1-9]\\d*[kKmMgGtT]?|0)$"
                 },
                 "seqwish_sparse_factor": {
                     "type": "number",